In [13]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# For evaluation
from sklearn.metrics import mean_squared_error

# Import keras components
from tensorflow import keras
from keras import layers, callbacks

In [14]:
all_train = pd.read_csv('kbtu-data-science-challenge-2025-entry-task-new/train.csv')

all_train.head()

Unnamed: 0,student_id,gender,age,study_hours,attendance_rate,previous_scores,parental_education,school_type,extracurricular,final_math_score
0,1001,Male,17,9.1,68.7,70.0,Bachelor’s,Private,3,86.2
1,1002,Female,17,10.5,66.6,84.8,Bachelor’s,Public,3,90.6
2,1003,Male,17,17.4,58.8,73.8,High School,Private,3,94.1
3,1004,Male,17,8.1,80.4,45.0,High School,Public,1,82.9
4,1005,Male,18,17.7,73.8,51.1,Master’s,Public,2,98.6


In [15]:
# Preprocessor setup

target = 'final_math_score'
features = ['gender', 'age', 'study_hours', 'attendance_rate', 
            'previous_scores', 'parental_education', 'school_type', 'extracurricular']

num_cols = ['age', 'study_hours', 'attendance_rate', 'previous_scores', 'extracurricular']
cat_cols = ['gender', 'parental_education', 'school_type']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

In [16]:
train = all_train[all_train[target] < 100]

x = preprocessor.fit_transform(train[features])
y = train[target]

x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42)

print("Processed training shape:", x_train.shape)

Processed training shape: (2343, 9)


In [21]:
def build_regression_model(input_dim):
    model = keras.Sequential([
        # keras.Input(shape=(input_dim,)),
        layers.Dense(16, activation='relu', input_dim=input_dim),
        # layers.Dropout(0.2),
        layers.Dense(8, activation='relu'),
        layers.Dense(4, activation='relu'),
        layers.Dense(1, activation='linear')
    ])
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
    return model

input_dim_reg = x_train.shape[1]
regressor = build_regression_model(input_dim_reg)
regressor.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [33]:
def train_regressor(model):
    early_stop_reg = callbacks.EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)

    history_reg = model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        epochs=300,
        batch_size=32,
        callbacks=[early_stop_reg],
        verbose=1
    )
    return (model, history_reg)

train_regressor(regressor)


Epoch 1/300
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 21.6391 - mse: 21.6391 - val_loss: 21.1605 - val_mse: 21.1605
Epoch 2/300
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 21.8418 - mse: 21.8418 - val_loss: 20.9685 - val_mse: 20.9685
Epoch 3/300
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 21.0928 - mse: 21.0928 - val_loss: 21.1908 - val_mse: 21.1908
Epoch 4/300
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 21.6292 - mse: 21.6292 - val_loss: 20.9960 - val_mse: 20.9960
Epoch 5/300
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 20.8658 - mse: 20.8658 - val_loss: 21.3302 - val_mse: 21.3302
Epoch 6/300
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 21.9442 - mse: 21.9442 - val_loss: 21.2081 - val_mse: 21.2081
Epoch 7/300
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

(<Sequential name=sequential_3, built=True>,
 <keras.src.callbacks.history.History at 0x7f015cda0070>)

In [35]:
val_mse_reg = regressor.evaluate(x_val, y_val, verbose=0)[0]
print("Validation MSE (Regression):", val_mse_reg)

Validation MSE (Regression): 20.767602920532227


In [37]:
# regressor.save('models/regressor.keras')

model = keras.models.load_model('models/regressor.keras')

val_mse_reg = model.evaluate(x_val, y_val, verbose=0)[0]
print("Validation MSE (Regression):", val_mse_reg)

Validation MSE (Regression): 20.36351203918457
