In [52]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# For evaluation
from sklearn.metrics import mean_squared_error

# Import keras components
from tensorflow import keras
from keras import layers, callbacks
from joblib import dump, load

In [3]:
all_train = pd.read_csv('kbtu-data-science-challenge-2025-entry-task-new/train.csv')
real_test = pd.read_csv('kbtu-data-science-challenge-2025-entry-task-new/test.csv')

all_train.head()

Unnamed: 0,student_id,gender,age,study_hours,attendance_rate,previous_scores,parental_education,school_type,extracurricular,final_math_score
0,1001,Male,17,9.1,68.7,70.0,Bachelor’s,Private,3,86.2
1,1002,Female,17,10.5,66.6,84.8,Bachelor’s,Public,3,90.6
2,1003,Male,17,17.4,58.8,73.8,High School,Private,3,94.1
3,1004,Male,17,8.1,80.4,45.0,High School,Public,1,82.9
4,1005,Male,18,17.7,73.8,51.1,Master’s,Public,2,98.6


In [4]:
# Preprocessor setup

target = 'final_math_score'
features = ['gender', 'age', 'study_hours', 'attendance_rate', 
            'previous_scores', 'parental_education', 'school_type', 'extracurricular']

num_cols = ['age', 'study_hours', 'attendance_rate', 'previous_scores', 'extracurricular']
cat_cols = ['gender', 'parental_education', 'school_type']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

In [5]:
x = preprocessor.fit_transform(all_train[features])
y = all_train[target]

x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42)

print("Processed training shape:", x_train.shape)

Processed training shape: (3200, 9)


In [69]:
classifier = keras.models.load_model('models/classifier.keras')
regressor = keras.models.load_model('models/regressor.keras')

perfect_preds = classifier.predict(x).flatten()
# print(perfect_preds)

regression_preds = regressor.predict(x).flatten()
# print(regression_preds)

threshold = 0.5
final_predictions = np.where(perfect_preds >= threshold, 100, regression_preds)
final_predictions = np.clip(final_predictions, 0, 100)
print(final_predictions.view())
print(y)

mse = mean_squared_error(y, final_predictions)
print("Validation MSE (Regression):", mse)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[ 79.159195  87.92324   91.283356 ...  90.54684   67.06526  100.      ]
0        86.2
1        90.6
2        94.1
3        82.9
4        98.6
        ...  
3995     78.8
3996     80.5
3997     93.0
3998     70.9
3999    100.0
Name: final_math_score, Length: 4000, dtype: float64
Validation MSE (Regression): 40.107384492522485
