In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv(r"C:\Users\Admin\Desktop\python\Exam_score_predictor_using_ML\Data\cleaned_student_data.csv")
df.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


In [4]:
print(f"size: {df.size}")
print(df.isnull().sum())

size: 14544
student_id                       0
age                              0
gender                           0
study_hours_per_day              0
social_media_hours               0
netflix_hours                    0
part_time_job                    0
attendance_percentage            0
sleep_hours                      0
diet_quality                     0
exercise_frequency               0
parental_education_level         0
internet_quality                 0
mental_health_rating             0
extracurricular_participation    0
exam_score                       0
dtype: int64


In [38]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [6]:
df.columns

Index(['student_id', 'age', 'gender', 'study_hours_per_day',
       'social_media_hours', 'netflix_hours', 'part_time_job',
       'attendance_percentage', 'sleep_hours', 'diet_quality',
       'exercise_frequency', 'parental_education_level', 'internet_quality',
       'mental_health_rating', 'extracurricular_participation', 'exam_score'],
      dtype='object')

In [7]:
df.head(2)

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0


In [8]:
features = ['study_hours_per_day','attendance_percentage', 'sleep_hours','mental_health_rating','part_time_job']

In [9]:
target = "exam_score"

In [10]:
df_model = df[features + [target]].copy()

In [11]:
df_model

Unnamed: 0,study_hours_per_day,attendance_percentage,sleep_hours,mental_health_rating,part_time_job,exam_score
0,0.0,85.0,8.0,8,No,56.2
1,6.9,97.3,4.6,8,No,100.0
2,1.4,94.8,8.0,1,No,34.3
3,1.0,71.0,9.2,1,No,26.8
4,5.0,90.9,4.9,1,No,66.4
...,...,...,...,...,...,...
904,2.6,77.0,7.5,6,No,76.1
905,2.9,86.0,6.8,6,Yes,65.9
906,3.0,61.9,6.5,9,No,64.4
907,5.4,100.0,7.6,1,Yes,69.7


In [12]:
le = LabelEncoder()

In [13]:
df_model['part_time_job'] =  le.fit_transform(df_model['part_time_job'])

In [14]:
df_model

Unnamed: 0,study_hours_per_day,attendance_percentage,sleep_hours,mental_health_rating,part_time_job,exam_score
0,0.0,85.0,8.0,8,0,56.2
1,6.9,97.3,4.6,8,0,100.0
2,1.4,94.8,8.0,1,0,34.3
3,1.0,71.0,9.2,1,0,26.8
4,5.0,90.9,4.9,1,0,66.4
...,...,...,...,...,...,...
904,2.6,77.0,7.5,6,0,76.1
905,2.9,86.0,6.8,6,1,65.9
906,3.0,61.9,6.5,9,0,64.4
907,5.4,100.0,7.6,1,1,69.7


In [15]:
X = df_model[features]
y = df_model[target]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
len(y_test)

182

In [18]:
len(y_train)

727

In [40]:
models = {
    "LinearRegression":{
        "model" : LinearRegression(),
        "params" : {}
    },
    "DecisionTree":{
        "model" : DecisionTreeRegressor(),
        "params" : {"max_depth" : [3,5,10], "min_samples_split" : [2,5]}
    },
    "RandomForest" : {
        "model" : RandomForestRegressor(),
        "params" : {"n_estimators" : [50,100], "max_depth": [5,10]}
    }
}

In [42]:
best_models = []

In [44]:
for name, config in models.items():
    print(f"Training {name}")

    grid = GridSearchCV(config["model"],config["params"],cv = 5, scoring = "neg_mean_squared_error")
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    best_models.append({
        "model" : name,
        "best_params" : grid.best_params_,
        "rmse" : rmse,
        "r2" : r2
    })

Training LinearRegression
Training DecisionTree
Training RandomForest


In [46]:
best_models

[{'model': 'LinearRegression',
  'best_params': {},
  'rmse': 7.376812995425731,
  'r2': 0.7968894383350734},
 {'model': 'DecisionTree',
  'best_params': {'max_depth': 5, 'min_samples_split': 5},
  'rmse': 9.189274088907542,
  'r2': 0.6848209299232596},
 {'model': 'RandomForest',
  'best_params': {'max_depth': 5, 'n_estimators': 100},
  'rmse': 7.999015848171348,
  'r2': 0.761181441064197}]

In [50]:
results_df = pd.DataFrame(best_models)

In [52]:
results_df.sort_values(by="rmse")

Unnamed: 0,model,best_params,rmse,r2
0,LinearRegression,{},7.376813,0.796889
2,RandomForest,"{'max_depth': 5, 'n_estimators': 100}",7.999016,0.761181
1,DecisionTree,"{'max_depth': 5, 'min_samples_split': 5}",9.189274,0.684821


In [54]:
import joblib

best_row = results_df.sort_values(by="rmse").iloc[0]


In [56]:
best_row

model          LinearRegression
best_params                  {}
rmse                   7.376813
r2                     0.796889
Name: 0, dtype: object

In [68]:
best_model_name = best_row["model"]

In [70]:
best_model_name

'LinearRegression'

In [72]:
best_model_config = models[best_model_name]

In [74]:
best_model_config

{'model': LinearRegression(), 'params': {}}

In [76]:
final_model = best_model_config["model"]

In [78]:
final_model.fit(X,y)

In [82]:
joblib.dump(final_model, "best_model_pkl")

['best_model_pkl']

In [84]:
joblib.load("best_model_pkl").predict(X_test)

array([ 69.65327425, 101.49785175,  57.67033365,  61.28183191,
        63.56874464,  54.1809859 , 102.62470515,  41.0826638 ,
        57.71722366,  77.22484193, 115.32425793,  67.24642978,
        57.96780871,  37.5268067 ,  52.76981731,  80.08715057,
        44.07954472,  85.58995874,  66.75598134,  80.04701348,
        60.70117069,  69.88236463,  67.3409568 ,  52.04571946,
        77.0077602 ,  70.5947591 ,  86.38678834,  67.33884728,
        56.84795774,  69.95368226,  72.4395309 ,  71.62634586,
        42.12695617,  53.13689169,  62.0019684 ,  73.3095103 ,
        69.29900088,  77.48107335,  94.26716699,  46.42355805,
        81.85562938,  40.60113112,  70.1122084 ,  85.07548216,
        94.48143692,  74.26344109,  66.70036983,  48.92767151,
        54.97205488,  47.21912588,  69.51599848,  71.14315062,
        91.52584331,  88.32100125,  84.45415703,  63.64137371,
        83.02517528,  63.08783432,  98.07031515,  68.31359587,
        61.0234794 ,  81.1148883 ,  77.45864808,  62.14