# Training Student Examination Score Prediction Model

## Data Exploration and Pre-Processing

In [127]:
# importing pandas for data exploration
import pandas as pd

In [128]:
# loading the dataset into a pandas dataframe
data = pd.read_csv('student_habits_performance.csv')
df = pd.DataFrame(data)

In [129]:
# observing the top 5 data points
df.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


In [130]:
# statistics of the dataset
df.describe()

Unnamed: 0,age,study_hours_per_day,social_media_hours,netflix_hours,attendance_percentage,sleep_hours,exercise_frequency,mental_health_rating,exam_score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.498,3.5501,2.5055,1.8197,84.1317,6.4701,3.042,5.438,69.6015
std,2.3081,1.46889,1.172422,1.075118,9.399246,1.226377,2.025423,2.847501,16.888564
min,17.0,0.0,0.0,0.0,56.0,3.2,0.0,1.0,18.4
25%,18.75,2.6,1.7,1.0,78.0,5.6,1.0,3.0,58.475
50%,20.0,3.5,2.5,1.8,84.4,6.5,3.0,5.0,70.5
75%,23.0,4.5,3.3,2.525,91.025,7.3,5.0,8.0,81.325
max,24.0,8.3,7.2,5.4,100.0,10.0,6.0,10.0,100.0


In [131]:
# observing the data types of each column
df.dtypes

student_id                        object
age                                int64
gender                            object
study_hours_per_day              float64
social_media_hours               float64
netflix_hours                    float64
part_time_job                     object
attendance_percentage            float64
sleep_hours                      float64
diet_quality                      object
exercise_frequency                 int64
parental_education_level          object
internet_quality                  object
mental_health_rating               int64
extracurricular_participation     object
exam_score                       float64
dtype: object

In [132]:
# observing the columns in the df
df.columns

Index(['student_id', 'age', 'gender', 'study_hours_per_day',
       'social_media_hours', 'netflix_hours', 'part_time_job',
       'attendance_percentage', 'sleep_hours', 'diet_quality',
       'exercise_frequency', 'parental_education_level', 'internet_quality',
       'mental_health_rating', 'extracurricular_participation', 'exam_score'],
      dtype='object')

In [133]:
# observing top 3 data points
df.head(3)

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3


In [134]:
# observing the total null values in the df
df.isnull().sum()

student_id                        0
age                               0
gender                            0
study_hours_per_day               0
social_media_hours                0
netflix_hours                     0
part_time_job                     0
attendance_percentage             0
sleep_hours                       0
diet_quality                      0
exercise_frequency                0
parental_education_level         91
internet_quality                  0
mental_health_rating              0
extracurricular_participation     0
exam_score                        0
dtype: int64

In [135]:
# transforming gender object type data into numerical values
df['gender'] = pd.to_numeric(df['gender'].replace({'Female' : 0, 'Male' : 1, 'Other' : 0.5}, regex=True), errors='coerce')
df['gender'].dtype

  df['gender'] = pd.to_numeric(df['gender'].replace({'Female' : 0, 'Male' : 1, 'Other' : 0.5}, regex=True), errors='coerce')


dtype('float64')

In [136]:
# transforming part_time_job object type data into numerical values
df['part_time_job'] = pd.to_numeric(df['part_time_job'].replace({'No' : 0, 'Yes' : 1}, regex=True), errors='coerce')
df['part_time_job'].dtype

  df['part_time_job'] = pd.to_numeric(df['part_time_job'].replace({'No' : 0, 'Yes' : 1}, regex=True), errors='coerce')


dtype('int64')

In [137]:
# transforming part_time_job object type data into numerical values
df['diet_quality'] = pd.to_numeric(df['diet_quality'].replace({'Poor' : 0, 'Fair' : 1, 'Good' : 2}, regex=True), errors='coerce')
df['diet_quality'].dtype

  df['diet_quality'] = pd.to_numeric(df['diet_quality'].replace({'Poor' : 0, 'Fair' : 1, 'Good' : 2}, regex=True), errors='coerce')


dtype('int64')

In [138]:
# transforming parental_education_level object type data into numerical values
df['parental_education_level'] = pd.to_numeric(df['parental_education_level'].replace({None : 0, 'High School' : 1, 'Bachelor' : 2, 'Master' : 3}, regex=True), errors='coerce')
df['parental_education_level'].dtype

  df['parental_education_level'] = pd.to_numeric(df['parental_education_level'].replace({None : 0, 'High School' : 1, 'Bachelor' : 2, 'Master' : 3}, regex=True), errors='coerce')


dtype('int64')

In [139]:
# transforming internet_quality object type data into numerical values
df['internet_quality'] = pd.to_numeric(df['internet_quality'].replace({'Poor' : 0, 'Average' : 1, 'Good' : 2}, regex=True), errors='coerce')
df['internet_quality'].dtype

  df['internet_quality'] = pd.to_numeric(df['internet_quality'].replace({'Poor' : 0, 'Average' : 1, 'Good' : 2}, regex=True), errors='coerce')


dtype('int64')

In [140]:
# transforming extracurricular_participation object type data into numerical values
df['extracurricular_participation'] = pd.to_numeric(df['extracurricular_participation'].replace({'No' : 0, 'Yes' : 1}, regex=True), errors='coerce')
df['extracurricular_participation'].dtype

  df['extracurricular_participation'] = pd.to_numeric(df['extracurricular_participation'].replace({'No' : 0, 'Yes' : 1}, regex=True), errors='coerce')


dtype('int64')

In [141]:
# re-checking the data types of the columns after transformation
df.dtypes

student_id                        object
age                                int64
gender                           float64
study_hours_per_day              float64
social_media_hours               float64
netflix_hours                    float64
part_time_job                      int64
attendance_percentage            float64
sleep_hours                      float64
diet_quality                       int64
exercise_frequency                 int64
parental_education_level           int64
internet_quality                   int64
mental_health_rating               int64
extracurricular_participation      int64
exam_score                       float64
dtype: object

In [142]:
# re-checking the total null values in the df after transformation
df.isnull().sum()

student_id                       0
age                              0
gender                           0
study_hours_per_day              0
social_media_hours               0
netflix_hours                    0
part_time_job                    0
attendance_percentage            0
sleep_hours                      0
diet_quality                     0
exercise_frequency               0
parental_education_level         0
internet_quality                 0
mental_health_rating             0
extracurricular_participation    0
exam_score                       0
dtype: int64

In [143]:
# setting target record for prediction and features for model training
y = df['exam_score']
features = ['age', 'study_hours_per_day', 'social_media_hours', 'netflix_hours', 'part_time_job', 'attendance_percentage', 'sleep_hours', 'diet_quality', 'exercise_frequency', 'parental_education_level', 'internet_quality', 'mental_health_rating', 'extracurricular_participation']
X = df[features]

In [144]:
# for splitting dataset for training and validation
from sklearn.model_selection import train_test_split

In [145]:
# separating the dataset into training and validation
train_X, val_x, train_y, val_y = train_test_split(X, y, random_state=0)

In [146]:
from sklearn.ensemble import RandomForestRegressor

In [147]:
# creating an instance of the regressor model
model = RandomForestRegressor()

In [148]:
# training the regression model
model.fit(train_X, train_y)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [149]:
# saving the predicted exam scores in a variable
exam_prediction_scores = model.predict(val_x)

In [150]:
# for evaluating model accuracy
from sklearn.metrics import mean_absolute_error

In [151]:
# model accuracy
mae = mean_absolute_error(val_y, exam_prediction_scores)
print(mae)

4.643156000000004


In [153]:
# essential for saving trained model
import joblib

In [155]:
# saving the model
joblib.dump(model, 'model.pkl')

['model.pkl']