In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.model_selection import train_test_split



In [3]:
df = pd.read_csv('StudentPerformanceFactors.csv')
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [4]:
x = df.drop(columns = ['Exam_Score'], axis = 1)
y =  df['Exam_Score']

Column Transformation and Preprocesssing

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [6]:

categorical_features = x.select_dtypes(include = 'object').columns
numerical_features =  x.select_dtypes(include = ['int64', 'float64']).columns

In [7]:
num_trans =  StandardScaler()
OH_trans =  OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncodeR",OH_trans, categorical_features ),
        ("StandardScaler", num_trans, numerical_features),
    ]
)

x_transformed= preprocessor.fit_transform(x)

#print(x_transformed)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(x_transformed, y , test_size=0.33, random_state=42)

Evaluation Function for all the metrics after training 

In [9]:
def evaluate_model(true,predicted):
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    mae = mean_absolute_error(true, predicted)
    mse =  mean_squared_error(true, predicted)
    r2_square=  r2_score(true, predicted)
    return mae, mse, r2_square

In [14]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor

models= {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'CatBoostRegressor': CatBoostRegressor(verbose=0),
}
model_list = []
r2_list = []


for i in range (len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    #make predictions
    y_train_pred = model.predict(X_train)
    y_test_predict = model.predict(X_test)


    #Evaluating test and train datasets

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2c= evaluate_model(y_test, y_test_predict)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Performance For Training Set')
    print('-Root Mean Squared Error:{:4f}' .format(model_train_rmse))
    print('-Mean Absolute Error: {:4f}' .format(model_train_mae))
    print('-R2 Score:{:4f}' .format(model_test_r2c))
    print('-------------------------------')


    print('Model Performance For Test set')
    print('-Root Mean Squared Error:{:4f}'.format(model_test_rmse))
    print('-Mean Absolute Error: {:4f}'.format(model_test_mae))
    print('-R2 Score{:4f}'.format(model_test_r2c))

    r2_list.append(model_test_r2c)

    print ('='*35)
    print('\n')


Linear Regression
Model Performance For Training Set
-Root Mean Squared Error:4.572144
-Mean Absolute Error: 0.511988
-R2 Score:0.764953
-------------------------------
Model Performance For Test set
-Root Mean Squared Error:3.242048
-Mean Absolute Error: 0.466427
-R2 Score0.764953


Lasso
Model Performance For Training Set
-Root Mean Squared Error:9.533494
-Mean Absolute Error: 1.959692
-R2 Score:0.438806
-------------------------------
Model Performance For Test set
-Root Mean Squared Error:7.740657
-Mean Absolute Error: 1.906200
-R2 Score0.438806


Ridge
Model Performance For Training Set
-Root Mean Squared Error:4.571449
-Mean Absolute Error: 0.508944
-R2 Score:0.765338
-------------------------------
Model Performance For Test set
-Root Mean Squared Error:3.236736
-Mean Absolute Error: 0.462250
-R2 Score0.765338


ElasticNet
Model Performance For Training Set
-Root Mean Squared Error:9.297414
-Mean Absolute Error: 1.918134
-R2 Score:0.457012
-------------------------------
Model P