In [3]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import r2_score , mean_squared_error , mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV , train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor , AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression , Lasso , Ridge
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [4]:
df = pd.read_csv("data/StudentsPerformance.csv")

In [5]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
X = df.drop({"math score"} , axis = 1)

In [7]:
y = df["math score"]

In [8]:
num_features = X.select_dtypes(exclude = "object").columns
cat_features = X.select_dtypes(include = "object").columns

In [9]:
num_features

Index(['reading score', 'writing score'], dtype='object')

In [10]:
cat_features

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course'],
      dtype='object')

In [11]:
# Transforming

from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer

num_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer([
    ("One Hot Encoder" , oh_transformer , cat_features),
    ("Standard Scaler" , num_transformer , num_features)
]
)

In [12]:
X = preprocessor.fit_transform(X)

In [14]:
X.shape

(1000, 19)

In [15]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [16]:
X_train , X_test , y_train , y_test = train_test_split(X , y , random_state=42 , test_size=0.20)

In [17]:
X_train.shape , X_test.shape

((800, 19), (200, 19))

In [18]:
def evaluate_model(true , predicted):
    mae = mean_absolute_error(true , predicted)
    mse = mean_squared_error(true , predicted)
    r2score = r2_score(true , predicted)
    rmse = np.sqrt(mse)
    return mae  , r2score , rmse

In [19]:
models = {
    "Linear Regresson" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Nearest Neighbours" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest Regressor" : RandomForestRegressor(),
    "XGBRegressor" : XGBRegressor(),
    "Cat Boost Regressor" : CatBoostRegressor(),
    "Ada Boost regressor" : AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train , y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_r2_score , model_train_rmse = evaluate_model(y_train , y_train_pred)
    model_test_mae , model_test_r2_score , model_test_rmse = evaluate_model(y_test , y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance Measures for Training Data")
    print("Root Mean Square Error : {}".format(model_train_rmse))
    print("Mean absolute Error : {}".format(model_train_mae))
    print("R2 Score : {}".format(model_train_r2_score))

    print("----------------------------------------------------")

    print("Model Performance Measures for Test Data")
    print("Root Mean Square Error : {}".format(model_test_rmse))
    print("Mean absolute Error : {}".format(model_test_mae))
    print("R2 Score : {}".format(model_test_r2_score))

    r2_list.append(model_test_r2_score)


Linear Regresson
Model Performance Measures for Training Data
Root Mean Square Error : 5.327359474652332
Mean absolute Error : 4.27880859375
R2 Score : 0.8741136595329527
----------------------------------------------------
Model Performance Measures for Test Data
Root Mean Square Error : 5.40960336750764
Mean absolute Error : 4.2259375
R2 Score : 0.8797402769404664
Lasso
Model Performance Measures for Training Data
Root Mean Square Error : 6.593815587795566
Mean absolute Error : 5.206302661246526
R2 Score : 0.8071462015863456
----------------------------------------------------
Model Performance Measures for Test Data
Root Mean Square Error : 6.51969453566742
Mean absolute Error : 5.157881810347763
R2 Score : 0.8253197323627852
Ridge
Model Performance Measures for Training Data
Root Mean Square Error : 5.323324922741654
Mean absolute Error : 4.264987823725981
R2 Score : 0.8743042615212909
----------------------------------------------------
Model Performance Measures for Test Data
Roo

In [20]:
pd.DataFrame(list(zip(model_list , r2_list)) , columns=["model name" , "r2_score"]).sort_values(by=["r2_score"] , ascending=False)

Unnamed: 0,model name,r2_score
2,Ridge,0.880593
0,Linear Regresson,0.87974
5,Random Forest Regressor,0.856165
8,Ada Boost regressor,0.854832
7,Cat Boost Regressor,0.851632
6,XGBRegressor,0.827797
1,Lasso,0.82532
3,K-Nearest Neighbours,0.783898
4,Decision Tree,0.753697
