# Training the model

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
#from xgboost import XGBRegressor
import warnings



In [5]:
df = pd.read_csv('data/StudentsPerformance.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [12]:
x = df.drop(columns=['math score'], axis=1)
y = df['math score']

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_features = x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include='object').columns

num_t = StandardScaler()
oh = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', oh, cat_features),
        ('StandardScaler', num_t, num_features),
    ]
)

In [14]:
x = preprocessor.fit_transform(x)

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [16]:
def evaluate(true, predicted):
    mse = mean_squared_error(true, predicted)
    r2 = r2_score(true, predicted)
    return mse, r2

In [18]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'CatBoostRegressor': CatBoostRegressor(verbose=False),
    'AdaBoostRegressor': AdaBoostRegressor()
}
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    train_mse, train_r2 = evaluate(y_train, y_train_pred)
    test_mse, test_r2 = evaluate(y_test, y_test_pred)

    print(list(models.keys())[i])
    print('Model performance for train data:')
    print(f'MSE: {train_mse}')
    print(f'R2 score: {train_r2}')
    print('--------------')
    print('Model performance for test data:')
    print(f'MSE: {test_mse}')
    print(f'R2 score: {test_r2}')
    print('--------------')
    print('--------------')



LinearRegression
Model performance for train data:
MSE: 28.71723090277778
R2 score: 0.8702651718123205
--------------
Model performance for test data:
MSE: 26.524375
R2 score: 0.9114754987442741
--------------
--------------
Lasso
Model performance for train data:
MSE: 43.82489472415989
R2 score: 0.8020138081338455
--------------
Model performance for test data:
MSE: 41.3245100596335
R2 score: 0.8620803829433763
--------------
--------------
Ridge
Model performance for train data:
MSE: 28.55820416692007
R2 score: 0.8709836013267677
--------------
Model performance for test data:
MSE: 27.064733901538162
R2 score: 0.9096720631399382
--------------
--------------
KNeighborsRegressor
Model performance for train data:
MSE: 33.03982222222222
R2 score: 0.8507371524133668
--------------
Model performance for test data:
MSE: 61.90160000000001
R2 score: 0.7934048109736256
--------------
--------------
DecisionTreeRegressor
Model performance for train data:
MSE: 0.18055555555555555
R2 score: 0.99