# Train Models and Choose The Best One!

### Import Basic Libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

### Import The Data

In [2]:
data_path = r"data/cleaned_students.csv"
df = pd.read_csv(data_path)

In [3]:
df.head()

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,B,bachelor,standard,none,72,72,74
1,female,C,some_college,standard,completed,69,90,88
2,female,B,masters,standard,none,90,95,93
3,male,A,associates,free/reduced,none,47,57,44
4,male,C,some_college,standard,none,76,78,75


In [4]:
df.columns

Index(['gender', 'race ethnicity group', 'parental level of education',
       'lunch', 'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [5]:
# seperate the input feature and target 
X = df.drop(columns = 'math score')
y = df['math score']

In [6]:
X.head()

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,reading score,writing score
0,female,B,bachelor,standard,none,72,74
1,female,C,some_college,standard,completed,90,88
2,female,B,masters,standard,none,95,93
3,male,A,associates,free/reduced,none,57,44
4,male,C,some_college,standard,none,78,75


In [7]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

## Importing Models

In [9]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [10]:
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder , StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
print("Categories in 'gender' variable:     ", end = " " )
print(df['gender'].unique())

print("Categories in 'race_ethnicity' variable:  " , end = " ")
print(df['race ethnicity group'].unique())

print("Categories in'parental level of education' variable:" , end = " " )
print(df['parental level of education'].unique())

print("Categories in 'lunch' variable:     " , end = " " )
print(df['lunch'].unique())

print("Categories in 'test preparation course' variable:     " , end = " " )
print(df['test preparation course'].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['B' 'C' 'A' 'D' 'E']
Categories in'parental level of education' variable: ['bachelor' 'some_college' 'masters' 'associates' 'high_school'
 'some_high_school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


In [12]:
X.head()

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,reading score,writing score
0,female,B,bachelor,standard,none,72,74
1,female,C,some_college,standard,completed,90,88
2,female,B,masters,standard,none,95,93
3,male,A,associates,free/reduced,none,57,44
4,male,C,some_college,standard,none,78,75


In [13]:
X.columns

Index(['gender', 'race ethnicity group', 'parental level of education',
       'lunch', 'test preparation course', 'reading score', 'writing score'],
      dtype='object')

In [27]:
# Define and evaluation function 
def evaluate_model(y_true , y_pred): 
    mae = mean_absolute_error(y_true , y_pred)
    rmse = np.sqrt(mean_squared_error(y_true , y_pred))
    r2 = r2_score(y_true , y_pred) 
    return mae , rmse , r2

In [15]:
# seperate the numerical and categorical columns 
numerical_features = ['reading score' , 'writing score']
categorical_features = ['gender', 'race ethnicity group', 'lunch', 'test preparation course']

In [16]:
# create the preprocessing pipeline 
numerical_pipeline = Pipeline(steps = [
    # we need to things: 1. Fill the missing values(imputer) and 2.Standard Scaler to centre the values
    ('imputer' , SimpleImputer(strategy = 'mean')), 
    ('scaler' , StandardScaler())
])

In [17]:
# create the categorical pipeline (nominal)
nominal_pipeline = Pipeline(steps = [
    # 1. Imputer , 2.Encoder
    ('imputer' , SimpleImputer(strategy = 'most_frequent')), 
    ('onehot' , OneHotEncoder(handle_unknown = 'ignore' , drop = 'first'))
])

In [18]:
df['parental level of education'].unique()

array(['bachelor', 'some_college', 'masters', 'associates', 'high_school',
       'some_high_school'], dtype=object)

In [28]:
education_order = [[
    'some_high_school', 
    'high_school',      
    'some_college',     
    'associates',       
    'bachelor',         
    'masters'   
]]

In [20]:
# create the categorical pipeline(ordinal)
ordinal_pipeline = Pipeline(steps = [
    # 1. imputer 2.encoder
    ('imputer' , SimpleImputer(strategy = 'most_frequent')), 
    ('ordinal' , OrdinalEncoder(categories = [education_order]))
])

In [21]:
ordinal_features = ['parental level of education']

In [26]:
# make the preprocessor 
preprocessor = ColumnTransformer(transformers = [
    ('num' , numerical_pipeline , numerical_features), 
    ('nom' , nominal_pipeline , categorical_features), 
    ('ord' , ordinal_pipeline , ordinal_features)
])

In [23]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [24]:
# do the train test split 
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

In [25]:
# function to print evaluations 
def print_model_evaluation(y_true , y_pred): 
    mae , rmse , r2 = evaluate_model(y_true , y_pred)

    print("MAE:" , mae)
    print("RMSE:" , rmse)
    print("R2 score:" , r2)

In [32]:
# train each model and evaluate 
model_list = []
r2_list = [] 

for name , model in models.items(): 
    # make the final pipeline 
    full_pipeline = Pipeline(steps = [
        # processor and trainer 
        ('preprocessor' , preprocessor), 
        ('model' , model)
    ])

    # train the model 
    full_pipeline.fit(X_train , y_train)
    # predict on both train and test data 
    y_pred_train = full_pipeline.predict(X_train)
    y_pred_test = full_pipeline.predict(X_test)

    print(f"Model name: {name}")
    # evaluate the model
    print("On training data")
    print_model_evaluation(y_train , y_pred_train)
    print('-' * 30) 
    print("On test data")
    print_model_evaluation(y_test , y_pred_test)
    print('=' * 30)

    model_list.append(name)
    r2 = r2_score(y_pred_test , y_test)
    r2_list.append(r2)

Model name: Linear Regression
On training data
MAE: 4.280355358943515
RMSE: 5.337022942627443
R2 score: 0.8736565467932157
------------------------------
On test data
MAE: 4.1819664183215135
RMSE: 5.368524617046373
R2 score: 0.8815597679452445
Model name: Lasso
On training data
MAE: 5.205260274468427
RMSE: 6.592500298650881
R2 score: 0.8072231322208645
------------------------------
On test data
MAE: 5.155701094273797
RMSE: 6.517328221922997
R2 score: 0.8254465092551198
Model name: Ridge
On training data
MAE: 4.279096664391439
RMSE: 5.337451682450269
R2 score: 0.8736362468446912
------------------------------
On test data
MAE: 4.18306551577803
RMSE: 5.368407452127496
R2 score: 0.8815649376668129
Model name: K-Neighbors Regressor
On training data
MAE: 4.53075
RMSE: 5.654162183736862
R2 score: 0.8581951363659792
------------------------------
On test data
MAE: 5.811
RMSE: 7.477446088070445
R2 score: 0.7702286198972296
Model name: Decision Tree
On training data
MAE: 0.01875
RMSE: 0.279508

In [33]:
performance = pd.DataFrame(data = {"Model" : model_list , 
                                   "r2 score" : r2_list
                                  })

In [35]:
performance = performance.sort_values(by = 'r2 score' , ascending = False)
performance

Unnamed: 0,Model,r2 score
0,Linear Regression,0.868134
2,Ridge,0.867493
7,CatBoosting Regressor,0.811997
5,Random Forest Regressor,0.810625
8,AdaBoost Regressor,0.801435
6,XGBRegressor,0.775552
1,Lasso,0.72816
4,Decision Tree,0.695384
3,K-Neighbors Regressor,0.622553


## Doing the same things but this time Parent Education will be encode as Nominal

In [37]:
# seperate the numerical and categorical columns 
numerical_features = ['reading score' , 'writing score']
categorical_features = ['gender', 'race ethnicity group', 'lunch', 'test preparation course' , 'parental level of education']

In [38]:
# create the preprocessing pipeline 
numerical_pipeline = Pipeline(steps = [
    # we need to things: 1. Fill the missing values(imputer) and 2.Standard Scaler to centre the values
    ('imputer' , SimpleImputer(strategy = 'mean')), 
    ('scaler' , StandardScaler())
])

In [39]:
# create the categorical pipeline (nominal)
nominal_pipeline = Pipeline(steps = [
    # 1. Imputer , 2.Encoder
    ('imputer' , SimpleImputer(strategy = 'most_frequent')), 
    ('onehot' , OneHotEncoder(handle_unknown = 'ignore' , drop = 'first'))
])

In [40]:
# make the preprocessor 
preprocessor = ColumnTransformer(transformers = [
    ('num' , numerical_pipeline , numerical_features), 
    ('nom' , nominal_pipeline , categorical_features), 
])

In [41]:
# train each model and evaluate 
model_list = []
r2_list = [] 

for name , model in models.items(): 
    # make the final pipeline 
    full_pipeline = Pipeline(steps = [
        # processor and trainer 
        ('preprocessor' , preprocessor), 
        ('model' , model)
    ])

    # train the model 
    full_pipeline.fit(X_train , y_train)
    # predict on both train and test data 
    y_pred_train = full_pipeline.predict(X_train)
    y_pred_test = full_pipeline.predict(X_test)

    print(f"Model name: {name}")
    # evaluate the model
    print("On training data")
    print_model_evaluation(y_train , y_pred_train)
    print('-' * 30) 
    print("On test data")
    print_model_evaluation(y_test , y_pred_test)
    print('=' * 30)

    model_list.append(name)
    r2 = r2_score(y_pred_test , y_test)
    r2_list.append(r2)

Model name: Linear Regression
On training data
MAE: 4.266711846071957
RMSE: 5.323050852720514
R2 score: 0.8743172040139593
------------------------------
On test data
MAE: 4.214763142474852
RMSE: 5.393993869732845
R2 score: 0.8804332983749564
Model name: Lasso
On training data
MAE: 5.205260274468427
RMSE: 6.592500298650881
R2 score: 0.8072231322208645
------------------------------
On test data
MAE: 5.155701094273797
RMSE: 6.517328221922997
R2 score: 0.8254465092551198
Model name: Ridge
On training data
MAE: 4.26500882127927
RMSE: 5.323498366558606
R2 score: 0.8742960705864395
------------------------------
On test data
MAE: 4.212526820759747
RMSE: 5.393615342000716
R2 score: 0.8804500791594524
Model name: K-Neighbors Regressor
On training data
MAE: 4.4595
RMSE: 5.571929647797072
R2 score: 0.8622898815176547
------------------------------
On test data
MAE: 5.705
RMSE: 7.362105677046479
R2 score: 0.7772624460422194
Model name: Decision Tree
On training data
MAE: 0.01875
RMSE: 0.27950849

In [42]:
performance = pd.DataFrame(data = {"Model" : model_list , 
                                   "r2 score" : r2_list
                                  })

In [43]:
performance = performance.sort_values(by = 'r2 score' , ascending = False)
performance

Unnamed: 0,Model,r2 score
0,Linear Regression,0.867044
2,Ridge,0.866414
5,Random Forest Regressor,0.826017
8,AdaBoost Regressor,0.813247
7,CatBoosting Regressor,0.810315
6,XGBRegressor,0.783747
1,Lasso,0.72816
4,Decision Tree,0.707487
3,K-Neighbors Regressor,0.63416
