# Training Multiple Models on the dataset to find the best Model

## Load the necessary libraries

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

#models
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV,ElasticNetCV
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor

# Metrices
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import KFold,cross_val_score

## Load the Dataset

In [30]:
dataset=pd.read_csv('data/student_data.csv')

In [31]:
dataset

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


## Check if the data containes any null/duplicates

In [32]:
dataset[dataset.isna().any(axis=1)]

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score


In [33]:
dataset.duplicated().sum()

np.int64(0)

## Splitting Dependent and independent variables into X,y

In this math_score is dependent feature(output feature) and all other are Independent Feature

In [34]:
dataset

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [35]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [36]:
X=dataset.drop('math_score',axis=1)
y=dataset['math_score']

In [37]:
X

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [38]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

## Performing train test split

In [39]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=4,test_size=0.3)

In [40]:
X_train

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
715,female,group B,associate's degree,free/reduced,completed,94,87
920,male,group D,high school,free/reduced,none,70,67
295,male,group B,associate's degree,free/reduced,none,62,60
83,male,group E,associate's degree,standard,none,64,63
942,male,group C,high school,standard,none,66,64
...,...,...,...,...,...,...,...
897,female,group B,some high school,free/reduced,completed,78,79
709,female,group D,associate's degree,free/reduced,completed,61,58
439,male,group D,some high school,standard,completed,88,82
174,female,group C,bachelor's degree,free/reduced,completed,51,54


In [41]:
y_test

698    57
577    81
763    62
790    48
520    71
       ..
53     88
282    73
656    77
995    88
50     53
Name: math_score, Length: 300, dtype: int64

## Now We will Encode,scale the data using columntransformer

### Checkout the categorical and numerical feature

In [42]:
categorical_features=X_train.select_dtypes(include='O').columns
numerical_features=X_train.select_dtypes(exclude='O').columns

In [43]:
categorical_features

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')

In [44]:
numerical_features

Index(['reading_score', 'writing_score'], dtype='object')

In [45]:
ohe=OneHotEncoder(drop='first')
scaler=StandardScaler()

In [46]:
transformer=ColumnTransformer([
    ('encoder',ohe,categorical_features),
    ('scaler',scaler,numerical_features)
])

In [47]:
X_train_transformed=transformer.fit_transform(X_train)

In [48]:
X_train_transformed

array([[ 0.        ,  1.        ,  0.        , ...,  0.        ,
         1.74080103,  1.27630782],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.06580348, -0.07356825],
       [ 1.        ,  1.        ,  0.        , ...,  1.        ,
        -0.49252904, -0.54602487],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.32205164,  0.9388388 ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
        -1.26023625, -0.95098768],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         1.67100946,  1.68127063]], shape=(700, 14))

In [49]:
X_test_transformed=transformer.transform(X_test)

In [50]:
X_test_transformed

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.62413599,  0.73635739],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
         1.53142633,  1.41129542],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
        -0.49252904, -0.34354346],
       ...,
       [ 1.        ,  0.        ,  1.        , ...,  1.        ,
        -0.14357122, -0.27604965],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         2.08975885,  1.81625824],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -0.98106999, -1.3559505 ]], shape=(300, 14))

## Training Multiple Model And Evaluation

### Model Evaluation

In [None]:
def evaluate(y_predicted,y_true):
    r2=r2_score(y_pred=y_predicted,y_true=y_true)
    mse=mean_squared_error(y_pred=y_predicted,y_true=y_true)
    mae=mean_absolute_error(y_pred=y_predicted,y_true=y_true)

    return (r2,mse,mae)

### training Multiple Models

In [71]:
fold=KFold(n_splits=5,shuffle=True,random_state=42)

In [104]:
models_evalute={}

In [105]:
def train_models(models):
    for model in models:
        print(f'======================= {model} ==============================')
        # training the model
        model.fit(X_train_transformed,y_train)
        # testing the model
        y_predicted=model.predict(X_test_transformed)
        
        r2,mse,mae=evaluate(y_predicted=y_predicted,y_true=y_test)
        models_evalute[model]=r2
        scores=cross_val_score(model,X_train_transformed,y_train,cv=fold,scoring='r2')
        print(f'r2 Score: {r2}')
        print(f'mean squared error: {mse}')
        print(f'mean absolute error: {mae}')
        print(f"Cross Val Score: {scores}")


In [106]:
# Model Objects
linear_regression=LinearRegression()
lasso=LassoCV(cv=5,random_state=4)
ridge=RidgeCV(cv=5)
elasticnet=ElasticNetCV(cv=5,random_state=42)
svr=SVR()
decisiontree=DecisionTreeRegressor()
adaboost=AdaBoostRegressor(random_state=42)
gradientboost=GradientBoostingRegressor()
xgboost=XGBRegressor()
catboost=CatBoostRegressor()



In [110]:
models=[linear_regression,lasso,ridge,elasticnet,svr,decisiontree,adaboost,gradientboost,xgboost]

In [111]:
train_models(models=models)

r2 Score: 0.86721653131702
mean squared error: 32.10404919801169
mean absolute error: 4.593828785908587
Cross Val Score: [0.83946718 0.89033229 0.88177183 0.87932425 0.86902046]
r2 Score: 0.8670428221375347
mean squared error: 32.1460481614343
mean absolute error: 4.585277707322263
Cross Val Score: [0.83712195 0.89036634 0.88195764 0.87906657 0.87108627]
r2 Score: 0.867049196652277
mean squared error: 32.14450694747952
mean absolute error: 4.592022186478344
Cross Val Score: [0.83940409 0.89053629 0.88167542 0.87922124 0.8694556 ]
r2 Score: 0.8651342167101918
mean squared error: 32.607505925316794
mean absolute error: 4.586880291920508
Cross Val Score: [0.83700862 0.89058834 0.88064931 0.87803567 0.87069855]
r2 Score: 0.7637264865364295
mean squared error: 57.12560890039336
mean absolute error: 5.587859877653411
Cross Val Score: [0.74146888 0.75593726 0.72934005 0.67753471 0.7648554 ]
r2 Score: 0.7025915154022953
mean squared error: 71.90666666666667
mean absolute error: 6.6466666666666

In [115]:
evaluation = pd.DataFrame(models_evalute.items(), columns=["Model", "Score"])

In [116]:
evaluation

Unnamed: 0,Model,Score
0,LinearRegression(),0.867217
1,"LassoCV(cv=5, random_state=4)",0.867043
2,RidgeCV(cv=5),0.867049
3,"ElasticNetCV(cv=5, random_state=42)",0.865134
4,SVR(),0.763726
5,DecisionTreeRegressor(),0.702592
6,"(DecisionTreeRegressor(max_depth=3, random_sta...",0.823892
7,([DecisionTreeRegressor(criterion='friedman_ms...,0.863193
8,"XGBRegressor(base_score=None, booster=None, ca...",0.807597


# By this Linear regression is working Well for this problem Statement