In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
df=pd.read_csv('data/stud.csv')

In [3]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
X=df.drop('math_score',axis=1)

X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [5]:
y=df['math_score']

In [6]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()


numerical_features

['reading_score', 'writing_score']

In [7]:
categorical_features

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [8]:
for col in categorical_features:
    print(f'The number of unique values in {col} is {df[col].nunique()}')
    print(f'The unique values in {col} are {df[col].unique()}')

The number of unique values in gender is 2
The unique values in gender are ['female' 'male']
The number of unique values in race_ethnicity is 5
The unique values in race_ethnicity are ['group B' 'group C' 'group A' 'group D' 'group E']
The number of unique values in parental_level_of_education is 6
The unique values in parental_level_of_education are ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
The number of unique values in lunch is 2
The unique values in lunch are ['standard' 'free/reduced']
The number of unique values in test_preparation_course is 2
The unique values in test_preparation_course are ['none' 'completed']


In [9]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

scaler=StandardScaler()
oneHot=OneHotEncoder()

In [10]:
preprocessor=ColumnTransformer(
    [

        ('OneHotEncoder',oneHot,categorical_features),
        ('StandardScaler',scaler,numerical_features)
    ]
)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
29,female,group D,master's degree,standard,none,70,75
535,female,group C,bachelor's degree,free/reduced,completed,83,83
695,female,group D,some college,free/reduced,none,89,86
557,male,group C,master's degree,free/reduced,none,67,66
836,male,group E,high school,standard,none,64,57
...,...,...,...,...,...,...,...
106,female,group D,master's degree,standard,none,100,100
270,male,group C,bachelor's degree,standard,none,63,61
860,female,group C,associate's degree,standard,none,62,53
435,male,group C,some college,free/reduced,completed,48,53


In [13]:
X_train=preprocessor.fit_transform(X_train)

In [14]:
X_test=preprocessor.transform(X_test)

In [15]:
X_train

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.03079054,  0.43405338],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.9302895 ,  0.96470125],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.34544287,  1.1636942 ],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -0.52274728, -1.02522827],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -1.49143847, -1.02522827],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.48382733,  1.36268716]])

In [17]:
X_test

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.13786619,  1.03103224],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.24597837,  0.30139141],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
         0.23836723,  0.23506043],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.79190505,  0.50038436],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.51513614,  0.36772239],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.30755945,  0.23506043]])

In [19]:
X_train.shape, X_test.shape
y_train.shape, y_test.shape

((800,), (200,))

In [20]:
def evaluate_model(actual, predicted):
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    
    return mse, rmse, r2, mae

In [21]:
models={
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Regression': SVR(),
    'CatBoost': CatBoostRegressor(verbose=0),
    'XGBoost': XGBRegressor(use_label_encoder=False, eval_metric='rmse'),
    'LightGBM': LGBMRegressor()
}

model_list=[]
r2_scores=[]

for i in range(len(list(models))):
    model_name=list(models.keys())[i]
    model=models[model_name]
    
    model.fit(X_train,y_train)
    
    y_pred=model.predict(X_test)
    
    mse, rmse, r2, mae = evaluate_model(y_test, y_pred)
    
    print(f'{model_name} - MSE: {mse}, RMSE: {rmse}, R2: {r2}, MAE: {mae}')
    
    model_list.append(model_name)
    r2_scores.append(r2)

Linear Regression - MSE: 29.095169866715537, RMSE: 5.393993869732848, R2: 0.8804332983749563, MAE: 4.214763142474853
Ridge Regression - MSE: 29.05660162669478, RMSE: 5.390417574427308, R2: 0.8805917946912827, MAE: 4.211112826071162
Lasso Regression - MSE: 42.47580702071846, RMSE: 6.517346624257334, R2: 0.825445523514914, MAE: 5.155719544015369
K-Nearest Neighbors - MSE: 52.0418, RMSE: 7.214000277238697, R2: 0.7861340421404924, MAE: 5.575
Decision Tree - MSE: 59.315, RMSE: 7.701623205532714, R2: 0.7562448014781062, MAE: 6.115
Random Forest - MSE: 36.1894128125, RMSE: 6.015763693206375, R2: 0.8512794823484497, MAE: 4.649466666666667
Gradient Boosting - MSE: 30.9934880775489, RMSE: 5.567179544217063, R2: 0.8726321530940087, MAE: 4.296229317760538
Support Vector Regression - MSE: 65.91917898190714, RMSE: 8.119062691093545, R2: 0.7291049050133183, MAE: 5.3951430812872605
CatBoost - MSE: 36.055319852755574, RMSE: 6.004608218090134, R2: 0.8518305378322716, MAE: 4.608757793199379
XGBoost - MSE

In [22]:
r2_scores

[0.8804332983749563,
 0.8805917946912827,
 0.825445523514914,
 0.7861340421404924,
 0.7562448014781062,
 0.8512794823484497,
 0.8726321530940087,
 0.7291049050133183,
 0.8518305378322716,
 0.8277965784072876,
 0.8411071675178521]