In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset from the project-relative CSV file.
df = pd.read_csv('data/stud.csv')

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Feature frame: every column except the regression target.
X = df.drop(columns=['math_score'])

In [5]:
# Preview the feature frame to confirm the target column is gone.
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [15]:
# Split the feature columns into a numeric group and an "everything else" group.
# Selecting on the numeric dtype family (instead of "anything that is not
# object") keeps bool, datetime and string-dtype columns out of the numeric
# list, and the dtype lookup runs on X itself rather than reaching back into df.
numeric_columns = X.select_dtypes(include='number').columns.tolist()
categorical_columns = X.select_dtypes(exclude='number').columns.tolist()

In [16]:
# Inspect the names collected for the numeric group.
numeric_columns

['reading_score', 'writing_score']

In [17]:
# Inspect the names collected for the categorical group.
categorical_columns

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [12]:
# Report the cardinality and the distinct labels of every categorical feature.
for column in categorical_columns:
    values = df[column]
    print(f'The number of unique values in column {column} are {values.nunique()} ')
    print(f'The unique values are: {values.unique()}')

The number of unique values in column gender are 2 
The unique values are: ['female' 'male']
The number of unique values in column race_ethnicity are 5 
The unique values are: ['group B' 'group C' 'group A' 'group D' 'group E']
The number of unique values in column parental_level_of_education are 6 
The unique values are: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
The number of unique values in column lunch are 2 
The unique values are: ['standard' 'free/reduced']
The number of unique values in column test_preparation_course are 2 
The unique values are: ['none' 'completed']


In [13]:
# Regression target: the math score column of the raw frame.
y = df['math_score']

In [14]:
# Display the target series (pandas elides the middle rows in the output).
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [18]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Transformer instances created with library-default settings; they are bound
# to their column groups when the ColumnTransformer is built.
scalar = StandardScaler()
oh_encode = OneHotEncoder()

In [20]:
# Bind each transformer to the column group it should process.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_encode, categorical_columns),
        ("StandardScaler", scalar, numeric_columns),
    ]
)

In [21]:
# Encode and scale the features. The input is rebuilt from df so this cell can
# be re-run on its own: the original `X = f(X)` form rebound X to the
# transformed array, so a second execution would feed that array back into a
# transformer that selects columns by name.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split that follows, so the fitted scaler has seen the test rows — consider
# fitting on the training split only and transforming the test split with it.
X = preprocessor.fit_transform(df.drop(columns=['math_score']))

In [22]:
# Shape of the feature matrix after preprocessing.
X.shape

(1000, 19)

In [23]:
# Shape of the raw frame, for comparison with the transformed features.
df.shape

(1000, 8)

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [28]:
# Confirm the sizes of the training and test partitions.
X_train.shape,X_test.shape

((900, 19), (100, 19))

In [34]:
def evaluate_model(actual, predicted):
    """Compute regression error metrics for one set of predictions.

    Args:
        actual: Ground-truth target values.
        predicted: Predicted values, aligned element-wise with ``actual``.

    Returns:
        A tuple ``(mae, mse, rmse, r2_sc)`` holding the mean absolute error,
        the mean squared error, its square root, and the R2 score.
    """
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    # Derive RMSE from the MSE already computed instead of scoring the
    # predictions a second time.
    rmse = np.sqrt(mse)
    r2_sc = r2_score(actual, predicted)

    return mae, mse, rmse, r2_sc

In [37]:
# Candidate regressors, keyed by the label used in the printed report.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "Cat Boost Regressor": CatBoostRegressor(verbose=False),
    "AdaBoostRegressor": AdaBoostRegressor(),
}

# Parallel lists: r2_list[k] is the test-set R2 of the model named model_list[k].
model_list = []
r2_list = []

# Iterate over (label, estimator) pairs directly rather than rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Score on both partitions so over- or under-fitting shows up in the report.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_mae, train_mse, train_rmse, train_r2_score = evaluate_model(y_train, y_train_pred)
    test_mae, test_mse, test_rmse, test_r2_score = evaluate_model(y_test, y_test_pred)

    print(model_name)

    model_list.append(model_name)

    print("Model Performance on Training data")
    print(f"Root Mean Square Value {train_rmse:.4f}")
    print(f"Mean Square Error :{train_mse:.4f}")
    print(f"Mean Absolute Error : {train_mae:.4f}")
    print(f"R2 score:{train_r2_score:.4f}")

    print("------------------------------------------------------")

    print("Model Performance on Testing data")
    print(f"Root Mean Square Value {test_rmse:.4f}")
    print(f"Mean Square Error :{test_mse:.4f}")
    print(f"Mean Absolute Error : {test_mae:.4f}")
    print(f"R2 score:{test_r2_score:.4f}")

    # Record the held-out R2 next to the model label; the original loop
    # declared r2_list but never appended to it, leaving it empty.
    r2_list.append(test_r2_score)

LinearRegression
Model Performance on Training data
Root Mean Square Value 5.3438
Mean Square Error :28.5560
Mean Absolute Error : 4.2679
R2 score:0.8710
------------------------------------------------------
Model Performance on Testing data
Root Mean Square Value 5.2055
Mean Square Error :27.0972
Mean Absolute Error : 4.0722
R2 score:0.9096
Lasso
Model Performance on Training data
Root Mean Square Value 6.6200
Mean Square Error :43.8249
Mean Absolute Error : 5.2520
R2 score:0.8020
------------------------------------------------------
Model Performance on Testing data
Root Mean Square Value 6.4284
Mean Square Error :41.3245
Mean Absolute Error : 4.8939
R2 score:0.8621
Ridge
Model Performance on Training data
Root Mean Square Value 5.3440
Mean Square Error :28.5582
Mean Absolute Error : 4.2663
R2 score:0.8710
------------------------------------------------------
Model Performance on Testing data
Root Mean Square Value 5.2024
Mean Square Error :27.0647
Mean Absolute Error : 4.0674
R2 