In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

#Modelling

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings


In [2]:
df=pd.read_csv('data/student_perf.csv')

In [3]:
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
X = df.drop(columns=['math score'],axis=1)

In [6]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [5]:
y = df['math score']

In [7]:
print("Categories in 'Gender': ", end=" ")
print(df['gender'].unique())

print("Categories in 'Race/Ethnicity': ", end=" ")
print(df['race/ethnicity'].unique())

print("Categories in 'Parental level of education': ", end=" ")
print(df['parental level of education'].unique())

print("Categories in 'Lunch': ", end=" ")
print(df['lunch'].unique())

print("Categories in 'Test Preparation Course': ", end=" ")
print(df['test preparation course'].unique())


Categories in 'Gender':  ['female' 'male']
Categories in 'Race/Ethnicity':  ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in 'Parental level of education':  ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'Lunch':  ['standard' 'free/reduced']
Categories in 'Test Preparation Course':  ['none' 'completed']


In [None]:
#Column Transformer - 3 Types of Transformers
# Define Numerical and Categorical Features
# numerical_features = [feature for feature in df.columns if df[feature].dtype != 'O']    # dtype is not object
# categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']  # dtype is object
numerical_features=X.select_dtypes(exclude="object").columns
categorical_features=X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

# create pipeline
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoding", oh_transformer, categorical_features),
        ("StandardScaler", numeric_transformer, numerical_features)
    ]
)  # can be used to fit/transform 

In [10]:
X = preprocessor.fit_transform(X)

In [11]:
X.shape

(1000, 19)

In [12]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]], shape=(1000, 19))

In [14]:
# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 42)
X_train.shape, X_test.shape


((800, 19), (200, 19))

In [27]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
   # mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_scr = r2_score(true, predicted)
    #return mae, mse, rmse, r2_scr
    return mae, rmse, r2_scr

In [28]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGB Regressor": XGBRegressor(),
    "CatBoost Regressor": CatBoostRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # training the model

    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance for Training Set")
    print("RSME: {:.4f}".format(model_train_rmse))
    print("MAE: {:.4f}".format(model_train_mae))
    print("R2 Score: {:.4f}".format(model_train_r2))

    print("/nModel Performance for Test Set")
    print("RSME: {:.4f}".format(model_train_rmse))
    print("MAE: {:.4f}".format(model_test_mae))
    print("R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model Performance for Training Set
RSME: 5.3231
MAE: 4.2667
R2 Score: 0.8743
/nModel Performance for Test Set
RSME: 5.3231
MAE: 4.2148
R2 Score: 0.8804


Lasso
Model Performance for Training Set
RSME: 6.5938
MAE: 5.2063
R2 Score: 0.8071
/nModel Performance for Test Set
RSME: 6.5938
MAE: 5.1579
R2 Score: 0.8253


Ridge
Model Performance for Training Set
RSME: 5.3233
MAE: 4.2650
R2 Score: 0.8743
/nModel Performance for Test Set
RSME: 5.3233
MAE: 4.2111
R2 Score: 0.8806


KNeighbors Regressor
Model Performance for Training Set
RSME: 5.7079
MAE: 4.5168
R2 Score: 0.8555
/nModel Performance for Test Set
RSME: 5.7079
MAE: 5.6210
R2 Score: 0.7838


Decision Tree
Model Performance for Training Set
RSME: 0.2795
MAE: 0.0187
R2 Score: 0.9997
/nModel Performance for Test Set
RSME: 0.2795
MAE: 6.3500
R2 Score: 0.7361


Random Forest Regressor
Model Performance for Training Set
RSME: 2.2841
MAE: 1.8182
R2 Score: 0.9769
/nModel Performance for Test Set
RSME: 2.2841
MAE: 4.6214
R2 Sco