In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load dataset
train_set = pd.read_csv('stud.csv')

In [3]:
train_set.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Basic info
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [5]:
# Duplicate values
train_set.duplicated().sum()


0

## Feature Engineering

In [6]:
# Total score
train_set['total score'] = train_set['math score'] + train_set['reading score'] + train_set['writing score']
# Average score
train_set['average score'] = round(train_set['total score'] /3,2)

In [7]:
train_set.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.67
1,female,group C,some college,standard,completed,69,90,88,247,82.33
2,female,group B,master's degree,standard,none,90,95,93,278,92.67
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.33
4,male,group C,some college,standard,none,76,78,75,229,76.33


In [8]:
# Assigning features to x and y
x = train_set.drop(columns='total score')
y = train_set['total score']

In [9]:
X_train, X_test, y_train, y_test =train_test_split(x, y,test_size=0.2, random_state=23)

In [10]:
cat_features = X_train.select_dtypes('object').columns
num_features = X_train.select_dtypes(exclude='object').columns

In [32]:
# Transformer 1
trnf_1 = ColumnTransformer([
    ('ohe',OneHotEncoder(drop='first',sparse_output=False, handle_unknown='ignore'), cat_features),
    ('ss', StandardScaler(), num_features)
], remainder='passthrough')

In [33]:
X_train_new = trnf_1.fit_transform(X_train)
X_test_new =trnf_1.transform(X_test)

In [35]:
X_train_new

array([[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.77294003e-01, -2.79958916e-01,  7.58475188e-04],
       [ 1.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
        -1.05071641e+00, -7.38639410e-01, -6.49960927e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.88286402e+00,  1.81686620e+00,  1.65161895e+00],
       ...,
       [ 1.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
        -5.73156803e-01, -9.35216764e-01, -7.89450617e-01],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         9.95967609e-01,  8.99505211e-01,  1.00020210e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00, ...,
         6.54853607e-01,  2.44247363e-01,  7.05033201e-02]])

In [36]:
# Model evaluation metrics
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [37]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_new, y_train)  # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train_new)
    y_test_pred = model.predict(X_test_new)
    
    # Evaluate train and test data metrics
    model_train_mae, model_train_mse,model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae,model_test_mse,model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model performance for Training set:")
    print("Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("Mean Squared Error: {:.4f}".format(model_train_mse))
    print("Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("R2 Score {:.4f}".format(model_train_r2))
    
    print("Model performance for Test set:")
    print("Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("Mean Squared Error: {:.4f}".format(model_test_mse))
    print("Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("R2 Score {:.4f}".format(model_test_r2))
    
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set:
Mean Absolute Error: 0.0000
Mean Squared Error: 0.0000
Root Mean Squared Error: 0.0000
R2 Score 1.0000
Model performance for Test set:
Mean Absolute Error: 0.0000
Mean Squared Error: 0.0000
Root Mean Squared Error: 0.0000
R2 Score 1.0000


Lasso
Model performance for Training set:
Mean Absolute Error: 0.8019
Mean Squared Error: 1.0001
Root Mean Squared Error: 1.0000
R2 Score 0.9995
Model performance for Test set:
Mean Absolute Error: 0.7591
Mean Squared Error: 0.9385
Root Mean Squared Error: 0.9688
R2 Score 0.9995


Ridge
Model performance for Training set:
Mean Absolute Error: 0.0141
Mean Squared Error: 0.0003
Root Mean Squared Error: 0.0177
R2 Score 1.0000
Model performance for Test set:
Mean Absolute Error: 0.0142
Mean Squared Error: 0.0003
Root Mean Squared Error: 0.0178
R2 Score 1.0000


K-Neighbors Regressor
Model performance for Training set:
Mean Absolute Error: 4.0445
Mean Squared Error: 27.0235
Root Mean Squared Error: 5.1

In [38]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name','R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Linear Regression,1.0
2,Ridge,1.0
1,Lasso,0.999457
4,Decision Tree,0.99932
5,Random Forest Regressor,0.999199
6,AdaBoost Regressor,0.993343
3,K-Neighbors Regressor,0.97963
