In [12]:
### importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')

In [5]:
### dataset

df = pd.read_csv('data/stud.csv')

df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
### X and Y variables

X = df.drop(columns=('math_score'), axis=1)
y = df['math_score']

In [7]:
## column transformer

num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

scaler = StandardScaler()
encoder = OneHotEncoder(drop='first',sparse_output=False)

preprocessor = ColumnTransformer(
    [
        ("onehotencoder", encoder, cat_features),
        ("scaler",scaler,num_features)
    ]
)

X = preprocessor.fit_transform(X)

In [9]:
X.shape

(1000, 14)

In [10]:
### train-test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [14]:
## evaluation

def evaluate_model(actual, predicted):
    mae = mean_absolute_error(actual, predicted)
    rmse = root_mean_squared_error(actual, predicted)
    r2 = r2_score(actual, predicted)

    return mae, rmse, r2


In [22]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNR': KNeighborsRegressor(),
    'SVR': SVR(),
    'DT': DecisionTreeRegressor(),
    'RF': RandomForestRegressor(),
    'XGB': XGBRegressor(),
    'ABR':AdaBoostRegressor(),
    'Catboost': CatBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print("training accuracy")
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for training set')
    print(f'Root mean squared error: {train_rmse}')
    print(f'mean absolute error: {train_mae}')
    print(f'R2 score: {train_r2}')

    print('Model performance for testing set')
    print(f'Root mean squared error: {test_rmse}')
    print(f'mean absolute error: {test_mae}')
    print(f'R2 score: {test_r2}')

    r2_list.append(test_r2)

    print('='*35)
    print('\n')


training accuracy
Linear Regression
Model performance for training set
Root mean squared error: 5.297244966554281
mean absolute error: 4.238267465725306
R2 score: 0.8742732380399838
Model performance for testing set
Root mean squared error: 5.482528123659775
mean absolute error: 4.337930600167405
R2 score: 0.8778243107659013


training accuracy
Ridge
Model performance for training set
Root mean squared error: 5.297738609100215
mean absolute error: 4.237067149041312
R2 score: 0.8742498043603679
Model performance for testing set
Root mean squared error: 5.481263082250013
mean absolute error: 4.3364427747772964
R2 score: 0.877880686026126


training accuracy
Lasso
Model performance for training set
Root mean squared error: 6.5515089629165475
mean absolute error: 5.183732838407028
R2 score: 0.8076862680441842
Model performance for testing set
Root mean squared error: 6.654135600779186
mean absolute error: 5.221710004672922
R2 score: 0.8200274147838305


training accuracy
KNR
Model performa

In [23]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['model name','r2 score']).sort_values(by=['r2 score'], ascending=False)

Unnamed: 0,model name,r2 score
1,Ridge,0.877881
0,Linear Regression,0.877824
9,Catboost,0.853325
6,RF,0.84709
8,ABR,0.837783
7,XGB,0.834163
2,Lasso,0.820027
3,KNR,0.787353
5,DT,0.756105
4,SVR,0.711069
