In [18]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [19]:
df=pd.read_csv('data/data.csv')

In [20]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Independant and Dependant Variables

In [21]:
X=df.drop(columns='math_score',axis=1)
y=df['math_score']

In [22]:
X

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [23]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [24]:
num_features=X.select_dtypes(exclude='object').columns
cat_features=X.select_dtypes(include='object').columns

numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()

preprocessor=ColumnTransformer(
    [("OneHotEncoder",oh_transformer,cat_features),
     ("StandaredScaler",numeric_transformer,num_features)]
)

In [25]:
X=preprocessor.fit_transform(X)

In [26]:
X.shape

(1000, 19)

### Train-Test Split

In [27]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((750, 19), (250, 19), (750,), (250,))

### Model Evaluation Function

In [32]:
def evaluate_model(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    #rmse=np.sqrt(mean_squared_error(true,predicted))
    rsquared=r2_score(true,predicted)
    return mse,mae,rsquared

In [33]:
models={
    "Linear regression": LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K-Nearest Neighbours":KNeighborsRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "Random Forest regressor":RandomForestRegressor(),
    "XGBoost Regressor":XGBRegressor(),
    "Cat Boost Regressor":CatBoostRegressor(),
    "Ada Boost regressor":AdaBoostRegressor()    
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) # Train Model
    
    # Make Predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear regression
Model performance for Training set
- Root Mean Squared Error: 4.2633
- Mean Absolute Error: 28.2305
- R2 Score: 0.8735
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.3230
- Mean Absolute Error: 29.7810
- R2 Score: 0.8790


Lasso
Model performance for Training set
- Root Mean Squared Error: 5.1837
- Mean Absolute Error: 42.9223
- R2 Score: 0.8077
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.2217
- Mean Absolute Error: 44.2775
- R2 Score: 0.8200


Ridge
Model performance for Training set
- Root Mean Squared Error: 4.2368
- Mean Absolute Error: 28.0642
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.3354
- Mean Absolute Error: 30.0173
- R2 Score: 0.8780


K-Nearest Neighbours
Model performance for Training set
- Root Mean Squared Error: 4.5861
- Mean Absolute Error: 33.5462
- R2 Score: 0.8497
-----------------

In [34]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'rsquared']).sort_values(by=["rsquared"],ascending=False)

Unnamed: 0,Model Name,rsquared
0,Linear regression,0.878951
2,Ridge,0.87799
7,Cat Boost Regressor,0.85483
5,Random Forest regressor,0.849171
8,Ada Boost regressor,0.839924
6,XGBoost Regressor,0.83629
1,Lasso,0.820027
3,K-Nearest Neighbours,0.793207
4,Decision Tree,0.763616
