In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [27]:
data=pd.read_csv('data/stud.csv')

In [28]:
data.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [29]:
x=data.drop('math score',axis=1)

In [30]:
y=data['math score']

In [31]:
num_data=x.select_dtypes([int,float]).columns
cat_data=x.select_dtypes([object]).columns

In [32]:
one_hot_encoding=OneHotEncoder()
standard_scaler=StandardScaler()


In [33]:
from sklearn.compose import ColumnTransformer
preprocessor=ColumnTransformer([('num',standard_scaler,num_data),
                                 ('cat',one_hot_encoding,cat_data) ,])

In [34]:
X=preprocessor.fit_transform(x)

In [35]:
X

array([[ 0.19399858,  0.39149181,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.42747598,  1.31326868,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.77010859,  1.64247471,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.12547206, -0.20107904,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60515772,  0.58901542,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.15336989,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

In [36]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.30,random_state=42)

In [37]:
xtrain.shape

(700, 19)

In [38]:
xtest.shape

(300, 19)

In [39]:
print(ytrain.shape)
print(ytest.shape)

(700,)
(300,)


In [40]:
def evaluate_model(true,predict):
    mae=mean_absolute_error(true,predict)
    mse=mean_squared_error(true,predict)
    rmse=np.sqrt(mean_squared_error(true,predict))
    r2=r2_score(true,predict)
    return mae,rmse,r2

In [41]:
models={
    'LinearRegression':LinearRegression(),
    "lasso":Lasso(),
    "Ridge":Ridge(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBoostRegressor":CatBoostRegressor()
    
}

In [42]:
model_list=[]
r2_list=[]

In [43]:

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(xtrain,ytrain)
  #Make predictions 
    y_train_pred=model.predict(xtrain)
    y_test_pred=model.predict(xtest)
    model_train_mae,model_train_rmse,model_train_r2=evaluate_model(ytrain,y_train_pred)
    model_test_mae,model_test_rmse,model_test_r2=evaluate_model(ytest,y_test_pred)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('-'*35)
    print('\n')

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 5.2483
- Mean Absolute Error: 4.1985
- R2 Score: 0.8751
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.5576
- Mean Absolute Error: 4.4183
- R2 Score: 0.8759
-----------------------------------


lasso
Model performance for Training set
- Root Mean Squared Error: 6.5106
- Mean Absolute Error: 5.1582
- R2 Score: 0.8078
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.8705
- Mean Absolute Error: 5.3929
- R2 Score: 0.8103
-----------------------------------


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.2487
- Mean Absolute Error: 4.1977
- R2 Score: 0.8751
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.5566
- Mean Absolute Error: 4.4155
- R2 Score: 0.8759
-----------------------------------


DecisionTreeRegressor
Model performance for Training set

In [46]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['MdoelName',"R2Score"]).sort_values(by=['R2Score'],ascending=False)

Unnamed: 0,MdoelName,R2Score
2,Ridge,0.875907
0,LinearRegression,0.875863
8,CatBoostRegressor,0.852108
4,RandomForestRegressor,0.845642
6,AdaBoostRegressor,0.835561
7,XGBRegressor,0.811029
1,lasso,0.810282
5,KNeighborsRegressor,0.772127
3,DecisionTreeRegressor,0.730612


In [51]:
Difference=pd.DataFrame({'actual_values':ytest,'predicted_value':y_test_pred,'Difference':ytest-y_test_pred})

In [48]:
Difference

Unnamed: 0,actual_values,predicted_value,Difference
521,91,74.390762,16.609238
737,53,55.530637,-2.530637
740,80,76.404788,3.595212
660,74,76.340790,-2.340790
411,84,87.102284,-3.102284
...,...,...,...
468,77,71.463336,5.536664
935,70,61.108344,8.891656
428,65,60.235512,4.764488
7,40,46.077139,-6.077139
