In [1]:
#Importing Data and required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [2]:
#Read the dataset
df = pd.read_csv('/home/Rahul/Desktop/End_to_End_ML_Projects/Notebook/Data/StudentsPerformance.csv')

In [3]:
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
#Preparing the variables (X,y) here prediction is math score
X = df.drop(columns = 'math score', axis =1)
y = df['math score']

In [5]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [6]:
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math score, dtype: int64

In [7]:
# Creation of Column transformer for transforming num_feature in lower(standard values) and categroical with lables
num_features = X.select_dtypes(exclude = 'object').columns
cat_features = X.select_dtypes(include = 'object').columns

num_transformer = StandardScaler()
cat_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', cat_transformer, cat_features,),
        ('StandardScaler', num_transformer, num_features)
    ]
)


In [8]:
print(cat_features)

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course'],
      dtype='object')


In [9]:
#Train test split of data for model training
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 45)

In [10]:
#Transform the data with the help of the preprocessor
X_train_scaled= preprocessor.fit_transform(X_train)
X_test_scaled= preprocessor.transform(X_test)

In [18]:
#Creation of evaluation function for all metrics
def evaluate_model(true, predicted):
    #r2_score = r2_score(true, predicted), 
    mse = mean_squared_error(true, predicted), 
    mae = mean_absolute_error(true, predicted), 
    rmse = root_mean_squared_error(true, predicted)
    return  mse, mae, rmse

In [19]:
#For testing all regression model in one code create a dictionary of models 
model = {
    "linear regressor" : LinearRegression(),
    "lasso" : Lasso() ,
    'Ridge': Ridge() ,
    'K-Neighbours regressor': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'XGBReressor': XGBRegressor(),
    'CatBoosting regressor': CatBoostRegressor(verbose = False),
    'AdaBoost regressor': AdaBoostRegressor(),           
    "Random_forest" : RandomForestRegressor(),    
}

model_list = []
rmse_list = []
for name, model in model.items():
    model.fit(X_train_scaled, y_train)
    
    #Make prediction
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)
    
    #Evaluate the train and test dataset
    
    model_train_mse, model_train_mae, model_train_rmse = evaluate_model(y_train, y_train_pred)
    model_test_mse, model_test_mae, model_test_rmse = evaluate_model(y_test, y_test_pred)

    print(f"Model: {name}")
    model_list.append(name)
          

    print('Model performance on training dataset')
    #print('-R2_score : {:.4f}'.format(model_train_r2_score))
    print('-Mean squared error: ' , {model_train_mse})
    print('-Mean Absolute error: ', {model_train_mae})
    print('-Root Mean Squared Error:', {model_train_rmse})

    print('-------------------------------------------------------------')
    
    print('Model performance on test dataset')
    #print('-R2_score : {:.4f}'.format(model_test_r2_score))
    print('-Mean squared error:' , {model_test_mse})
    print('-Mean Absolute error: ', {model_test_mae})
    print('-Root Mean Squared Error:', {model_test_rmse})
    rmse_list.append(model_test_rmse)


    print('+' * 35)
    print('+' * 35)
    

Model: linear regressor
Model performance on training dataset
-Mean squared error:  {(28.218672956006717,)}
-Mean Absolute error:  {(4.215589211261674,)}
-Root Mean Squared Error: {5.3121250885127616}
-------------------------------------------------------------
Model performance on test dataset
-Mean squared error: {(29.314085103916,)}
-Mean Absolute error:  {(4.371625150159658,)}
-Root Mean Squared Error: {5.414248341544374}
+++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++
Model: lasso
Model performance on training dataset
-Mean squared error:  {(42.94281017112817,)}
-Mean Absolute error:  {(5.14503259980304,)}
-Root Mean Squared Error: {6.553076389843794}
-------------------------------------------------------------
Model performance on test dataset
-Mean squared error: {(41.884374140733165,)}
-Mean Absolute error:  {(5.2444099442945795,)}
-Root Mean Squared Error: {6.471813821544402}
+++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++
Mode

In [20]:
pd.DataFrame(list(zip(model_list, rmse_list)),columns = ['Model Name', 'RMSE_Score']).sort_values(by = ['RMSE_Score'], ascending = False)

Unnamed: 0,Model Name,RMSE_Score
4,Decision Tree,8.502059
3,K-Neighbours regressor,6.905404
5,XGBReressor,6.546818
1,lasso,6.471814
7,AdaBoost regressor,6.244229
8,Random_forest,6.1898
6,CatBoosting regressor,6.024371
0,linear regressor,5.414248
2,Ridge,5.412581


In [23]:
pd.DataFrame({'Actual Value' : y_test, 'Predicted Value':y_test_pred, 'Difference': y_test-y_test_pred})

Unnamed: 0,Actual Value,Predicted Value,Difference
726,65,65.700000,-0.700000
243,59,56.090000,2.910000
342,69,63.680000,5.320000
976,60,63.760000,-3.760000
919,91,94.670000,-3.670000
...,...,...,...
458,100,97.163333,2.836667
169,67,55.940000,11.060000
297,71,78.060000,-7.060000
10,58,56.680000,1.320000
