# This file contains all the code related to model training

In [1]:
# Make Google Drive available inside the Colab runtime so files stored
# there can be read by later cells.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


**Importing basic libraries**

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Importing model training related libraries**

In [7]:
!pip install catboost
!pip install xgboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [8]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [4]:
# Read the dataset CSV from the mounted Google Drive into a DataFrame
DATASET_PATH = '/content/drive/MyDrive/stud.csv'
df = pd.read_csv(DATASET_PATH)

In [9]:
# Preview the first rows to sanity-check the loaded columns and values
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Preparing X and Y

In [11]:
# Feature frame: every column except the regression target `math_score`
X = df.drop(columns=['math_score'])

In [12]:
# Target vector: the math score that the models learn to predict
Y = df.loc[:, 'math_score']

**Creating a Column Transformer with 2 types of transformers (one-hot encoding for categorical columns, standard scaling for numeric columns)**

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# One transformer per column group.
categorial_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Apply one-hot encoding to the categorical columns and standard scaling to
# the numeric columns; the outputs are concatenated in this order.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", categorial_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [14]:
# Encode and scale the features. The input is rebuilt from `df` instead of
# being read back from `X`, so re-running this cell never feeds the
# already-transformed array into the preprocessor again.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split further down, so the fitted statistics include the test rows;
# consider fitting on the training split only.
X = preprocessor.fit_transform(df.drop(columns=['math_score']))

In [15]:
# Inspect the dimensions of the transformed feature matrix: (rows, columns)
X.shape

(1000, 19)

Splitting the dataset into training and testing sets

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=101,
)

**Creating an Evaluate Function to give all metrics after model Training**

In [18]:
def evaluate_model(true, predicted):
  """Compute regression metrics for a set of predictions.

  Args:
    true: Ground-truth target values.
    predicted: Predicted values aligned element-wise with ``true``.

  Returns:
    Tuple ``(mae, r2_sq, rmse, mse)``: mean absolute error, R2 score,
    root mean squared error and mean squared error, in that order.
  """
  mae = mean_absolute_error(true, predicted)
  r2_sq = r2_score(true, predicted)
  # MSE must come from mean_squared_error; using mean_absolute_error here
  # would make the fourth return value a duplicate of MAE.
  mse = mean_squared_error(true, predicted)
  # RMSE is derived from the MSE computed above.
  rmse = np.sqrt(mse)
  return mae, r2_sq, rmse, mse

**Training models**

In [21]:
# Candidate regressors, keyed by the display name used in the printed report
# and in the results table. The scikit-learn tree models are seeded so that
# re-running this cell reproduces the same scores.
models = {
    "Linear Regression" : LinearRegression(),
    "Ridge" : Ridge(),
    "K - Neighbours Regression" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(random_state=101),
    "Random Forest" : RandomForestRegressor(random_state=101),
    "XGBoost" : XGBRegressor(),
    "CatBoost" : CatBoostRegressor(verbose=False),
}

# Parallel lists filled in evaluation order: model names and their test-set
# R2 scores. They are reset here so re-running the cell does not duplicate rows.
model_list = []
r2_sq_list = []

for model_name, model in models.items():
  # Fit on the training split, then predict on both splits so train and
  # test metrics can be compared for over-fitting.
  model.fit(X_train, Y_train)

  y_train_pred = model.predict(X_train)
  y_test_pred = model.predict(X_test)

  model_train_mae, model_train_r2_sq, model_train_rmse, model_train_mse = evaluate_model(Y_train, y_train_pred)
  model_test_mae, model_test_r2_sq, model_test_rmse, model_test_mse = evaluate_model(Y_test, y_test_pred)

  print(model_name)
  model_list.append(model_name)

  print('Model performance for Training set')
  print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
  print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
  print("- R2 Score: {:.4f}".format(model_train_r2_sq))

  print('----------------------------------')

  print('Model performance for Test set')
  print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
  print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
  print("- R2 Score: {:.4f}".format(model_test_r2_sq))
  # Only the test-set R2 is kept for the ranking table built afterwards.
  r2_sq_list.append(model_test_r2_sq)

  print('='*35)
  print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 5.3315
- Mean Absolute Error: 4.2426
- R2 Score: 0.8806
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3578
- Mean Absolute Error: 4.2919
- R2 Score: 0.8539


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.3317
- Mean Absolute Error: 4.2429
- R2 Score: 0.8805
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3527
- Mean Absolute Error: 4.2862
- R2 Score: 0.8542


K - Neighbours Regression
Model performance for Training set
- Root Mean Squared Error: 5.9667
- Mean Absolute Error: 4.7060
- R2 Score: 0.8504
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.4097
- Mean Absolute Error: 5.2750
- R2 Score: 0.7909


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.4330
- Mean Absolute Error: 0.0325
- R2 Score: 0.9992
-----------

In [22]:
# Rank the models by their test-set R2 score, best first
results_df = pd.DataFrame(
    list(zip(model_list, r2_sq_list)),
    columns=['Model Name', 'R2_Score'],
)
results_df.sort_values(by='R2_Score', ascending=False)

Unnamed: 0,Model Name,R2_Score
1,Ridge,0.854184
0,Linear Regression,0.853903
6,CatBoost,0.843504
4,Random Forest,0.827015
5,XGBoost,0.799802
2,K - Neighbours Regression,0.790909
3,Decision Tree,0.697669


In [24]:
# Refit a plain linear regression on the training split and score it on the
# held-out test split.
lin_model = LinearRegression(fit_intercept=True)
lin_model.fit(X_train, Y_train)
y_pred = lin_model.predict(X_test)

# The reported figure is the R2 score scaled to a percentage, not a
# classification accuracy.
score = 100 * r2_score(Y_test, y_pred)
print(" Accuracy of the model is %.2f" % score)

 Accuracy of the model is 85.39


In [25]:
# Side-by-side comparison of actual and predicted math scores on the test
# split, with the signed error (actual minus predicted) per row.
residuals = Y_test - y_pred
pred_df = pd.DataFrame(
    {
        'Actual Value': Y_test,
        'Predicted Value': y_pred,
        'Difference': residuals,
    }
)
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
545,78,85.514769,-7.514769
298,40,49.716993,-9.716993
109,70,63.774297,6.225703
837,75,68.319564,6.680436
194,69,72.230529,-3.230529
...,...,...,...
68,61,60.853802,0.146198
449,81,79.414174,1.585826
715,76,75.080095,0.919905
793,89,86.916285,2.083715
