## Model Training

#### Import data and requried packages

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


In [24]:
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [3]:
df = pd.read_csv('data/stud.csv')

In [4]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [6]:
df.nunique()

gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64

In [7]:
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'race_ethnicity' variable:  ",end=" ")
print(df['race_ethnicity'].unique())

print("Categories in'parental level of education' variable:",end=" " )
print(df['parental_level_of_education'].unique())

print("Categories in 'lunch' variable:     ",end=" " )
print(df['lunch'].unique())

print("Categories in 'test preparation course' variable:     ",end=" " )
print(df['test_preparation_course'].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in'parental level of education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


#### Preparing X and Y variable

In [8]:
X = df.drop(columns='math_score',axis=1)

In [9]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [10]:
y = df['math_score']
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [11]:
df.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')

In [12]:
# Create Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct = ColumnTransformer([
        ('scale', StandardScaler(), ['reading_score','writing_score']),
        ('encode', OneHotEncoder(), ['gender','race_ethnicity','parental_level_of_education','lunch','test_preparation_course'])
    ])

In [13]:
X = ct.fit_transform(X)

In [14]:
X.shape

(1000, 19)

In [15]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

## Linear Regression

#### Tune hyperparameters in a linear regression model using Gridsearchcv

In [16]:
from sklearn.model_selection import GridSearchCV


# Initialize the linear regression model
lr = LinearRegression()

# Define the hyperparameters to search over
param_grid = {'copy_X' : [True,False],
              'fit_intercept': [True, False],
              'n_jobs': [3, 4, 5],
              'positive': [True, False]
             }

# Initialize the GridSearchCV object
grid_search = GridSearchCV(lr, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model with the hyperparameters specified in param_grid
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and resulting score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Train the Linear regression model using the selected values  on the entire training set
best_params = grid_search.best_params_
lr = LinearRegression(**best_params)
lr.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = lr.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r2_score on training set:", r2_square)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = lr.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)


Best hyperparameters:  {'copy_X': True, 'fit_intercept': False, 'n_jobs': 3, 'positive': False}
Best score:  0.8685885989218136
RMSE on training set: 5.323050852720514
r2_score on training set: 0.8743172040139593
RMSE on test set: 5.393993869732846
r2_score on test set: 0.8804332983749563


## Ridge Regression

#### GridSearchCV

In [17]:
ridge = Ridge()

# Create a range of values for the regularization strength parameter alpha that you want to test
param_grid = {'alpha': np.logspace(-4, 4, 9)}

# Perform k-fold cross-validation on the training set using each of the values of alpha
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and resulting score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Train the Ridge regression model using the selected value of alpha on the entire training set
best_alpha = grid_search.best_params_['alpha']
ridge = Ridge(alpha=best_alpha)
ridge.fit(X_train, y_train)


print("=====================")

# Evaluate the model on the training set
y_train_pred = ridge.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r2_score on training set:", r2_square)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = ridge.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)

Best hyperparameters:  {'alpha': 1.0}
Best score:  -29.484534514134474
RMSE on training set: 5.323324922741655
r2_score on training set: 0.8743042615212909
RMSE on test set: 5.3903870169356365
r2_score on test set: 0.8805931485028738


## Lasso Regression

In [18]:
# define Lasso regression model
lasso = Lasso()

# define hyperparameters to tune
param_grid = {'alpha': [0.1, 1.0, 10.0],
              'max_iter': [1000, 5000, 10000],
              'tol': [0.001, 0.0001, 0.00001],
              'selection': ['cyclic', 'random']}

# perform grid search to find best hyperparameters
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# print best hyperparameters and corresponding score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Train the Ridge regression model using the selected value of alpha on the entire training set
best_params = grid_search.best_params_
lasso = Lasso(**best_params)
lasso.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = lasso.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r2_score on training set:", r2_square)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = lasso.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)

Best hyperparameters: {'alpha': 0.1, 'max_iter': 5000, 'selection': 'random', 'tol': 0.001}
Best score: 0.8666643475174984
RMSE on training set: 5.396547214732501
r2_score on training set: 0.8708225921449809
RMSE on test set: 5.371112095816639
r2_score on test set: 0.8814455706710169


## KNN regression

In [19]:
# Define the model
knn = KNeighborsRegressor()

# Define the parameter grid to search
param_grid = {'n_neighbors': [3, 5, 7], 'metric': ['euclidean', 'manhattan']}

# Define the grid search object
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
knn = KNeighborsRegressor(**best_params)
knn.fit(X_train, y_train)


#print('Best score:', grid_search.best_score_)

print("=====================")

# Evaluate the model on the training set
y_train_pred = knn.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
model_train_r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r_2 score on training set:", model_train_r2)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = knn.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)


RMSE on training set: 5.907978434597868
r_2 score on training set: 0.8451780952955499
RMSE on test set: 7.140199505587491
r2_score on test set: 0.7904874488383591


## Decision Tree Regression

In [20]:
dt = DecisionTreeRegressor(random_state=42)

# Define the parameter grid to search
param_grid = {
    'max_depth': [2, 5, 10, 15],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_leaf_nodes': [None, 5, 10, 20]
}

# Define the grid search object
grid_search = GridSearchCV(dt, param_grid, cv=5, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(best_params)




{'max_depth': 10, 'max_leaf_nodes': None, 'min_samples_leaf': 8, 'min_samples_split': 2}


In [21]:
# Train the model with the best hyperparameters
dt = DecisionTreeRegressor(**best_params)
dt.fit(X_train, y_train)


print("=====================")

# Evaluate the model on the training set
y_train_pred = dt.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
model_train_r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r_2 score on training set:", model_train_r2)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = dt.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)



RMSE on training set: 5.114615218100608
r_2 score on training set: 0.8839672630615256
RMSE on test set: 6.512374805353404
r2_score on test set: 0.8257117429661665


## Random Forest Regression

In [22]:
# Create a random forest regression model
rf = RandomForestRegressor(random_state=42)

# Define the grid of hyperparameters to search
params_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Perform a grid search of the hyperparameters using 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=params_grid, cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Print the best hyperparameters found
print(grid_search.best_params_)

# Train the model with the best hyperparameters
rf = RandomForestRegressor(**best_params)
rf.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = rf.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
model_train_r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r_2 score on training set:", model_train_r2)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = rf.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)



{'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 1000}
RMSE on training set: 4.018198936499479
r_2 score on training set: 0.9283827778067943
RMSE on test set: 5.917437888243576
r2_score on test set: 0.8561013339398114


## Ada Boost Regression

In [23]:
# Define the AdaBoostRegressor model
ada_boost = AdaBoostRegressor()

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'loss': ['linear', 'square', 'exponential']
}

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=ada_boost, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)


# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
ada_boost = AdaBoostRegressor(**best_params)
ada_boost.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = ada_boost.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
model_train_r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r_2 score on training set:", model_train_r2)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = ada_boost.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)



Best hyperparameters: {'learning_rate': 1.0, 'loss': 'exponential', 'n_estimators': 200}
Best score: 0.8296744933482205
RMSE on training set: 5.6812980691148915
r_2 score on training set: 0.8568307487843236
RMSE on test set: 6.025270978915051
r2_score on test set: 0.8508090364309294


## GradientBoost Regression

In [29]:
# define the gradient boosting regressor model
gb = GradientBoostingRegressor()

# define the hyperparameters to be tuned
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.15],
    'min_samples_split': [2, 4, 6]
}

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(gb, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# print the best hyperparameters and the corresponding score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

# Get the best hyperparameters
best_params = grid_search.best_params_


# Train the model with the best hyperparameters
gb = GradientBoostingRegressor(**best_params)
gb.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = gb.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
model_train_r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r_2 score on training set:", model_train_r2)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = gb.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)


Best Hyperparameters:  {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 6, 'n_estimators': 200}
Best Score:  0.8493044223680141
RMSE on training set: 4.647160680999199
r_2 score on training set: 0.9042078366710554
RMSE on test set: 5.5561629190557
r2_score on test set: 0.8731357386130197


## SVR

In [31]:
from sklearn import svm

# Create SVM regressor object
svr = svm.SVR()

# Define parameter grid
param_grid = {'C': [0.1, 1, 10, 100],
              'epsilon': [0.01, 0.1, 1, 10],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}


# Create grid search object
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Get the best hyperparameters
best_params = grid_search.best_params_


# Train the model with the best hyperparameters
svr = svm.SVR(**best_params)
svr.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = svr.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
model_train_r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r_2 score on training set:", model_train_r2)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = svr.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)


Best hyperparameters:  {'C': 10, 'epsilon': 1, 'kernel': 'linear'}
Best score:  0.8664885252959795
RMSE on training set: 5.341044656242024
r_2 score on training set: 0.8734660628334536
RMSE on test set: 5.340943656915748
r2_score on test set: 0.8827736224683463


### Conclusion

    Based on the metrics , the linear regression model appears to be a reasonably good model for predicting the target variable. It has a training accuracy of 0.88 and a test accuracy of 0.87, which indicates that the model is performing consistently on both the training and test data. The RMSE on the training set (5.32) and the test set (5.39) are also reasonable, indicating that the model's predictions are not too far off from the actual values.
    In conclusion, the linear regression model provides a good starting point for predicting the target variable. However, it is important to continue exploring and refining the model to improve its accuracy and performance.
