## Model Training

#### Import data and requried packages

In [66]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


In [67]:
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [68]:
df = pd.read_csv('data/stud.csv')

In [69]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [71]:
df.nunique()

gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64

In [72]:
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'race_ethnicity' variable:  ",end=" ")
print(df['race_ethnicity'].unique())

print("Categories in'parental level of education' variable:",end=" " )
print(df['parental_level_of_education'].unique())

print("Categories in 'lunch' variable:     ",end=" " )
print(df['lunch'].unique())

print("Categories in 'test preparation course' variable:     ",end=" " )
print(df['test_preparation_course'].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in'parental level of education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


#### Preparing X and Y variable

In [73]:
X = df.drop(columns='math_score',axis=1)

In [74]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [75]:
y = df['math_score']
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [76]:
df.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')

In [77]:
# Create Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

ct = ColumnTransformer([
        ('scale', StandardScaler(), ['reading_score','writing_score']),
        ('encode', OneHotEncoder(), ['gender','race_ethnicity','parental_level_of_education','lunch','test_preparation_course'])
    ])

In [78]:
X = ct.fit_transform(X)

In [79]:
X.shape

(1000, 19)

In [80]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

## Linear Regression

#### RandomizedSearchCV on a linear regression model

In [81]:
from scipy.stats import uniform

# Initialize the linear regression model
lr = LinearRegression()

# Define the hyperparameter distribution to randomly sample from
param_dist = {
              'fit_intercept': [True, False],
              'copy_X': [True, False],
              'n_jobs': [1, 2, 3, 4, 5],
              'positive': [True, False],
              }

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(lr, param_distributions=param_dist, cv=5, n_iter=50, n_jobs=-1)

# Fit the model with the randomly sampled hyperparameters
random_search.fit(X, y)

# Print the best hyperparameters and resulting score
print("Best hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)



Best hyperparameters:  {'positive': False, 'n_jobs': 5, 'fit_intercept': True, 'copy_X': True}
Best score:  0.8719524902553306


#### Tune hyperparameters in a linear regression model using Gridsearchcv

In [84]:
from sklearn.model_selection import GridSearchCV


# Initialize the linear regression model
lr = LinearRegression()

# Define the hyperparameters to search over
param_grid = {'copy_X' : [True,False],
              'fit_intercept': [True, False],
              'n_jobs': [3, 4, 5],
              'positive': [True, False]
             }

# Initialize the GridSearchCV object
grid_search = GridSearchCV(lr, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model with the hyperparameters specified in param_grid
grid_search.fit(X, y)

# Print the best hyperparameters and resulting score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Train the Linear regression model using the selected values  on the entire training set
best_params = grid_search.best_params_
lr = LinearRegression(**best_params)
lr.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = lr.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r2_score on training set:", r2_square)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = lr.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)
print("r2_score on test set:", r2_square)


Best hyperparameters:  {'copy_X': True, 'fit_intercept': True, 'n_jobs': 4, 'positive': False}
Best score:  0.8719524902553306
RMSE on training set: 5.323050852720514
r2_score on training set: 0.8743172040139593
RMSE on test set: 5.393993869732841
r2_score on test set: 0.8804332983749565


## Ridge Regression

#### GridSearchCV

In [85]:

# Create a range of values for the regularization strength parameter alpha that you want to test
param_grid = {'alpha': np.logspace(-4, 4, 9)}

# Perform k-fold cross-validation on the training set using each of the values of alpha
ridge = Ridge()
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# For each value of alpha, calculate the mean and standard deviation of the cross-validation scores
cv_results = grid_search.cv_results_
for mean_score, std_score, params in zip(cv_results['mean_test_score'], cv_results['std_test_score'], cv_results['params']):
    print(np.sqrt(-mean_score), np.sqrt(std_score), params)

# Train the Ridge regression model using the selected value of alpha on the entire training set
ridge = Ridge(alpha=best_alpha)
ridge.fit(X_train, y_train)

print("=====================")

# Evaluate the model on the training set
y_train_pred = ridge.predict(X_train)
r2_square = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("r2_score on training set:", r2_square)

print("=====================")

# Evaluate the model on the testing set
y_test_pred = ridge.predict(X_test)
r2_square = r2_score(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on te set:", rmse)
print("r2_score on training set:", r2_square)

5.4302736469019 1.4861492972218349 {'alpha': 0.0001}
5.430272964266233 1.4861590204555781 {'alpha': 0.001}
5.430266181206306 1.48625620726002 {'alpha': 0.01}
5.430202647495649 1.4872235514013394 {'alpha': 0.1}
5.429966345580281 1.4964722209253307 {'alpha': 1.0}
5.449773455868539 1.5631928900991339 {'alpha': 10.0}
5.949213543668942 1.8088292394189653 {'alpha': 100.0}
9.518012201615885 2.168149942025443 {'alpha': 1000.0}
13.857136953816484 3.5015818266168544 {'alpha': 10000.0}
RMSE on training set: 5.323324922741655
RMSE on training set: 5.3903870169356365


## Lasso Regression

In [58]:
# define Lasso regression model
lasso = Lasso()

# define hyperparameters to tune
param_grid = {'alpha': [0.1, 1.0, 10.0],
              'max_iter': [1000, 5000, 10000],
              'tol': [0.001, 0.0001, 0.00001],
              'selection': ['cyclic', 'random']}

# perform grid search to find best hyperparameters
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# print best hyperparameters and corresponding score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Train the Ridge regression model using the selected value of alpha on the entire training set
lasso = Ridge(alpha=best_alpha)
lasso.fit(X_train, y_train)

# Evaluate the model on the training set
y_train_pred = lasso.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)

# Evaluate the model on the testing set
y_test_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)

Best hyperparameters: {'alpha': 0.1, 'max_iter': 5000, 'selection': 'random', 'tol': 0.001}
Best score: 0.8666397542239206
RMSE on training set: 5.323324922741655
RMSE on training set: 5.3903870169356365


## KNN regression

In [64]:
# Define the model
knn = KNeighborsRegressor()

# Define the parameter grid to search
param_grid = {'n_neighbors': [3, 5, 7], 'metric': ['euclidean', 'manhattan']}

# Define the grid search object
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
knn = KNeighborsRegressor(n_neighbors=best_params['n_neighbors'], metric=best_params['metric'])
knn.fit(X_train, y_train)


print('Best score:', grid_search.best_score_)

# Evaluate the model on the training set
y_train_pred = knn.predict(X_train)
model_train_r2 = r2_score(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
print("RMSE on training set:", rmse)
print("R_2 score on training set:", model_train_r2)

# Evaluate the model on the testing set
y_test_pred = knn.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print("RMSE on test set:", rmse)



Best score: -49.16339285714285


TypeError: 'numpy.float64' object is not callable