# Step 5. Modelling and Hyperparameter Tuning

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from datetime import datetime, date
from scipy.stats import randint
import warnings  
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy")

## Data

In [2]:
df = pd.read_csv('preprocessed_collegedata.csv')
print(df.head())

   student_count  awards_per_value  awards_per_state_value  \
0           4051                14                    18.8   
1          11502                20                    18.8   
2            322                29                    17.8   
3           5696                20                    18.8   
4           5356                11                    18.8   

   awards_per_natl_value  exp_award_value  exp_award_state_value  \
0                   21.5         105331.0                  75743   
1                   21.5         136546.0                  75743   
2                   22.5          58414.0                  92268   
3                   21.5          64418.0                  75743   
4                   21.5         132407.0                  75743   

   exp_award_natl_value  ft_pct  fte_value  aid_value  ...  grad_150_value  \
0                 66436    93.8       3906     7142.0  ...              29   
1                 66436    72.7       2157     6088.0  ...    

## Train/Test Split Data

In [3]:
X = df.drop(columns=['awards_per_value'])
y = df['awards_per_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

X_train.shape, X_test.shape

((3038, 21), (760, 21))

## Scaled X_train, X_test data

In [4]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train/test several Models:

### Define Models.

In [5]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Elastic Net': ElasticNet(),
    'SVR': SVR()
}


**(a). Using unscaled data.**

In [6]:
# Train and test the models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # Calculate RMSE
    rmse = np.sqrt(mse)
    results[name] = rmse

# Results
for name, rmse in results.items():
    print(f'{name}: RMSE = {rmse}')

Linear Regression: RMSE = 5.359441840192864
Ridge Regression: RMSE = 5.3595327871102185
Lasso Regression: RMSE = 5.384823298535275
Decision Tree: RMSE = 6.848357467305573
Random Forest: RMSE = 4.905253132048711
Gradient Boosting: RMSE = 4.975539903715877
Elastic Net: RMSE = 5.386298905036575
SVR: RMSE = 5.968143509928486


**(b). Using scaled data.**

In [7]:
# Train and test the models
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    # Calculate RMSE
    rmse = np.sqrt(mse)
    results[name] = rmse

# Results
for name, rmse in results.items():
    print(f'{name}: RMSE = {rmse}')

Linear Regression: RMSE = 5.3594418401929405
Ridge Regression: RMSE = 5.35945079345077
Lasso Regression: RMSE = 5.767405839321374
Decision Tree: RMSE = 7.076555064361993
Random Forest: RMSE = 4.871606227605235
Gradient Boosting: RMSE = 4.987242358245183
Elastic Net: RMSE = 5.66861080535132
SVR: RMSE = 5.021728838208683


**- Random Forest model has the best performance while Gradient Boosting is the second best.**

## Random Forest Regression

In [8]:
rf = RandomForestRegressor(n_estimators = 300, max_features = 'sqrt', max_depth = 5, random_state = 18)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse_1 = mean_squared_error(y_test, y_pred)
rmse = mse**.5

print(mse, rmse)

25.21776052449673 5.021728838208683


### GridSearchCV

**(a). Create/Train the model.**

In [9]:
# Define Grid 
grid = { 
    'n_estimators': [200,300,400],
    'max_features': ['sqrt','log2'],
    'max_depth' : [3,4,5,6,7,8,9],
    'random_state' : [21]
}
# show start time
print(datetime.now())
# Grid Search function
CV_rfr = GridSearchCV(estimator=RandomForestRegressor(), param_grid=grid, cv= 5)
CV_rfr.fit(X_train, y_train)
# show end time
print(datetime.now())

2024-06-27 22:11:44.575011
2024-06-27 22:21:31.501016


**- The model is expensive, as we see it took about 11 minutes to run.**

In [10]:
# Print the best parameters
print(f"Best parameters found by GridSearchCV: ", CV_rfr.best_params_)
print(f"Best score from GridSearchCV: ", CV_rfr.best_score_)

Best parameters found by GridSearchCV:  {'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 300, 'random_state': 21}
Best score from GridSearchCV:  0.4341585257433283


**(b). Evaluate the model.**

In [11]:
# Evaluate the best model on the test set
best_grid_model = CV_rfr.best_estimator_
y_pred_grid = best_grid_model.predict(X_test)

# Calculate RMSE
rmse_grid = np.sqrt(mean_squared_error(y_test, y_pred_grid))

# Calculate R^2 score
r2_grid = r2_score(y_test, y_pred_grid)

# Evaluation metrics
print("Test RMSE for GridSearchCV: ", rmse_grid)
print("Test R^2 score for GridSearchCV: ", r2_grid)

Test RMSE for GridSearchCV:  4.860469992848627
Test R^2 score for GridSearchCV:  0.44701645091445996


In [12]:
# Create and train model using the above best parameters
rf = RandomForestRegressor(n_estimators = 300, max_features = 'sqrt', max_depth = 9, random_state = 21)
rf.fit(X_train, y_train)
# Predict on test data
y_pred = rf.predict(X_test)
# Compute mean squared error
mse = mean_squared_error(y_test, y_pred)
# Print results
print(mse**.5)

4.860469992848627


## RandomSearchCV

**(a). Train/test the model.**

In [13]:
# Define parameters
random_grid = { 
    'n_estimators': randint(200, 600),
    'max_features': ['sqrt', 'log2'],
    'max_depth': randint(3, 10),
    'random_state': [21]
}
# Show start time
print("Start time:", datetime.now())

# Perform random search
r_search = RandomizedSearchCV(estimator=RandomForestRegressor(), 
                              param_distributions=random_grid, n_iter=100, cv=5, verbose=2, random_state=21, n_jobs=-1)
r_search.fit(X_train, y_train)

# Show end time
print("End time:", datetime.now())

# Print the best parameters
print("Best parameters found: ", r_search.best_params_)

# Print the best score
print("Best score: ", r_search.best_score_)

Start time: 2024-06-27 22:21:37.103165
Fitting 5 folds for each of 100 candidates, totalling 500 fits
End time: 2024-06-27 22:28:36.350822
Best parameters found:  {'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 393, 'random_state': 21}
Best score:  0.43379214697641777


**(b). Evaluate the model.**

In [14]:
# Evaluate the best model on the test set
best_model = r_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Test RMSE: ", rmse)
print("Test R^2 score: ", r2)

Test RMSE:  4.8578266080104875
Test R^2 score:  0.4476177717097276
