<a href="https://colab.research.google.com/github/Rayers-Ranjitkar/2461847_Rayers_Rental/blob/main/Worksheet_8_Ensemble_Methods_Hyperparameter_Tuning_Rayers_Ranjitkar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Ensemble Methods and Hyperparameter Tuning**

**1. Implementing Classification Model**

In [20]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


# Load the wine dataset (multiclass classification) and pull out the
# feature matrix and the class labels.
wine = load_wine()
X, y = wine.data, wine.target

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the train/test membership, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# --- Decision tree -----------------------------------------------------------
# random_state makes the grown tree deterministic (it fixes the choice between
# candidate splits that are equally good). fit() returns the estimator itself.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# "macro" = unweighted mean of the per-class F1 scores (the target is multiclass).
dt_f1 = f1_score(y_test, dt_pred, average="macro")


# --- Random forest -----------------------------------------------------------
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_f1 = f1_score(y_test, rf_pred, average="macro")


# --- Comparison --------------------------------------------------------------
# Report both test-set scores, rounded to four decimals.
for label, score in (
    ("Decision Tree F1 Score:", dt_f1),
    ("Random Forest F1 Score:", rf_f1),
):
    print(label, round(score, 4))


Decision Tree F1 Score: 0.9425
Random Forest F1 Score: 1.0


The decision tree classified most of the test samples correctly, but a single tree can slightly overfit the training data or miss more complex patterns.

The random forest classified all of the test samples correctly. It trains many trees, each one considering a random subset of the features at every split, and combines their predictions at the end, which helps reduce the overfitting and the errors made by an individual decision tree.

<br>

**2. Hyperparameter tuning**

In [21]:
# importing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


# Search space: 3 * 4 * 3 = 36 hyperparameter combinations.
param_grid = {
    # number of trees in the forest; more trees give a more stable average
    "n_estimators": [50, 100, 200],
    # maximum tree depth; None means no limit (nodes keep splitting until
    # they are pure or too small to split)
    "max_depth": [None, 5, 10, 20],
    # minimum number of samples a node must contain before it may be split;
    # larger values stop splitting earlier and so limit overfitting
    "min_samples_split": [2, 5, 10],
}

# Model to tune. The fixed seed keeps every candidate forest reproducible.
rf_base = RandomForestClassifier(random_state=42)

# Exhaustive search: every combination is scored with 5-fold cross-validation
# using macro-averaged F1, and the mean over the folds decides the winner.
grid = GridSearchCV(
    rf_base,
    param_grid,
    cv=5,
    scoring="f1_macro",
)

# Only the training split is used; the test split stays untouched.
grid.fit(X_train, y_train)

# Best combination and its mean cross-validated score.
print("Best Params:", grid.best_params_)
print("Best CV F1 Score:", round(grid.best_score_, 4))




Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best CV F1 Score: 0.9788


<br>

**3. Implementing Regression Model**

In [22]:
# importing
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Reload the wine dataset for the regression section.
wine = load_wine()

# Show the column names followed by the raw feature matrix, one per line.
print(wine.feature_names, wine.data, sep="\n")


['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]


In [23]:
# Wrap the feature matrix in a DataFrame so columns can be selected by name.
df_wine = pd.DataFrame(wine.data, columns=wine.feature_names)

# Regression task: predict "alcohol" from the remaining measurements.
# NOTE: X, y and the four split arrays below replace the classification
# versions created earlier in the notebook.
X = df_wine.drop(columns=["alcohol"]).values
y = df_wine["alcohol"].values

# Same 80/20 split settings as the classification section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# --- Decision tree regressor -------------------------------------------------
# fit() returns the estimator itself, so it can be chained with the constructor.
dt_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_pred = dt_reg.predict(X_test)

# R2 on the held-out test split.
dt_r2 = r2_score(y_test, dt_pred)
print("Decision Tree R2 Score:", round(dt_r2, 4))


# --- Random forest regressor -------------------------------------------------
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_pred = rf_reg.predict(X_test)

rf_r2 = r2_score(y_test, rf_pred)
print("Random Forest R2 Score:", round(rf_r2, 4))

Decision Tree R2 Score: 0.4775
Random Forest R2 Score: 0.7416


Identifying three hyperparameters for the Random Forest regressor and performing hyperparameter tuning

In [24]:
# Candidate values for three random-forest hyperparameters
# (5 * 5 * 4 = 100 possible combinations).
param_dist = {
    "n_estimators": [50, 100, 200, 300, 500],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 5, 10, 15],
}

# Model to tune. Note that rf_base is rebound here from the classifier used
# in the grid-search cell to a regressor.
rf_base = RandomForestRegressor(random_state=42)

# Randomized search: n_iter=20 combinations are sampled from param_dist and
# each is scored by 5-fold cross-validated R2. random_state fixes which
# combinations get sampled.
random_search = RandomizedSearchCV(
    rf_base,
    param_dist,
    n_iter=20,
    cv=5,
    scoring="r2",
    random_state=42,
)

# Only the training split is used for the search.
random_search.fit(X_train, y_train)

# Best sampled combination and its mean cross-validated R2.
print("Best Params:", random_search.best_params_)
print("Best CV R2 Score:", round(random_search.best_score_, 4))


Best Params: {'n_estimators': 500, 'min_samples_split': 15, 'max_depth': 5}
Best CV R2 Score: 0.5119
