Submitted by: Sameer Dahal

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from scipy.stats import randint

3. Exercise: Ensemble Methods and Hyperparameter Tuning

Task 1

In [2]:
# Task 1: classification on the Wine dataset.
# Fit one decision tree and one random forest on the same split, then
# compare them by macro-averaged F1 on the held-out test set.

# Feature matrix and class labels
wine = load_wine()
X, y = wine.data, wine.target

# 80/20 hold-out split, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit both models on the training portion (fixed seeds for repeatable runs)
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42, n_estimators=200)
dt_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred_dt = dt_clf.predict(X_test)
y_pred_rf = rf_clf.predict(X_test)

# average="macro" takes the unweighted mean of the per-class F1 scores
dt_f1 = f1_score(y_test, y_pred_dt, average="macro")
rf_f1 = f1_score(y_test, y_pred_rf, average="macro")

# Headline scores
print("=== TASK 1: CLASSIFICATION RESULTS (F1 macro) ===")
print("Decision Tree Classifier F1 (macro):", round(dt_f1, 4))
print("Random Forest Classifier  F1 (macro):", round(rf_f1, 4))

# Per-class precision / recall / F1 for each model
print("\n--- Classification Report: Decision Tree ---\n")
print(classification_report(y_test, y_pred_dt))

print("\n--- Classification Report: Random Forest ---\n")
print(classification_report(y_test, y_pred_rf))

# One row per model, best macro F1 first
results_task1 = pd.DataFrame(
    {
        "Model": ["Decision Tree Classifier", "Random Forest Classifier"],
        "F1 (macro)": [dt_f1, rf_f1],
    }
)
results_task1 = results_task1.sort_values("F1 (macro)", ascending=False)

print("\n--- Summary Table ---")
display(results_task1)

=== TASK 1: CLASSIFICATION RESULTS (F1 macro) ===
Decision Tree Classifier F1 (macro): 0.9457
Random Forest Classifier  F1 (macro): 1.0

--- Classification Report: Decision Tree ---

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.88      1.00      0.93        14
           2       1.00      0.90      0.95        10

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36


--- Classification Report: Random Forest ---

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36


--- Summary Table ---

Unnamed: 0,Model,F1 (macro)
1,Random Forest Classifier,1.0
0,Decision Tree Classifier,0.945741


Task 2

In [3]:
# Task 2: hyperparameter tuning on the Wine dataset.
# Tune three Random Forest Classifier hyperparameters (n_estimators,
# max_depth, min_samples_split) with an exhaustive GridSearchCV.

# Feature matrix and class labels
wine = load_wine()
X, y = wine.data, wine.target

# 80/20 hold-out split, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Search space: 3 x 4 x 3 = 36 candidate settings
param_grid = dict(
    n_estimators=[100, 200, 400],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Base estimator handed to the search; only the grid parameters vary
rf_clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored by macro F1, using all CPU cores
grid = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=5,
    scoring="f1_macro",
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Score the winning configuration on the untouched test set
best_rf = grid.best_estimator_
y_pred_best = best_rf.predict(X_test)
best_f1 = f1_score(y_test, y_pred_best, average="macro")

print("=== TASK 2: GRIDSEARCHCV RESULTS (Random Forest Classifier) ===")
print("Best Parameters:", grid.best_params_)
print("Best CV F1 (macro):", round(grid.best_score_, 4))
print("Test F1 (macro) with Best Model:", round(best_f1, 4))

print("\n--- Classification Report: Tuned Random Forest ---\n")
print(classification_report(y_test, y_pred_best))

# Single-row summary: cross-validated score next to the test score
results_task2 = pd.DataFrame(
    {
        "Model": ["Random Forest Classifier (Tuned)"],
        "Best CV F1 (macro)": [grid.best_score_],
        "Test F1 (macro)": [best_f1],
    }
)

print("\n--- Summary Table ---")
display(results_task2)

=== TASK 2: GRIDSEARCHCV RESULTS (Random Forest Classifier) ===
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best CV F1 (macro): 0.9863
Test F1 (macro) with Best Model: 1.0

--- Classification Report: Tuned Random Forest ---

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36


--- Summary Table ---


Unnamed: 0,Model,Best CV F1 (macro),Test F1 (macro)
0,Random Forest Classifier (Tuned),0.98632,1.0


Task 3

In [4]:
# Task 3: regression on the Wine dataset.
# Fit a Decision Tree Regressor and a Random Forest Regressor, then tune
# three Random Forest hyperparameters with RandomizedSearchCV.

wine = load_wine()

# Regression target: column 0 of the feature matrix ("alcohol").
# That column is dropped from the inputs so the target cannot leak into X.
y_reg = wine.data[:, 0]
X_reg = np.delete(wine.data, 0, axis=1)

# 80/20 hold-out split (no stratification for a continuous target)
X_train, X_test, y_train, y_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# --- Baseline 1: single decision tree ---
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)
pred_dt = dt_reg.predict(X_test)

dt_mse, dt_mae, dt_r2 = (
    metric(y_test, pred_dt)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# --- Baseline 2: random forest with 300 trees, otherwise default settings ---
rf_reg = RandomForestRegressor(random_state=42, n_estimators=300)
rf_reg.fit(X_train, y_train)
pred_rf = rf_reg.predict(X_test)

rf_mse, rf_mae, rf_r2 = (
    metric(y_test, pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("=== TASK 3: REGRESSION BASELINE RESULTS ===")
print("\nDecision Tree Regressor:")
print("MSE:", round(dt_mse, 4), "| MAE:", round(dt_mae, 4), "| R2:", round(dt_r2, 4))

print("\nRandom Forest Regressor (Baseline):")
print("MSE:", round(rf_mse, 4), "| MAE:", round(rf_mae, 4), "| R2:", round(rf_r2, 4))

# --- Randomized search over three random forest hyperparameters ---
# scipy's randint(low, high) samples integers from [low, high)
param_dist = dict(
    n_estimators=randint(100, 600),
    max_depth=randint(2, 30),
    min_samples_split=randint(2, 15),
)

# 30 sampled configurations, 5-fold CV each. The scorer is the negated MSE,
# so a larger (closer to zero) score is better.
rand_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    random_state=42,
)
rand_search.fit(X_train, y_train)

# Score the winning configuration on the untouched test set
best_rf_reg = rand_search.best_estimator_
pred_best = best_rf_reg.predict(X_test)

best_mse, best_mae, best_r2 = (
    metric(y_test, pred_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\n=== TASK 3: RANDOMIZEDSEARCHCV RESULTS (Random Forest Regressor) ===")
print("Best Parameters:", rand_search.best_params_)
print("Best CV Score (neg MSE):", round(rand_search.best_score_, 4))

print("\nTuned Random Forest Regressor (Test):")
print("MSE:", round(best_mse, 4), "| MAE:", round(best_mae, 4), "| R2:", round(best_r2, 4))

# One row per model, lowest test MSE first
results_task3 = pd.DataFrame(
    {
        "Model": [
            "Decision Tree Regressor",
            "Random Forest Regressor (Baseline)",
            "Random Forest Regressor (Tuned)",
        ],
        "MSE": [dt_mse, rf_mse, best_mse],
        "MAE": [dt_mae, rf_mae, best_mae],
        "R2": [dt_r2, rf_r2, best_r2],
    }
)
results_task3 = results_task3.sort_values("MSE")

print("\n--- Summary Table ---")
display(results_task3)

=== TASK 3: REGRESSION BASELINE RESULTS ===

Decision Tree Regressor:
MSE: 0.312 | MAE: 0.4383 | R2: 0.4775

Random Forest Regressor (Baseline):
MSE: 0.152 | MAE: 0.3194 | R2: 0.7453

=== TASK 3: RANDOMIZEDSEARCHCV RESULTS (Random Forest Regressor) ===
Best Parameters: {'max_depth': 3, 'min_samples_split': 9, 'n_estimators': 591}
Best CV Score (neg MSE): -0.3122

Tuned Random Forest Regressor (Test):
MSE: 0.1564 | MAE: 0.315 | R2: 0.7381

--- Summary Table ---


Unnamed: 0,Model,MSE,MAE,R2
1,Random Forest Regressor (Baseline),0.15204,0.319381,0.745341
2,Random Forest Regressor (Tuned),0.15637,0.315028,0.738089
0,Decision Tree Regressor,0.311972,0.438333,0.477465
