In [1]:
import numpy as np

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, mean_squared_error, r2_score

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
# Load the scikit-learn wine dataset and unpack the feature matrix and the
# class labels. `wine` itself stays in scope for later cells.
wine = load_wine()
X, y_class = wine.data, wine.target

1. Implement Classification Models:



*   Train a Decision Tree Classifier and a Random Forest Classifier using scikit-learn.
*   Compare the models based on their F1 scores.




In [3]:
# Hold out 20% of the samples for testing, stratified on the class labels,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_class,
    test_size=0.2,
    stratify=y_class,
    random_state=42,
)

# Baseline models: default hyperparameters, seeded for repeatable results.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

dt_pred, rf_pred = dt_clf.predict(X_test), rf_clf.predict(X_test)

# Score both models on the held-out split with macro-averaged F1.
dt_f1, rf_f1 = (
    f1_score(y_test, predicted, average="macro")
    for predicted in (dt_pred, rf_pred)
)

print("=== Classification (Baseline) ===")
print(f"Decision Tree F1 (macro):  {dt_f1:.4f}")
print(f"Random Forest F1 (macro):  {rf_f1:.4f}")

=== Classification (Baseline) ===
Decision Tree F1 (macro):  0.9457
Random Forest F1 (macro):  1.0000


2. Hyperparameter Tuning:


*   Identify three hyperparameters of the Random Forest Classifier.
*   Perform hyperparameter tuning using GridSearchCV to optimize these parameters.

*   Take hints from the scikit-learn documentation to guide the implementation.



In [4]:
# Search space over three Random Forest hyperparameters
# (3 x 4 x 3 = 36 candidate combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    max_features=["sqrt", "log2", None],
)

# Exhaustive 5-fold cross-validated search on the training split,
# ranked by macro F1 and run on all available cores.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=0,
).fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_rf_clf = grid.best_estimator_
best_pred = best_rf_clf.predict(X_test)
best_f1 = f1_score(y_test, best_pred, average="macro")

print("\n=== Classification (GridSearchCV Tuning) ===")
print("Best Params:", grid.best_params_)
print(f"Best CV F1 (macro): {grid.best_score_:.4f}")
print(f"Test F1 (macro):    {best_f1:.4f}")


=== Classification (GridSearchCV Tuning) ===
Best Params: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 50}
Best CV F1 (macro): 0.9863
Test F1 (macro):    1.0000


3. Implement Regression Model:


*   Train a Decision Tree Regressor and a Random Forest Regressor using scikit-learn.
*   Identify three hyperparameters of the Random Forest Regressor and perform hyperparameter tuning using RandomizedSearchCV to optimize these parameters.




In [7]:
# Regression task: predict the "alcohol" column from all remaining columns.
feature_names = wine.feature_names
alcohol_idx = feature_names.index("alcohol")

# Predictors are X without the alcohol column; the target is that column.
X_reg = np.delete(X, alcohol_idx, axis=1)
y_reg = X[:, alcohol_idx]

# 80/20 split with a fixed seed.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Baseline regressors with default hyperparameters.
dt_reg = DecisionTreeRegressor(random_state=42).fit(Xr_train, yr_train)
rf_reg = RandomForestRegressor(random_state=42).fit(Xr_train, yr_train)

dt_reg_pred, rf_reg_pred = dt_reg.predict(Xr_test), rf_reg.predict(Xr_test)

# Test-split metrics: MSE, its square root (RMSE), and R^2 for each model.
dt_mse, rf_mse = (
    mean_squared_error(yr_test, predicted)
    for predicted in (dt_reg_pred, rf_reg_pred)
)
dt_rmse, rf_rmse = np.sqrt(dt_mse), np.sqrt(rf_mse)
dt_r2, rf_r2 = (
    r2_score(yr_test, predicted)
    for predicted in (dt_reg_pred, rf_reg_pred)
)

print("\n=== Regression (Baseline) ===")
print(f"Decision Tree RMSE: {dt_rmse:.4f} | R2: {dt_r2:.4f}")
print(f"Random Forest RMSE: {rf_rmse:.4f} | R2: {rf_r2:.4f}")


=== Regression (Baseline) ===
Decision Tree RMSE: 0.5585 | R2: 0.4775
Random Forest RMSE: 0.3928 | R2: 0.7416


#### RandomizedSearchCV: choose 3 parameters for tuning
##### - n_estimators
##### - max_features
##### - min_samples_leaf

In [8]:
# Candidate values for the three tuned hyperparameters
# (5 x 3 x 5 = 75 combinations, of which 20 are sampled).
param_dist = dict(
    n_estimators=[50, 100, 200, 400, 600],
    max_features=["sqrt", "log2", None],
    min_samples_leaf=[1, 2, 4, 6, 8],
)

# Randomized 5-fold cross-validated search on the training split,
# scored by negated MSE and seeded so the sampled candidates are repeatable.
rand_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_dist,
    n_iter=20,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(Xr_train, yr_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_rf_reg = rand_search.best_estimator_
y_pred = best_rf_reg.predict(Xr_test)

mse = mean_squared_error(yr_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(yr_test, y_pred)

print("Best Parameters:", rand_search.best_params_)
print("RMSE:", rmse)
print("R2 Score:", r2)

Best Parameters: {'n_estimators': 50, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
RMSE: 0.401892209152874
R2 Score: 0.7294679027458708
