In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd

In [2]:
data = load_wine()
wine_df = pd.DataFrame(data.data, columns=data.feature_names)

print(wine_df.head(), wine_df.shape, sep='\n')
print("/n")

print("Target Names (Classes):", data.target_names)

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0                  

# Classification Models

In [3]:
x = data.data
y = data.target # type of wine class 0 , 1, 2

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## Decision Tree Classifier

In [4]:
classifier_decision_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
classifier_decision_tree.fit(x_train, y_train)
y_pred_decision_tree = classifier_decision_tree.predict(x_test)

In [5]:
f1_decision_tree = f1_score(y_test, y_pred_decision_tree, average="weighted")

## Random Forest Classifier

In [6]:
classifier_random_forest = RandomForestClassifier(n_estimators=100, random_state=12)
classifier_random_forest.fit(x_train, y_train)
y_pred_random_forest = classifier_random_forest.predict(x_test)

In [7]:
f1_random_forest = f1_score(y_test, y_pred_random_forest, average="weighted")

## F1-Score Comparision


In [8]:
print(f"Decision tree classifier F1 Score: {f1_decision_tree:.4f}")
print(f"Random forest classifier F1 Score: {f1_random_forest:.4f}")

Decision tree classifier F1 Score: 0.9449
Random forest classifier F1 Score: 1.0000


## Hyperparameter Tuning using GridSearchCV, Random Forest Classifier

In [15]:
grid_search = GridSearchCV(
    estimator=classifier_random_forest,
    param_grid={
        "n_estimators": [50, 100, 200, 500],
        "max_depth": [3, 5, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    scoring="f1_weighted",
    cv=4,
)
grid_search.fit(x_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score: {grid_search.best_score_:.4f}")

Best Parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best F1 Score: 0.9790


# Regression Model


In [10]:
x2 = wine_df.drop(columns=["alcohol"]) 
y2 = wine_df["alcohol"]  # "Alcohol" as the target

x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    x2, y2, test_size=0.2, random_state=42
)
print(x_train_2.head())  # Features
print(y_train_2.head())  # Target

     malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
158        1.68  2.70               25.0       98.0           2.80   
137        5.51  2.64               25.0       96.0           1.79   
98         1.07  2.10               18.5       88.0           3.52   
159        1.67  2.64               22.5       89.0           2.60   
38         1.50  2.10               15.5       98.0           2.40   

     flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
158        1.31                  0.53             2.70            13.00  0.57   
137        0.60                  0.63             1.10             5.00  0.82   
98         3.75                  0.24             1.95             4.50  1.04   
159        1.10                  0.52             2.29            11.75  0.57   
38         2.64                  0.28             1.37             3.70  1.18   

     od280/od315_of_diluted_wines  proline  
158                          1.96    660.0  
13

## Decision Tree Regressor

In [11]:
regressor_decision_tree = DecisionTreeRegressor(max_depth=3, random_state=42)
regressor_decision_tree.fit(x_train_2, y_train_2)
y_pred_decision_tree_regressor = regressor_decision_tree.predict(x_test_2)

## Random Forest Regressor

In [12]:
regressor_random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
regressor_random_forest.fit(x_train_2, y_train_2)
y_pred_random_forest_regressor = regressor_random_forest.predict(x_test_2)

## Mean-Squared Error Comparision

In [13]:
mse_dt = mean_squared_error(y_test_2, y_pred_decision_tree_regressor)

mse_rf = mean_squared_error(y_test_2, y_pred_random_forest_regressor)

print(f"Decision Tree Regressor MSE: {mse_dt:.4f}")
print(f"Random Forest Regressor MSE: {mse_rf:.4f}")

print("\n")

if (mse_dt > mse_rf):
    print("mse of decision tree regressor is greater")
else:
    print("Mse of random forest regressor is greater")

Decision Tree Regressor MSE: 0.2913
Random Forest Regressor MSE: 0.1542


mse of decision tree regressor is greater


## Hyperparameter Tuning using RandomizedSearchCV, Random Forest Regressor

In [14]:
random_search = RandomizedSearchCV(
    estimator=regressor_random_forest,
    param_distributions={
        "n_estimators": [50, 100, 200, 300],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
    },
    scoring="neg_mean_squared_error",
    n_iter=50,
    cv=5,
    random_state=42,
)
random_search.fit(x_train_2, y_train_2)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Negative MSE Score: {random_search.best_score_:.4f}")

Best Parameters: {'n_estimators': 50, 'min_samples_split': 5, 'max_features': 'log2', 'max_depth': 10}
Best Negative MSE Score: -0.3027


90 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter