In [26]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [27]:
data = load_wine()

print("*** Dataset Description ***")
print(data.DESCR)

print("\n*** Feature Names ***")
print(data.feature_names)

print("\n*** Target Names ***")
print(data.target_names)

print("\n*** Data Shape ***")
print(f"Features Shape: {data.data.shape}")
print(f"Target Shape: {data.target.shape}")

print("\n*** First 5 Rows of Features ***")
print(data.data[:5])

print("\n****First 5 Target Values***")
print(data.target[:5])

*** Dataset Description ***
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

:Number of Instances: 178
:Number of Attributes: 13 numeric, predictive attributes and the class
:Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
    - class:
        - class_0
        - class_1
        - class_2

:Summary Statistics:

                                Min   Max   Mean     SD
Alcohol:                      11.0  14.8    13.0   0.8
Malic Acid:                   0.74  5.80    2.34  1.12
Ash:                          1.36  3.23    2.36  0.27
Alcalinity of Ash:            10.6  30.0    19.5   3.3
Magnesium:                    70.0 162.0    99.7  14.3
Total Phenols:                0.98  3.88    2.29  0.63
Flavanoids:               

# Classification Models

In [28]:
x = data.data
y = data.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## Decision Tree Classifier

In [29]:
classifier_decision_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
classifier_decision_tree.fit(x_train, y_train)
y_pred_decision_tree = classifier_decision_tree.predict(x_test)

In [30]:
f1_decision_tree = f1_score(y_test, y_pred_decision_tree, average="weighted")

## Random Forest Classifier

In [31]:
classifier_random_forest = RandomForestClassifier(n_estimators=100, random_state=12)
classifier_random_forest.fit(x_train, y_train)
y_pred_random_forest = classifier_random_forest.predict(x_test)

In [32]:
f1_random_forest = f1_score(y_test, y_pred_random_forest, average="weighted")

## F1-Score Comparision


In [33]:
print(f"Decision tree classifier F1 Score: {f1_decision_tree:.4f}")
print(f"Random forest classifier F1 Score: {f1_random_forest:.4f}")

Decision tree classifier F1 Score: 0.9449
Random forest classifier F1 Score: 1.0000


## Hyperparameter Tuning using GridSearchCV, Random Forest Classifier

In [34]:
grid_search = GridSearchCV(
    estimator=classifier_random_forest,
    param_grid={
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "min_samples_split": [2, 5, 10],
    },
    scoring="f1_weighted",
    cv=5,
)
grid_search.fit(x_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1 Score: {grid_search.best_score_:.4f}")

Best Parameters: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 Score: 0.9783


# Regression Model


In [35]:
x2 = data.data
y2 = data.data[:, 0]

x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    x2, y2, test_size=0.2, random_state=42
)

## Decision Tree Regressor

In [36]:
regressor_decision_tree = DecisionTreeRegressor(max_depth=3, random_state=42)
regressor_decision_tree.fit(x_train_2, y_train_2)
y_pred_decision_tree_regressor = regressor_decision_tree.predict(x_test_2)

## Random Forest Regressor

In [37]:
regressor_random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
regressor_random_forest.fit(x_train_2, y_train_2)
y_pred_random_forest_regressor = regressor_random_forest.predict(x_test_2)

## Mean-Squared Error Comparision

In [38]:
mse_dt = mean_squared_error(y_test_reg, y_pred_dt_reg)

mse_rf = mean_squared_error(y_test_reg, y_pred_rf_reg)

print(f"Decision Tree Regressor MSE: {mse_dt:.4f}")
print(f"Random Forest Regressor MSE: {mse_rf:.4f}")

Decision Tree Regressor MSE: 0.0104
Random Forest Regressor MSE: 0.0011


## Hyperparameter Tuning using GridSearchCV, Random Forest Regressor

In [39]:
random_search = RandomizedSearchCV(
    estimator=regressor_random_forest,
    param_distributions={
        "n_estimators": [50, 100, 200, 300],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
    },
    scoring="neg_mean_squared_error",
    n_iter=50,
    cv=5,
    random_state=42,
)
random_search.fit(x_train_2, y_train_2)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Negative MSE Score: {random_search.best_score_:.4f}")

90 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\USER\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter

Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': None}
Best Negative MSE Score: -0.0886
