In [15]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

# Load the Wine dataset
data = pd.read_csv('/content/drive/MyDrive/datasets/winemag-data_first150k.csv')
data

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude
...,...,...,...,...,...,...,...,...,...,...,...
150925,150925,Italy,Many people feel Fiano represents southern Ita...,,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Feudi di San Gregorio
150926,150926,France,"Offers an intriguing nose with ginger, lime an...",Cuvée Prestige,91,27.0,Champagne,Champagne,,Champagne Blend,H.Germain
150927,150927,Italy,This classic example comes from a cru vineyard...,Terre di Dora,91,20.0,Southern Italy,Fiano di Avellino,,White Blend,Terredora
150928,150928,France,"A perfect salmon shade, with scents of peaches...",Grand Brut Rosé,90,52.0,Champagne,Champagne,,Champagne Blend,Gosset


In [14]:

X = data.data
y = data.target

# Split the dataset for classification
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Classification Models
# Train Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train_clf, y_train_clf)
dt_clf_pred = dt_clf.predict(X_test_clf)
dt_f1 = f1_score(y_test_clf, dt_clf_pred, average='weighted')

# Train Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_clf, y_train_clf)
rf_clf_pred = rf_clf.predict(X_test_clf)
rf_f1 = f1_score(y_test_clf, rf_clf_pred, average='weighted')

print("Decision Tree Classifier F1 Score:", dt_f1)
print("Random Forest Classifier F1 Score:", rf_f1)

# 2. Hyperparameter Tuning for Random Forest Classifier
param_grid_clf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search_clf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_clf, cv=5, scoring='f1_weighted')
grid_search_clf.fit(X_train_clf, y_train_clf)
print("Best Hyperparameters for Random Forest Classifier:", grid_search_clf.best_params_)

# Split the dataset for regression
# For simplicity, create a synthetic regression target
y_reg = X[:, 0] + X[:, 1] * 0.5 + np.random.normal(scale=0.1, size=X.shape[0])
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# 3. Regression Models
# Train Decision Tree Regressor
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train_reg, y_train_reg)
dt_reg_pred = dt_reg.predict(X_test_reg)
dt_mse = mean_squared_error(y_test_reg, dt_reg_pred)
dt_r2 = r2_score(y_test_reg, dt_reg_pred)

# Train Random Forest Regressor
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train_reg, y_train_reg)
rf_reg_pred = rf_reg.predict(X_test_reg)
rf_mse = mean_squared_error(y_test_reg, rf_reg_pred)
rf_r2 = r2_score(y_test_reg, rf_reg_pred)

print("Decision Tree Regressor MSE:", dt_mse, "R2:", dt_r2)
print("Random Forest Regressor MSE:", rf_mse, "R2:", rf_r2)

# Hyperparameter Tuning for Random Forest Regressor
param_dist_reg = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4]
}
random_search_reg = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_dist_reg, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
random_search_reg.fit(X_train_reg, y_train_reg)
print("Best Hyperparameters for Random Forest Regressor:", random_search_reg.best_params_)


Decision Tree Classifier F1 Score: 0.9439974457215836
Random Forest Classifier F1 Score: 1.0
Best Hyperparameters for Random Forest Classifier: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Decision Tree Regressor MSE: 0.054398487537164 R2: 0.9460333014070909
Random Forest Regressor MSE: 0.03396513184452039 R2: 0.9663044669822963
Best Hyperparameters for Random Forest Regressor: {'n_estimators': 50, 'min_samples_leaf': 1, 'max_features': 'sqrt'}


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklea