In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
# Locations of the red and white Wine Quality CSV files; both share one base directory
_uci_wine_quality_dir = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
url_red = _uci_wine_quality_dir + "winequality-red.csv"
url_white = _uci_wine_quality_dir + "winequality-white.csv"

In [3]:
# Read both CSVs with identical settings (';' is passed as the field separator)
red_wine_data, white_wine_data = (
    pd.read_csv(csv_url, sep=';') for csv_url in (url_red, url_white)
)

In [4]:
# Tag every row with its source so red and white stay distinguishable once merged
red_wine_data = red_wine_data.assign(wine_type='red')
white_wine_data = white_wine_data.assign(wine_type='white')

In [5]:
# Combine the red and white wine datasets by stacking their rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source frame keeps its own row labels, so labels repeat in the result
# and label-based operations (.loc, index alignment) can match several rows.
wine_data = pd.concat([red_wine_data, white_wine_data], axis=0, ignore_index=True)

In [6]:
# Data Preprocessing

# The 'quality' column is the prediction target; all other columns are features
y = wine_data['quality']
X = wine_data.drop('quality', axis=1)

In [7]:
# Derived feature: element-wise sum of the fixed and volatile acidity columns
X['total_acidity'] = X['fixed acidity'].add(X['volatile acidity'])

In [8]:
# One-hot encode 'wine_type'; drop_first=True omits the first category's
# indicator column, since it is implied by the remaining one(s)
X = pd.get_dummies(
    data=X,
    columns=['wine_type'],
    drop_first=True,
)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Standardize features: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Instantiate the base Random Forest classifier with a fixed seed.
# No training happens in this cell: the estimator is only constructed, not fitted.
clf = RandomForestClassifier(random_state=42)

In [12]:
# Hyperparameter tuning: try every combination in the grid below with 5-fold
# cross-validation, using all available cores (n_jobs=-1)
param_grid = dict(
    n_estimators=[100],
    max_depth=[None, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)



In [13]:
# Use the best model from grid search.
# best_estimator_ is read from the GridSearchCV object that was fitted on the
# training split in the preceding cell.
best_clf = grid_search.best_estimator_

In [14]:
# Make predictions on the test set.
# X_test here is the array produced by scaler.transform in the scaling cell.
y_pred = best_clf.predict(X_test)

In [15]:
# Model evaluation: overall accuracy, per-class metrics, and the confusion matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
# Some quality classes receive no predictions at all, which makes their
# precision undefined. zero_division=0 reports 0.0 for those entries explicitly
# instead of emitting an undefined-metric warning for each one.
print(classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6653846153846154

Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         6
           4       0.75      0.14      0.24        43
           5       0.68      0.72      0.70       402
           6       0.65      0.75      0.69       597
           7       0.67      0.52      0.58       215
           8       0.86      0.33      0.48        36
           9       0.00      0.00      0.00         1

    accuracy                           0.67      1300
   macro avg       0.52      0.35      0.38      1300
weighted avg       0.67      0.67      0.65      1300


Confusion Matrix:
[[  0   0   2   4   0   0   0]
 [  0   6  19  18   0   0   0]
 [  0   1 290 106   5   0   0]
 [  0   1 111 446  38   1   0]
 [  0   0   2 101 111   1   0]
 [  0   0   0  13  11  12   0]
 [  0   0   0   0   1   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
