In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
df = pd.read_csv("WineQT.csv")
df


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


## Data Cleaning

In [4]:
print(df.isnull().sum())

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64


## Model

In [6]:
imputer = SimpleImputer(strategy="mean")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
scaler = StandardScaler()
X = df_imputed.drop('quality', axis=1)  
y = df_imputed['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [7]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [8]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6443


In [13]:
precision = precision_score(y_test, y_pred, average=None, labels=np.unique(y), zero_division=1)
recall = recall_score(y_test, y_pred, average=None, labels=np.unique(y), zero_division=1)
f1 = f1_score(y_test, y_pred, average=None, labels=np.unique(y), zero_division=1)

In [18]:
print("\nPrecision for each class:")
print(precision)
print("\nRecall for each class:")
print(recall)
print("\nF1-Score for each class:")
print(f1)
f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=1)
print(f"\nF1-Score (Macro Average): {f1_macro:.4f}")


Precision for each class:
[1.         1.         0.66666667 0.61038961 0.6875     1.        ]

Recall for each class:
[1.         0.         0.72727273 0.64383562 0.53658537 0.25      ]

F1-Score for each class:
[1.         0.         0.69565217 0.62666667 0.60273973 0.4       ]

F1-Score (Macro Average): 0.4650


In [20]:
misclassified_indices = np.where(y_pred != y_test)[0]

# Display 5 misclassified samples
print("\n5 Misclassified Samples:")
for i in range(5):
    idx = misclassified_indices[i]
    print(f"True label: {y_test.iloc[idx]}, Predicted label: {y_pred[idx]}")
    print(X_test.iloc[idx])  # Show the feature values for the misclassified sample
    print("\n")


5 Misclassified Samples:
True label: 6.0, Predicted label: 5.0
fixed acidity             7.80000
volatile acidity          0.48000
citric acid               0.68000
residual sugar            1.70000
chlorides                 0.41500
free sulfur dioxide      14.00000
total sulfur dioxide     32.00000
density                   0.99656
pH                        3.09000
sulphates                 1.06000
alcohol                   9.10000
Id                      754.00000
Name: 538, dtype: float64


True label: 6.0, Predicted label: 5.0
fixed acidity            12.500
volatile acidity          0.600
citric acid               0.490
residual sugar            4.300
chlorides                 0.100
free sulfur dioxide       5.000
total sulfur dioxide     14.000
density                   1.001
pH                        3.250
sulphates                 0.740
alcohol                  11.900
Id                      516.000
Name: 367, dtype: float64


True label: 8.0, Predicted label: 6.0
fixed acidit