In [1]:
# Wine Quality Classification using Random Forest and Decision Tree
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report



In [3]:
# 1. Read the dataset
df = pd.read_csv('winequality-red.csv', sep=';')



In [4]:
# 2. Extract Independent (X) and Dependent (y) Variables
X = df.drop('quality', axis=1)
y = df['quality']


In [6]:
# 3. Convert quality into three categories: best, average, poor
def quality_category(q):
	if q >= 7:
		return 'best'
	elif q >= 5:
		return 'average'
	else:
		return 'poor'

y_cat = y.apply(quality_category)


In [7]:
# 4. Split the dataset (75% train, 25% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.25, random_state=42, stratify=y_cat)



In [8]:
# 5. Normalize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [9]:
# 6. Build Random Forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)



In [10]:
# 7. Confusion matrix for Random Forest
print('Random Forest Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_rf))



Random Forest Confusion Matrix:
[[315  15   0]
 [ 19  35   0]
 [ 16   0   0]]


In [11]:
# 7. Confusion matrix for Random Forest
print('Random Forest Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_rf))



Random Forest Confusion Matrix:
[[315  15   0]
 [ 19  35   0]
 [ 16   0   0]]


In [12]:
# 9. Decision Tree Model for comparison
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)

print('Decision Tree Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_dt))

print('\nDecision Tree Classification Report:')
print(classification_report(y_test, y_pred_dt))

Decision Tree Confusion Matrix:
[[280  41   9]
 [ 21  33   0]
 [ 11   1   4]]

Decision Tree Classification Report:
              precision    recall  f1-score   support

     average       0.90      0.85      0.87       330
        best       0.44      0.61      0.51        54
        poor       0.31      0.25      0.28        16

    accuracy                           0.79       400
   macro avg       0.55      0.57      0.55       400
weighted avg       0.81      0.79      0.80       400

