In [None]:
# ==========================================================
# 💖🍷 WINE QUALITY CLASSIFIER – Binary Classification
# ==========================================================
# Author: Tenika Powell | GitHub: Nikkilabesf
# Goal: Predict if a wine is "Good" (>=7) or "Not Good" (<7)
import warnings
from math import pi

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

# Use seaborn's "whitegrid" theme for every figure drawn from here on.
sns.set_style("whitegrid")

# Red-wine quality table from the UCI repository; the file is
# semicolon-delimited, so the delimiter must be given explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, delimiter=";")
print("✅ Data loaded successfully. Shape:", df.shape)

# Binarise the target: quality >= 7 becomes 1 ("Good"), everything else 0.
# `pop` removes the raw "quality" column in place while handing back its
# values, so the original score cannot leak into the feature matrix.
df["label"] = (df.pop("quality") >= 7).astype(int)
print("\nClass Distribution (0 = Not Good, 1 = Good):")
print(df["label"].value_counts())

# Engineered features.  A small epsilon is added to every denominator so a
# zero-valued measurement cannot trigger a division by zero.
eps = 1e-6
df["acidity_ratio"] = df["fixed acidity"].div(df["volatile acidity"] + eps)
df["sugar_sulphate_ratio"] = df["residual sugar"].div(df["sulphates"] + eps)
df["alcohol_density_ratio"] = df["alcohol"].div(df["density"] + eps)
# Plain element-wise product of pH and alcohol (interaction term).
df["ph_alcohol_interact"] = df["pH"].mul(df["alcohol"])

# Separate the binary target from the predictors.
y = df["label"]
X = df.drop(columns="label")

# Hold out 20% of the rows for testing.  Stratifying on `y` keeps the
# class proportions the same in both partitions; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the standardisation statistics from the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample with SMOTE.  Only the training split is resampled; the test
# split is left untouched.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_scaled, y_train)
print("\n✅ Balanced classes after SMOTE:", np.bincount(y_res))

# --- KNN: hyperparameter search ---------------------------------------------
# SMOTE is placed INSIDE the cross-validated pipeline.  Searching on data
# that was oversampled up front lets synthetic points, interpolated from a
# validation fold's own samples, appear in the training folds.  That inflates
# the CV score and favours very small neighbourhoods.  With the sampler as a
# pipeline step, each split resamples its training folds only and is scored
# on untouched, real samples.
knn = KNeighborsClassifier()
knn_pipeline = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("knn", knn),
])
# Keys carry the "knn__" prefix so they are routed to the classifier step.
param_grid = {
    'knn__n_neighbors': range(3, 21, 2),   # odd k from 3 to 19
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['minkowski'],
    'knn__p': [1, 2],                      # 1 = Manhattan, 2 = Euclidean
}
# Shuffled, seeded stratified folds so the search is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_knn = GridSearchCV(knn_pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
# Fit on the scaled but NOT pre-resampled training data; the pipeline
# performs the oversampling itself during each fit.
grid_knn.fit(X_train_scaled, y_train)

# The refit pipeline's final step is a KNeighborsClassifier trained on the
# SMOTE-resampled full training split, so `best_knn` can be used directly
# with `predict` / `predict_proba`.
best_knn = grid_knn.best_estimator_.named_steps["knn"]
# Strip the pipeline step prefix so the report shows plain KNN parameter names.
best_knn_params = {
    name.split("__", 1)[1]: value
    for name, value in grid_knn.best_params_.items()
}
print("\n🎯 Best KNN Params:", best_knn_params)
print("🏆 Best CV Accuracy:", grid_knn.best_score_)

# Score the tuned KNN model on the held-out (scaled) test split.
y_pred_knn = best_knn.predict(X_test_scaled)
knn_acc = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)

print(f"\n✅ KNN Test Accuracy: {knn_acc:.3f}")
# Per-class precision / recall / F1 alongside the overall accuracy.
print("\nClassification Report (KNN):\n", knn_report)

# --- XGBoost: hyperparameter search -----------------------------------------
# `use_label_encoder` is intentionally not passed: the option is deprecated
# and has no effect in recent XGBoost releases.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
# SMOTE runs as a pipeline step so that, in every CV split, synthetic
# samples are generated from the training folds only.  Oversampling before
# the search would leak information about validation samples into training
# and overstate the CV score.
xgb_pipeline = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("xgb", xgb),
])
# Keys carry the "xgb__" prefix so they are routed to the classifier step.
# 3 * 3 * 3 * 2 * 2 = 108 candidate settings.
xgb_params = {
    'xgb__n_estimators': [200, 400, 600],
    'xgb__max_depth': [4, 6, 8],
    'xgb__learning_rate': [0.03, 0.05, 0.1],
    'xgb__subsample': [0.8, 1],
    'xgb__colsample_bytree': [0.8, 1],
}

grid_xgb = GridSearchCV(xgb_pipeline, xgb_params, scoring='accuracy', cv=cv, n_jobs=-1, verbose=1)
# Fit on the scaled but NOT pre-resampled training data; the pipeline
# performs the oversampling itself during each fit.
grid_xgb.fit(X_train_scaled, y_train)

# The refit pipeline's final step is an XGBClassifier trained on the
# SMOTE-resampled full training split, so `best_xgb` can be used directly
# with `predict` / `predict_proba`.
best_xgb = grid_xgb.best_estimator_.named_steps["xgb"]
# Strip the pipeline step prefix so the report shows plain XGBoost names.
best_xgb_params = {
    name.split("__", 1)[1]: value
    for name, value in grid_xgb.best_params_.items()
}
print("\n⚡ Best XGBoost Params:", best_xgb_params)
print("🏆 Best CV Accuracy:", grid_xgb.best_score_)

# Score the tuned XGBoost model on the held-out (scaled) test split.
y_pred_xgb = best_xgb.predict(X_test_scaled)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)

print(f"\n⚡ XGBoost Test Accuracy: {xgb_acc:.3f}")
# Per-class precision / recall / F1 alongside the overall accuracy.
print("\nClassification Report (XGBoost):\n", xgb_report)

# ROC analysis on the test split, one model at a time.  Column 1 of
# `predict_proba` is the predicted probability of class 1 ("Good"); the
# thresholds returned by `roc_curve` are not needed and are discarded.

# KNN
y_prob_knn = best_knn.predict_proba(X_test_scaled)[:, 1]
fpr_knn, tpr_knn, _ = roc_curve(y_test, y_prob_knn)
roc_auc_knn = auc(fpr_knn, tpr_knn)

# XGBoost
y_prob_xgb = best_xgb.predict_proba(X_test_scaled)[:, 1]
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)



✅ Data loaded successfully. Shape: (1599, 12)

Class Distribution (0 = Not Good, 1 = Good):
label
0    1382
1     217
Name: count, dtype: int64

✅ Balanced classes after SMOTE: [1105 1105]

🎯 Best KNN Params: {'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
🏆 Best CV Accuracy: 0.9176470588235294

✅ KNN Test Accuracy: 0.856

Classification Report (KNN):
               precision    recall  f1-score   support

           0       0.96      0.87      0.91       277
           1       0.48      0.77      0.59        43

    accuracy                           0.86       320
   macro avg       0.72      0.82      0.75       320
weighted avg       0.90      0.86      0.87       320

Fitting 5 folds for each of 108 candidates, totalling 540 fits
