In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)

import matplotlib.pyplot as plt

# Seed passed as `random_state` to train_test_split and to every SVC below.
RANDOM_STATE = 42
# Fraction of the rows held out as the test set (`test_size` of train_test_split).
TEST_SIZE = 0.2

In [None]:

# Column names imposed on the CSV, in file order: eleven physico-chemical
# features followed by the quality score used as the prediction target.
cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "quality",
]

# Single place to change where the dataset lives.
# NOTE(review): this still points into a personal Downloads folder; prefer a
# project-relative data directory so the notebook runs on other machines.
DATA_PATH = "~/Downloads/winequality-red.csv"

# The file's own header row is skipped (skiprows=1) and replaced by `cols`.
df = pd.read_csv(DATA_PATH, sep=";", skiprows=1, header=None, names=cols)

# Because the names are forced instead of read from the file, a CSV with a
# different layout loads without error: a wrong separator or too few columns
# pads rows with NaN, and surplus columns are turned into the index. Fail fast
# here rather than deep inside label creation or model fitting.
if not isinstance(df.index, pd.RangeIndex) or df.isna().any().any():
    raise ValueError(
        f"{DATA_PATH!r} does not look like a complete {len(cols)}-column "
        "';'-separated wine-quality file (missing values or surplus columns)."
    )

print("First few rows:")
display(df.head())

print("\nColumns:", df.columns.tolist())
print("\nQuality value counts:")
print(df["quality"].value_counts().sort_index())


def make_labels(y_quality: np.ndarray, mode: str = "3class"):
    """
    Collapse raw wine-quality scores into coarse class labels.

    Parameters
    ----------
    y_quality : array-like of numbers
        Original quality scores; values are cast to int before bucketing.
    mode : {"binary", "3class"}
        "binary": label 1 when the score is >= 7, otherwise 0.
        "3class": label 0 for scores <= 5, 1 for a score of exactly 6,
        and 2 for scores >= 7.

    Returns
    -------
    (labels, names)
        ``labels`` is an integer NumPy array aligned with the input;
        ``names`` is a list of display names indexed by label value.

    Raises
    ------
    ValueError
        If ``mode`` is neither "binary" nor "3class".
    """
    scores = pd.Series(y_quality).astype(int).to_numpy()

    if mode not in ("binary", "3class"):
        raise ValueError("mode must be 'binary' or '3class'")

    if mode == "binary":
        return (scores >= 7).astype(int), ["bad(<=6)", "good(>=7)"]

    # Start every sample in the top bucket, then demote the lower scores.
    labels = np.full(scores.shape, 2)
    labels[scores == 6] = 1
    labels[scores <= 5] = 0
    return labels, ["low(<=5)", "mid(=6)", "high(>=7)"]


# Features are every column except the raw quality score; the score itself is
# bucketed into three classes by make_labels and becomes the target.
X = df.drop(columns=["quality"])
quality_scores = df["quality"].to_numpy()
y, class_names = make_labels(quality_scores, mode="3class")

class_counts = pd.Series(y).value_counts().sort_index()
print("\nClass distribution (0=low, 1=mid, 2=high):")
print(class_counts)
print("Class names:", class_names)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out TEST_SIZE of the rows for final evaluation. `stratify=y` keeps the
# class proportions the same in both partitions.
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)


In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)

def evaluate_model(name, model, X_test, y_test, class_names=None):
    """
    Score an already-fitted classifier on held-out data and print a report.

    Parameters
    ----------
    name : str
        Label used in the printed banner and stored under "model" in the result.
    model : object
        Fitted estimator; only its ``predict(X_test)`` method is called.
    X_test, y_test :
        Held-out features and their true labels.
    class_names : list of str, optional
        Display names forwarded to ``classification_report`` as
        ``target_names``; ``None`` lets the report use the label values.

    Returns
    -------
    dict
        Keys "model", "accuracy", "precision_macro", "recall_macro" and
        "f1_macro". Precision, recall and F1 are macro-averaged, with
        undefined scores reported as 0 (``zero_division=0``).
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_test, predictions, average="macro", zero_division=0
    )
    matrix = confusion_matrix(y_test, predictions)

    summary = {
        "model": name,
        "accuracy": accuracy,
        "precision_macro": macro_precision,
        "recall_macro": macro_recall,
        "f1_macro": macro_f1,
    }

    print(f"\n===== {name} =====")
    # Printed labels are padded so the values line up in one column.
    for label, key in (
        ("Accuracy        :", "accuracy"),
        ("Macro precision :", "precision_macro"),
        ("Macro recall    :", "recall_macro"),
        ("Macro F1-score  :", "f1_macro"),
    ):
        print(label, summary[key])
    print("Confusion matrix:\n", matrix)

    report = classification_report(
        y_test,
        predictions,
        target_names=class_names,
        zero_division=0,
    )
    print("\nClassification report:\n", report)

    return summary

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Baseline: standardise the features, then an RBF-kernel SVC with its other
# hyper-parameters left at their defaults. The scaler sits inside the pipeline,
# so it is fitted on the training rows only.
svm_basic = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="rbf", random_state=RANDOM_STATE)),
    ]
).fit(X_train, y_train)

scores_basic = evaluate_model(
    name="SVM_RBF_basic",
    model=svm_basic,
    X_test=X_test,
    y_test=y_test,
    class_names=class_names,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Unfitted scaler + RBF-SVC pipeline; the grid search fits it itself on every
# cross-validation fold.
svm_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="rbf", random_state=RANDOM_STATE)),
    ]
)

# Keys follow the "<step name>__<parameter>" convention, so these tune the
# C and gamma of the pipeline's "svc" step.
param_grid = dict(
    svc__C=[0.1, 1, 10, 100],
    svc__gamma=["scale", "auto", 0.01, 0.1, 1.0],
)

# 5-fold cross-validation scored with macro F1 (multi-class scoring), using
# all available cores.
grid_search = GridSearchCV(
    svm_pipeline,
    param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV macro F1:", grid_search.best_score_)

# Pipeline with the winning parameter combination, as exposed by the search.
best_svm = grid_search.best_estimator_

scores_tuned = evaluate_model(
    name="SVM_RBF_tuned",
    model=best_svm,
    X_test=X_test,
    y_test=y_test,
    class_names=class_names,
)

In [None]:
# Linear-kernel variant: same scaling step, but the SVC uses kernel="linear".
linear_steps = [
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="linear", random_state=RANDOM_STATE)),
]
svm_linear = Pipeline(steps=linear_steps)
svm_linear.fit(X_train, y_train)

scores_linear = evaluate_model(
    name="SVM_linear_basic",
    model=svm_linear,
    X_test=X_test,
    y_test=y_test,
    class_names=class_names,
)

In [None]:
# RBF SVC with class_weight="balanced", so errors on rarer classes are
# penalised more heavily during training.
balanced_svc = SVC(
    kernel="rbf",
    class_weight="balanced",
    random_state=RANDOM_STATE,
)
svm_balanced = Pipeline(steps=[("scaler", StandardScaler()), ("svc", balanced_svc)])
svm_balanced.fit(X_train, y_train)

scores_balanced = evaluate_model(
    name="SVM_RBF_balanced",
    model=svm_balanced,
    X_test=X_test,
    y_test=y_test,
    class_names=class_names,
)

In [None]:
# One row per model, in the order the models were trained; the columns are the
# keys of the dicts returned by evaluate_model.
results_list = [scores_basic, scores_tuned, scores_linear, scores_balanced]

results_df = pd.DataFrame(data=results_list)
display(results_df)