In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    f1_score
)

In [2]:
# Read the wine development set from the datasets folder (path is relative to the notebook)

data_path = "../datasets/wine_development(in).csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,wine_id
0,5.8,0.31,0.32,4.5,0.024,28.0,94.0,0.98906,3.25,0.52,13.7,7,white,1198
1,6.3,0.13,0.42,1.1,0.043,63.0,146.0,0.99066,3.13,0.72,11.2,7,white,3409
2,8.2,0.3,0.44,12.4,0.043,52.0,154.0,0.99452,3.04,0.33,12.0,6,white,4789
3,9.4,0.17,0.55,1.6,0.049,14.0,94.0,0.9949,3.02,0.61,10.3,6,white,3054
4,5.3,0.3,0.16,4.2,0.029,37.0,100.0,0.9905,3.3,0.36,11.8,8,white,2812


## 1. Data Preparation

In [3]:
# Count the null entries in every column (isnull is an alias of isna)
null_mask = df.isnull()
missing_values = null_mask.sum()

missing_values

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
color                   0
wine_id                 0
dtype: int64

In [4]:
# Names of the columns stored as 64-bit integers or floats
numeric_part = df.select_dtypes(include=["int64", "float64"])
numeric_columns = numeric_part.columns

numeric_columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'wine_id'],
      dtype='object')

In [9]:
# Drop wine_id: it is only a technical key and carries no information for training the model.
# errors="ignore" keeps this cell idempotent. Because `df` is reassigned onto itself,
# re-running the cell (or running cells out of order) would otherwise raise a KeyError
# once the column has already been removed.

df = df.drop(columns=["wine_id"], errors="ignore")

### 1.2 Check for balance/imbalance of class labels

In [8]:
# Absolute number of samples per wine colour
target = "color"
label_column = df.loc[:, target]
class_counts = label_column.value_counts()
class_counts


color
white    3134
red      1023
Name: count, dtype: int64

In [6]:
# Relative class frequencies, expressed in percent
class_percentages = df[target].value_counts(normalize=True).mul(100)
class_percentages

color
white    75.390907
red      24.609093
Name: proportion, dtype: float64

### 1.3 Train-Test Split

In [13]:
# Separate the label column (y) from the predictors (X)

target = "color"

y = df[target]
X = df.drop(columns=target)

# Hold out 20% of the rows for testing; stratifying on y keeps the
# red/white proportions the same in both partitions, and the fixed seed
# makes the split reproducible.

split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Sanity check: (rows, columns) of each partition
(X_train.shape, X_test.shape)


((3325, 12), (832, 12))

## 2. Model Building with Pipeline

I build a Pipeline (standardization followed by an SVM) and tune its hyperparameters with GridSearchCV, using a stratified 5-fold cross-validation scheme.

In [17]:
# Stratified, shuffled 5-fold cross-validation with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Two-step pipeline: standardise the features, then fit an SVM classifier.
# The "svm" step name is what the "svm__" prefixes in the grid refer to.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("svm", SVC())])

# Settings searched for every kernel family
shared_grid = {
    "svm__C": [0.1, 1, 10, 100],
    "svm__class_weight": [None, "balanced"],
}

# One sub-grid per kernel, so kernel-specific parameters are only combined
# with the kernel that actually uses them.
param_grid = [
    # Linear SVM
    {**shared_grid, "svm__kernel": ["linear"]},
    # RBF (Gaussian) SVM
    {
        **shared_grid,
        "svm__kernel": ["rbf"],
        "svm__gamma": ["scale", 0.01, 0.1, 1],
    },
    # Polynomial SVM
    {
        **shared_grid,
        "svm__kernel": ["poly"],
        "svm__degree": [2, 3, 4],
        "svm__coef0": [0, 1],
        "svm__gamma": ["scale"],  # single starting value; extend if needed
    },
]

grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="f1_macro",  # unweighted mean of the per-class F1 scores
    cv=cv,
    n_jobs=-1,
    refit=True,  # after the search, refit the best configuration on all of X_train
    verbose=1,
)


## 3. Model Training

In [18]:
# Run the full search on the training partition only
grid.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
search_summary = (
    ("Best params:", grid.best_params_),
    ("Best CV score (mean F1):", grid.best_score_),
)
for label, value in search_summary:
    print(label, value)

Fitting 5 folds for each of 88 candidates, totalling 440 fits
Best params: {'svm__C': 1, 'svm__class_weight': None, 'svm__coef0': 1, 'svm__degree': 2, 'svm__gamma': 'scale', 'svm__kernel': 'poly'}
Best CV score (mean F1): 0.9943093677927208


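The best-performing configuration was a polynomial SVM of degree 2 (C = 1, coef0 = 1), with a mean cross-validated macro-F1 of about 0.994. This suggests that a decision boundary with quadratic terms, i.e. pairwise interactions between chemical properties (e.g. alcohol × acidity, sugar × density), separates red and white wines almost perfectly. Note that the grid search only reports the winning configuration: to conclude that wine color is not linearly separable in the original feature space, or to identify which interactions are decisive, the scores of the linear and RBF candidates in `grid.cv_results_` would have to be compared.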

In [19]:
# Evaluation on the held-out test data, using the refit best estimator

y_pred = grid.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
# Binary F1 with "red" treated as the positive class
print("Test F1:", f1_score(y_test, y_pred, pos_label="red"))
# Rows are true labels, columns are predicted labels, in sorted label order
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
# The grid search was scored with macro-averaged F1, so report the same
# metric here; the binary "red" F1 above is not comparable to the CV score.
print("Test F1 (macro):", f1_score(y_test, y_pred, average="macro"))

best_model = grid.best_estimator_


Test Accuracy: 0.9963942307692307
Test F1: 0.9926650366748166

Confusion matrix:
 [[203   2]
 [  1 626]]

Classification report:
               precision    recall  f1-score   support

         red       1.00      0.99      0.99       205
       white       1.00      1.00      1.00       627

    accuracy                           1.00       832
   macro avg       1.00      0.99      1.00       832
weighted avg       1.00      1.00      1.00       832

