In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, loguniform

In [37]:
# loading the dataset
df_red = pd.read_csv('./data/clean_red_wine.csv')
df_white = pd.read_csv('./data/clean_white_wine.csv')

In [38]:
df_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.532016,0.967994,-1.400957,-0.521653,-0.262309,-0.474408,-0.390729,0.602543,1.335338,-0.620007,-0.967074,5
1,-0.29622,1.982805,-1.400957,0.093048,0.243692,0.899216,0.6338,0.052488,-0.727227,0.152538,-0.591372,5
2,-0.29622,1.306264,-1.194756,-0.170395,0.105692,-0.081944,0.230198,0.162499,-0.328021,-0.040598,-0.591372,5
3,1.708046,-1.399896,1.485849,-0.521653,-0.285309,0.114288,0.416476,0.712554,-0.993364,-0.491249,-0.591372,6
4,-0.532016,0.742481,-1.400957,-0.609467,-0.285309,-0.278176,-0.204451,0.602543,1.335338,-0.620007,-0.967074,5


In [39]:
df_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.195694,-0.097739,0.22619,2.72393,-0.028613,0.642629,0.78427,2.311671,-1.321817,-0.360537,-1.480369,6
1,-0.640298,0.206859,0.054817,-0.908391,0.167471,-1.286476,-0.121816,0.080487,0.711655,0.003191,-0.900805,6
2,1.509395,0.003794,0.568936,0.214134,0.216492,-0.290809,-0.956369,0.476342,0.440526,-0.451469,-0.404035,6
3,0.434549,-0.503869,-0.116556,0.553009,0.60866,0.767088,1.165779,0.656276,-0.033951,-0.815196,-0.569625,6
4,-0.759725,0.409924,-1.487541,0.235314,-0.028613,-0.290809,-0.026439,0.404369,-0.101734,-0.178673,-0.81801,6


# **Train Test Split**

## *Train test split for 'red' wine*

In [40]:
X_red = df_red.drop(columns=['quality'])
y_red = df_red['quality']
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X_red, y_red, random_state=42, test_size=30)

In [41]:
# importing all the models
svc_red = SVC()
knn_red = KNeighborsClassifier()
svc_white = SVC()
knn_white = KNeighborsClassifier()

## *Train test split for 'white' wine*

In [42]:
X_white = df_white.drop(columns=['quality'])
y_white = df_white['quality']
X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(X_white, y_white, random_state=42, test_size=30)

# **Using `SMOTE` to handle the imbalance in the dataset**

In [43]:
smote = SMOTE(random_state=42, k_neighbors=3)

## *SMOTE for 'red' wine*

In [44]:
y_train_red.head()

885     6
1015    6
316     6
208     6
591     5
Name: quality, dtype: int64

In [45]:
X_train_red_res, y_train_red_res = smote.fit_resample(X_train_red, y_train_red)

## *SMOTE for 'white' wine*

In [46]:
X_train_white_res, y_train_white_res = smote.fit_resample(X_train_white, y_train_white)

# **Fitting on models**


## *Fitting*

In [47]:
train_sets = [[X_train_red_res, y_train_red_res],[X_train_white_res, y_train_white_res]] 
test_sets = [[X_test_red, y_test_red],[X_test_white, y_test_white]]
models = [[svc_red, knn_red], [svc_white, knn_white]]
datasets = ['Red Wine', 'White Wine']

In [48]:
def fit_and_predict(train_sets, test_sets, models, datasets):
    for trains, tests, models, dataset in zip(train_sets, test_sets, models, datasets):
        for model in models:
            model.fit(trains[0], trains[1])
            y_pred = model.predict(tests[0])
            print(f'Report for {dataset} using {model}:')
            print(classification_report(tests[1], y_pred))

## *Predicting*

In [49]:
fit_and_predict(train_sets, test_sets, models, datasets)

Report for Red Wine using SVC():
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         2
           5       0.93      0.81      0.87        16
           6       0.67      0.67      0.67         9
           7       0.50      0.33      0.40         3
           8       0.00      0.00      0.00         0

    accuracy                           0.73        30
   macro avg       0.62      0.56      0.59        30
weighted avg       0.81      0.73      0.77        30

Report for Red Wine using KNeighborsClassifier():
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       1.00      0.50      0.67         2
           5       0.85      0.69      0.76        16
           6       0.44      0.44      0.44         9
           7       0.33      0.33      0.33         3
           8       0.00      0.00      0.00         0

    accuracy                           0.57     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Report for White Wine using SVC():
              precision    recall  f1-score   support

           4       0.50      0.50      0.50         2
           5       0.60      0.50      0.55        12
           6       0.42      0.42      0.42        12
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1

    accuracy                           0.40        30
   macro avg       0.30      0.28      0.29        30
weighted avg       0.44      0.40      0.42        30

Report for White Wine using KNeighborsClassifier():
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       1.00      1.00      1.00         2
           5       0.64      0.58      0.61        12
           6       0.50      0.33      0.40        12
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1

    accuracy                           0.43 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Hyperparameter Tuning: RandomizedSearchCV**

In [50]:
estimators = [SVC(random_state=42), KNeighborsClassifier()]

In [51]:
'''
General Guidance on Kernels

Linear kernel

    Good if classes are linearly separable in the feature space.  
      
    Fast, simple, less risk of overfitting.
    
    Often a good baseline kernel.
    
    Works well when you have many features relative to samples.

RBF kernel

    The most commonly used kernel.

    Can model non-linear relationships (curved boundaries).

    Flexible, but tuning C and gamma is crucial.

    Usually the default choice if you don't know the data geometry.
'''

# For my scenario, I will be using Linear and RBF Kernels

"\nGeneral Guidance on Kernels\n\nLinear kernel\n\n    Good if classes are linearly separable in the feature space.  \n      \n    Fast, simple, less risk of overfitting.\n    \n    Often a good baseline kernel.\n    \n    Works well when you have many features relative to samples.\n\nRBF kernel\n\n    The most commonly used kernel.\n\n    Can model non-linear relationships (curved boundaries).\n\n    Flexible, but tuning C and gamma is crucial.\n\n    Usually the default choice if you don't know the data geometry.\n"

In [52]:
param_distributions_svc = {
    'kernel': ['linear', 'rbf'],
    'C': loguniform(1e-4, 1e3), 
    'gamma': loguniform(1e-4, 1e3),
}

In [53]:
param_distributions_knn = {
    'n_neighbors': randint(1,30),  # how many neighbors to consider
    'weights': ['uniform', 'distance'], # uniform = equal weight, distance = closer neighbors matter more
    'p': [1, 2], # distance metric: 1 = Manhattan, 2 = Euclidean
    'leaf_size': randint(10,50), # affects tree efficiency (not accuracy much)
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], # search strategy
}

In [54]:
cv = StratifiedKFold(random_state=42, n_splits=5, shuffle=True)

In [55]:
randomizedSearch_svc = RandomizedSearchCV(
    estimator= estimators[0],
    param_distributions=param_distributions_svc,
    n_iter=25,
    scoring= 'average_precision',
    cv = cv,
    n_jobs=-1,
    random_state=42,
)

In [56]:
randomizedSearch_knn = RandomizedSearchCV(
    estimator= estimators[1],
    param_distributions=param_distributions_knn,
    n_iter=25,
    scoring= 'average_precision',
    cv = cv,
    n_jobs=-1,
    random_state=42,
)

In [57]:
# for red wine
randomizedSearch_svc.fit(X_train_red_res, y_train_red_res)
randomizedSearch_knn.fit(X_train_red_res, y_train_red_res)

Traceback (most recent call last):
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/utils/_response.py", line 204, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of [3 4 5 6 7 8]

Traceback (most recent call last):
  File "/home/root123/.local/lib/python3.10/s

In [58]:
print(f'Red Wine:\nSVC best score: {randomizedSearch_svc.best_score_}')
print(f'SVC best params: {randomizedSearch_svc.best_params_}')
print(f'KNN best score: {randomizedSearch_knn.best_score_}')
print(f'KNN best params: {randomizedSearch_svc.best_params_}')

Red Wine:
SVC best score: nan
SVC best params: {'C': np.float64(0.041858227295469716), 'gamma': np.float64(451.85609510240965), 'kernel': 'linear'}
KNN best score: nan
KNN best params: {'C': np.float64(0.041858227295469716), 'gamma': np.float64(451.85609510240965), 'kernel': 'linear'}


In [59]:
# for white wine
randomizedSearch_svc.fit(X_train_white_res, y_train_white_res)
randomizedSearch_knn.fit(X_train_white_res, y_train_white_res)

Traceback (most recent call last):
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
  File "/home/root123/.local/lib/python3.10/site-packages/sklearn/utils/_response.py", line 204, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of [3 4 5 6 7 8 9]

Traceback (most recent call last):
  File "/home/root123/.local/lib/python3.10

In [60]:
print(f'White Wine:\nSVC best score: {randomizedSearch_svc.best_score_}')
print(f'SVC best params: {randomizedSearch_svc.best_params_}')
print(f'KNN best score: {randomizedSearch_knn.best_score_}')
print(f'KNN best params: {randomizedSearch_svc.best_params_}')

White Wine:
SVC best score: nan
SVC best params: {'C': np.float64(0.041858227295469716), 'gamma': np.float64(451.85609510240965), 'kernel': 'linear'}
KNN best score: nan
KNN best params: {'C': np.float64(0.041858227295469716), 'gamma': np.float64(451.85609510240965), 'kernel': 'linear'}


# **Creating Pipeline**

In [61]:
pipe = Pipeline(steps=[
    ('scale', StandardScaler),
    ('svc', SVC),
    ('knn', KNeighborsClassifier),
])