In [1]:
# Load Data
import numpy as np
from google.colab import drive

# Mount Google Drive so the CSV stored there is readable from this runtime.
drive.mount('/content/drive')

# Parse the CSV into a float32 ('f4') array, skipping the single header row.
with open('/content/drive/My Drive/Colab Notebooks/data/winequality-white.csv', 'r') as csv_file:
    data = np.genfromtxt(csv_file, dtype='f4', delimiter=',', skip_header=1)

# The last column is used as the target; every column before it is a feature.
X, y = data[:, :-1], data[:, -1]

Mounted at /content/drive


In [2]:
# partition into train and test set at 70/30 ratio, stratified
from sklearn.model_selection import train_test_split

# 70% train / 30% test. Stratifying on y keeps the class proportions the same
# in both splits; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    stratify=y,
    random_state=0,
)

In [3]:
# preprocessing-standardize
# Note: fit with training data only, or else data leakage into testing set!
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean/std from the training split only, then apply the
# same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [4]:
# sanity check data info
# After standardization the training features should have mean ~0 and std ~1
# per column (up to float32 rounding).
feature_means = np.mean(X_train, axis=0)
feature_stds = np.std(X_train, axis=0)

print(f'# features: {scaler.n_features_in_}')
print(f'mean:\n{feature_means}')
print(f'std:\n{feature_stds}')

# features: 11
mean:
[ 2.0395639e-08 -1.2458205e-08  3.6844295e-08 -1.6361719e-08
 -1.4034825e-07 -8.1200024e-09  1.5179362e-08 -1.1792261e-07
 -1.8778592e-09 -4.0226180e-08 -1.7474523e-08]
std:
[1.0000025  1.0000012  0.99999774 0.9999992  0.99999505 0.9999994
 0.99999964 1.0000044  1.0000024  1.         0.99999857]


In [5]:
# models
from sklearn.svm import SVC
# Fit one SVC per kernel on the standardized training split, in this order:
# clf_a -> linear, clf_b -> polynomial, clf_c -> RBF (Gaussian).
# All other SVC hyper-parameters are left at their library defaults.
clf_a, clf_b, clf_c = (
    SVC(kernel=kernel_name).fit(X_train, y_train)
    for kernel_name in ('linear', 'poly', 'rbf')
)

In [6]:
# report
# Note: zero_division=0 reports 0 (instead of warning) when a metric is 0/0, e.g. precision for a class the model never predicts
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Evaluate each fitted SVC on the held-out test split.
# For every model we print, in order: a title, the confusion matrix
# (rows = true class, columns = predicted class), and the per-class
# precision/recall/F1 report.
# zero_division=0 makes the report show 0 rather than emit a warning when a
# metric's denominator is zero (e.g. a class that is never predicted).
for kernel_title, model in (
    ('Linear Kernel', clf_a),
    ('Poly Kernel', clf_b),
    ('Gaussian Kernel', clf_c),
):
    # Predict once per model and reuse the result for both metrics; running
    # predict() separately for each metric doubles the inference cost.
    y_pred = model.predict(X_test)
    print(kernel_title)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, zero_division=0))

Liner Kernel
[[  0   0   1   5   0   0   0]
 [  0   0  27  22   0   0   0]
 [  0   0 242 195   0   0   0]
 [  0   0 120 540   0   0   0]
 [  0   0   6 258   0   0   0]
 [  0   0   1  52   0   0   0]
 [  0   0   0   1   0   0   0]]
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         6
         4.0       0.00      0.00      0.00        49
         5.0       0.61      0.55      0.58       437
         6.0       0.50      0.82      0.62       660
         7.0       0.00      0.00      0.00       264
         8.0       0.00      0.00      0.00        53
         9.0       0.00      0.00      0.00         1

    accuracy                           0.53      1470
   macro avg       0.16      0.20      0.17      1470
weighted avg       0.41      0.53      0.45      1470

Poly Kernel
[[  0   1   1   4   0   0   0]
 [  0   9  25  15   0   0   0]
 [  0   5 199 230   2   1   0]
 [  0   3  90 550  17   0   0]
 [  0   0   9 215  38   2   0]
 [  0

In [7]:
# Finished with Google Drive: flush and unmount the drive mounted in the first cell.
drive.flush_and_unmount()