In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("data/winequality-white.csv", sep=";")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


## Variables selection

In [3]:
X = data.iloc[:,:-1]
X.shape

(4898, 11)

In [4]:
y = data.iloc[:,-1]
y.shape

(4898,)

In [5]:
# convert to binary problem
y_bin = np.where(y<6, 0, 1)

## Variables standardization

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
X_std = StandardScaler().fit_transform(X)

## Split the dataset

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y_bin, test_size=0.25, random_state=0)

In [10]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3673, 11), (1225, 11), (3673,), (1225,))

# Train baseline model for baseline metrics

In [11]:
from sklearn import dummy, metrics 

strategies = ['stratified', 'most_frequent', 'prior', 'uniform']
constant_value = 1
dummy_score = 0.0

for strategy in strategies:
    dum_cls = dummy.DummyClassifier(strategy=strategy, constant=constant_value, random_state=0)
    dum_cls.fit(X_train, y_train)

    d_acc1 = dum_cls.score(X_test, y_test)
    y_pred = dum_cls.predict(X_test)
    d_acc2 = metrics.accuracy_score(y_test, y_pred)
    print(f"strategy: {strategy.ljust(13)} -> Accuracy={d_acc1:.3f} | {d_acc2:.3f}")
    
    if d_acc1 > dummy_score:
        dummy_score = d_acc1
        
print(f"\nLarger Accuracy: {dummy_score:.3f}")

strategy: stratified    -> Accuracy=0.529 | 0.529
strategy: most_frequent -> Accuracy=0.640 | 0.640
strategy: prior         -> Accuracy=0.640 | 0.640
strategy: uniform       -> Accuracy=0.505 | 0.505

Larger Accuracy: 0.640


# SVM Classification

In [12]:
from sklearn.svm import LinearSVC

In [13]:
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X_train, y_train)



LinearSVC(random_state=0, tol=1e-05)

In [14]:
print(clf.coef_)
print(clf.intercept_)

[[ 0.05336003 -0.24924474 -0.01626841  0.3917895  -0.01024214  0.06639033
  -0.02208839 -0.42408706  0.09468534  0.07908131  0.24508045]]
[0.36282117]


In [15]:
y_pred = clf.predict(X_test)

In [16]:
print(f"Accuracy: {clf.score(X_test, y_test):.3f} | {metrics.accuracy_score(y_test, y_pred):.3f}")

Accuracy: 0.736 | 0.736


## Conclusion

the 0.736 score prove that this model learned (as the best Dummy Classifier gets 0.64)