In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Single seed for the notebook; it is also passed as `random_state` to the
# classifiers trained in the model cells.
RANDOM_SEED = 42
# Seeds NumPy's legacy global random number generator.
np.random.seed(RANDOM_SEED)

In [3]:
# Default location of the raw dataset, relative to the notebook's directory.
FILE = "../data/raw/database0001.csv"


def load_data(file_path=FILE, index_col=None) -> pd.DataFrame:
    """Read the raw dataset CSV into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like, default FILE
        CSV file to read. The default is the module-level ``FILE`` constant,
        so the path is defined in exactly one place.
    index_col : int, str or None, default None
        Forwarded to ``pandas.read_csv``. ``None`` keeps the previous
        behaviour (every CSV column becomes a data column).

    Returns
    -------
    pd.DataFrame
        The parsed file contents.

    Raises
    ------
    FileNotFoundError
        Raised by ``pandas.read_csv`` when ``file_path`` does not exist.
    """
    # NOTE(review): the preview of the loaded frame shows an "Unnamed: 0"
    # column, which looks like a row index saved into the CSV. With the
    # default it is kept as a regular column and ends up among the features;
    # consider index_col=0 -- confirm against the file before changing.
    return pd.read_csv(file_path, index_col=index_col)


In [4]:
# Load the dataset from the default path and preview its first rows.
wine = load_data()
wine.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,10.1,0.45,0.23,1.9,0.082,10.0,18.0,0.99774,3.22,0.65,9.3,6
1,15.0,0.21,0.44,2.2,0.075,10.0,24.0,1.00005,3.07,0.84,9.2,7
2,8.1,0.725,0.22,2.2,0.072,11.0,41.0,0.9967,3.36,0.55,9.1,5
3,8.1,0.575,0.22,2.1,0.077,12.0,65.0,0.9967,3.29,0.51,9.2,5
4,7.9,0.65,0.01,2.5,0.078,17.0,38.0,0.9963,3.34,0.74,11.7,7


In [5]:
# wine.describe().transpose()

In [6]:
# wine.hist(figsize=(20,15))
# plt.show()

In [7]:
# wine['residual sugar'].apply(np.log).plot.hist()
# plt.show()

In [8]:
# wine['chlorides'].apply(np.log).plot.hist()
# plt.show()

In [9]:
# wine['total sulfur dioxide'].apply(np.log).plot.hist()
# plt.show()

# Stratificação

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single shuffled 80/20 split, stratified on `quality`. The notebook-wide
# RANDOM_SEED is used instead of a second hard-coded literal so the seed is
# defined in one place.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# n_splits=1 yields exactly one (train, test) pair, so take it directly
# rather than looping.
train_index, test_index = next(split.split(wine, wine['quality']))

# split() returns positional row indices, so they must be applied with .iloc.
# Label-based .loc only gives the same rows while `wine` has a default
# 0..n-1 index.
train_set = wine.iloc[train_index]
test_set = wine.iloc[test_index]

In [11]:
# Rows with quality == 3 are removed from both partitions.
# NOTE(review): presumably that class is too rare to be useful -- confirm.
train_set = train_set.loc[train_set["quality"].ne(3)]
test_set = test_set.loc[test_set["quality"].ne(3)]

# Features are every column except `quality`; `quality` is the target.
X_train = train_set.drop(columns="quality")
Y_train = train_set["quality"]
X_test = test_set.drop(columns="quality")
Y_test = test_set["quality"]


# Pipeline0001

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the named columns, looked up on `wine`.
# `wine` still contains the `quality` column, while LogAtt is applied to the
# feature matrix without it. The positions agree only as long as `quality`
# comes after the three log-transformed columns (it is the last column in the
# preview of the loaded frame).
tsd, chl, rs, qual = [
    wine.columns.get_loc(i)
    for i in ['total sulfur dioxide', 'chlorides', 'residual sugar', 'quality']
]


class LogAtt(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies the natural log to three columns.

    The columns are addressed by the module-level positions ``tsd``, ``chl``
    and ``rs`` (total sulfur dioxide, chlorides, residual sugar). All other
    columns pass through unchanged.
    """

    def __init__(self) -> None:
        # No hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a float copy of ``X`` with the three columns log-transformed.

        Parameters
        ----------
        X : array-like, 2-D
            Numeric feature matrix indexed by column position.

        Returns
        -------
        numpy.ndarray
            A new float array with the same shape as ``X``. ``X`` itself is
            left untouched.
        """
        # Work on a float copy: the previous `X_l = X` only aliased the input,
        # so the caller's array was overwritten in place (transforming the
        # same array twice applied the log twice) and an integer input would
        # have truncated the logarithms on assignment.
        X_l = np.array(X, dtype=float)
        log_cols = [tsd, chl, rs]
        X_l[:, log_cols] = np.log(X_l[:, log_cols])
        return X_l


pipe = LogAtt()

# Testando modelos de forma superficial

In [13]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Number of folds, passed as `cv` to cross_val_predict in the model cells.
CROSS_VAL = 4

# Plain NumPy inputs for the estimators. The features go through the
# log-transform step (no fit call is needed: LogAtt.fit does nothing), and
# the target is converted as is.
X_train_pp = pipe.transform(X_train.values)
Y_train_np = Y_train.to_numpy()


In [20]:
# Support-vector classifier trained on the full training set.
# NOTE(review): cross_val_predict is documented to fit its own clones per
# fold, so this fit presumably only matters if `svc_clf` is used directly
# later -- confirm.
svc_clf = SVC(random_state=RANDOM_SEED).fit(X_train_pp, Y_train_np)

# Out-of-fold predictions for every training row.
svc_clf_pred = cross_val_predict(
    estimator=svc_clf,
    X=X_train_pp,
    y=Y_train_np,
    cv=CROSS_VAL,
)

# Cell output: (confusion matrix, accuracy) of the out-of-fold predictions.
(
    confusion_matrix(Y_train_np, svc_clf_pred),
    accuracy_score(Y_train_np, svc_clf_pred),
)


(array([[ 0,  5,  3,  0,  0],
        [ 0, 69, 47,  0,  0],
        [ 0, 45, 65,  0,  0],
        [ 0,  4, 30,  0,  0],
        [ 0,  0,  4,  0,  0]]),
 0.49264705882352944)

In [21]:
# Linear classifier trained with SGD on the full training set.
# NOTE(review): cross_val_predict is documented to fit its own clones per
# fold, so this fit presumably only matters if `sgd_clf` is used directly
# later -- confirm.
sgd_clf = SGDClassifier(random_state=RANDOM_SEED).fit(X_train_pp, Y_train_np)

# Out-of-fold predictions for every training row.
sgd_clf_pred = cross_val_predict(
    estimator=sgd_clf,
    X=X_train_pp,
    y=Y_train_np,
    cv=CROSS_VAL,
)

# Cell output: (confusion matrix, accuracy) of the out-of-fold predictions.
(
    confusion_matrix(Y_train_np, sgd_clf_pred),
    accuracy_score(Y_train_np, sgd_clf_pred),
)

(array([[  0,   7,   0,   1,   0],
        [  0, 110,   5,   1,   0],
        [  0,  80,  20,  10,   0],
        [  0,  13,  13,   8,   0],
        [  0,   1,   2,   1,   0]]),
 0.5073529411764706)

In [28]:
# Random forest trained on the full training set.
# NOTE(review): cross_val_predict is documented to fit its own clones per
# fold, so this fit presumably only matters if `rf_clf` is used directly
# later -- confirm.
rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train_pp, Y_train_np)

# Out-of-fold predictions for every training row.
rf_clf_pred = cross_val_predict(
    estimator=rf_clf,
    X=X_train_pp,
    y=Y_train_np,
    cv=CROSS_VAL,
)

# Cell output: (confusion matrix, accuracy) of the out-of-fold predictions.
(
    confusion_matrix(Y_train_np, rf_clf_pred),
    accuracy_score(Y_train_np, rf_clf_pred),
)


(array([[ 0,  5,  3,  0,  0],
        [ 0, 87, 28,  1,  0],
        [ 0, 29, 73,  8,  0],
        [ 0,  1, 21, 12,  0],
        [ 0,  0,  3,  1,  0]]),
 0.6323529411764706)