O objetivo deste trabalho é comparar diversos métodos de classificação para a base de dados de qualidade de vinhos disponível em https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv.

Vocês devem encontrar um bom modelo preditivo, variando:
* o número e conjunto de features (atributos) utilizados
* o método utilizado
* a configuração do algoritmo correspondente (e.g.: número k para nearest neighbors, profundidade para árvore de decisão)

Vocês devem listar algumas métricas de qualidade, tais como: precision, recall, accuracy e f1_score, e utilizar accuracy como base para a avaliação final, considerando a accuracy média de 10 iterações para cada configuração.

Para assegurar que eu obterei os mesmos resultados de vocês, vocês devem estabelecer a semente para a geração dos números aleatórios (utilizados para separar os conjuntos de treinamento e teste, por exemplo), utilizando os seguintes comandos no início do seu código (podem utilizar uma outra semente):
```
import random
random.seed(1001001)
```

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

%matplotlib inline

In [4]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' ,sep=';')

df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
df.quality.describe()

count    1599.000000
mean        5.636023
std         0.807569
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         8.000000
Name: quality, dtype: float64

In [6]:
df.quality.unique()

array([5, 6, 7, 4, 8, 3])

In [7]:
X = df.iloc[:, :-1].values
Y = df.iloc[:,-1].values

import random
random.seed(1001001)

from sklearn.preprocessing import normalize

X = normalize(X)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
y_pred = regressor.predict(X_test)

y_round = []
for y in y_pred:
    y_round.append(int(round(y)))

In [10]:
np.mean(Y_test == y_round)

0.63125

### SVM

In [11]:
from sklearn.svm import SVC

svm = SVC(C = 8, kernel = 'linear')
svm.fit(X_train, Y_train)

SVC(C=8, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
svm.score(X_test, Y_test)

0.559375

### K-NN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 500).fit(X_train, Y_train)

In [14]:
knn.score(X_test, Y_test)

0.528125

In [15]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=60)
clf.fit(X_train, Y_train)   

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=60, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [16]:
clf.score(X_test, Y_test)

0.65

In [17]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
dt.score(X_test, Y_test)

0.6125

In [19]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(X_train, Y_train)
gnb_predictions = gnb.predict(X_test)

gnb.score(X_test, Y_test)

0.425

### Neural Network

In [20]:
D = X.shape[1] # input dim
M = 11 # hidden size
K = 6 # number classes

In [21]:
T = np.zeros((Y.size, K))
for i in range(Y.size):
    T[i, Y[i]-3] = 1

T_train = np.zeros((Y_train.size, K))
for i in range(Y_train.size):
    T_train[i, Y_train[i]-3] = 1
    
T_test = np.zeros((Y_test.size, K))
for i in range(Y_test.size):
    T_test[i, Y_test[i]-3] = 1

In [22]:
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.1))

W1 = init_weights([D, M])
b1 = init_weights([M])

W2 = init_weights([M, M])
b2 = init_weights([M])

W3 = init_weights([M, M])
b3 = init_weights([M])

Wf = init_weights([M, K])
bf = init_weights([K])

In [23]:
tfX = tf.placeholder(tf.float32, [None, D])
tfY = tf.placeholder(tf.float32, [None, K])

def forward():
    Z = tf.nn.sigmoid(tf.matmul(tfX, W1) + b1)
    Z2 = tf.nn.sigmoid(tf.matmul(Z, W2) + b2)
    Z3 = tf.nn.sigmoid(tf.matmul(Z2, W3) + b3)
    
    return tf.matmul(Z3, Wf) + bf

py_x = forward()

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfY, logits=py_x))

In [24]:
train_op = tf.train.AdamOptimizer().minimize(cost)
predict_op = tf.argmax(py_x, 1)

In [25]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

from tqdm import tqdm
pbar= tqdm(range(15000))

for i in pbar:
    sess.run(train_op, feed_dict={tfX: X_train, tfY: T_train})
    pred_train = sess.run(predict_op, feed_dict={tfX: X_train, tfY: T_train})
    pred_test = sess.run(predict_op, feed_dict={tfX: X_test, tfY: T_test})
    
    if i%1000 == 0:
        pbar.set_description("acc train: {}; acc test: {}".format(np.mean(Y_train-3 == pred_train), 
                                                                  np.mean(Y_test-3 == pred_test)))

acc train: 0.6411258795934324; acc test: 0.64375: 100%|██████████| 15000/15000 [00:45<00:00, 326.56it/s] 
