In [3]:
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
#from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

from sklearn.multiclass import OneVsRestClassifier

from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.metrics import ( precision_score, f1_score, make_scorer,
                              recall_score, confusion_matrix )

In [4]:
df = pd.read_csv('Data/RedWine_Train.csv')
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,112,28.0,56,19,75.0,17,60,998.0,316,58,98,6
1,56,615.0,0,16,89.0,16,59,9943.0,358,52,99,5
2,89,62.0,18,38,176.0,52,145,9986.0,316,88,92,5
3,89,62.0,19,39,17.0,51,148,9986.0,317,93,92,5
4,76,39.0,31,23,82.0,23,71,9982.0,352,65,97,5
...,...,...,...,...,...,...,...,...,...,...,...,...
326,64,38.0,14,22,38.0,15,25,99514.0,344,65,111,6
327,72,39.0,44,26,66.0,22,48,99494.0,33,84,115,6
328,66,725.0,2,78,73.0,29,79,9977.0,329,54,92,5
329,54,74.0,9,17,89.0,16,26,99402.0,367,56,116,6


In [5]:
#Remover as 5 linhas com nulos
df = df.dropna()

In [6]:
df_X = df.copy(deep=True).drop(columns=['quality'])
for col in df_X.columns:
    df_X[col] = preprocessing.StandardScaler().fit_transform(df_X[[col]])
display(df_X)
#
df_y = df[['quality']].copy(deep=True)
display(df_y)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.341161,-0.457947,1.600244,-0.165564,-0.150996,-0.005294,0.355984,-1.020178,0.225575,-0.118458,-0.077460
1,-0.695120,2.250893,-1.153201,-0.258197,0.064093,-0.087480,0.325817,-0.823264,0.636398,-0.358815,-0.067888
2,0.504832,-0.301047,-0.268165,0.421110,1.400723,2.871222,2.920139,-0.822317,0.225575,1.083326,-0.134894
3,0.504832,-0.301047,-0.218997,0.451987,-1.042083,2.789036,3.010638,-0.822317,0.235356,1.283624,-0.134894
4,0.032124,-0.407185,0.371027,-0.042054,-0.043452,0.487823,0.687815,-0.822405,0.577709,0.161958,-0.087032
...,...,...,...,...,...,...,...,...,...,...,...
326,-0.404222,-0.411800,-0.464840,-0.072932,-0.719448,-0.169667,-0.699845,1.148544,0.499457,0.161958,0.046981
327,-0.113325,-0.407185,1.010220,0.050579,-0.289268,0.405637,-0.006015,1.148104,-2.542591,0.923088,0.085270
328,-0.331498,2.758512,-1.054864,1.656213,-0.181724,0.980940,0.929148,-0.822515,0.352734,-0.278696,-0.134894
329,-0.767844,-0.245670,-0.710683,-0.227320,0.064093,-0.087480,-0.669678,1.146079,0.724431,-0.198577,0.094843


Unnamed: 0,quality
0,6
1,5
2,5
3,5
4,5
...,...
326,6
327,6
328,5
329,6


In [7]:
df['quality'].value_counts()

quality
5    148
6    127
7     37
4      9
8      4
3      1
Name: count, dtype: int64

In [8]:
# Crossvalidation Scores using Decision Tree
scoring = {'crossval_precision': make_scorer(precision_score, average='weighted', zero_division=0),
           'crossval_recall':    make_scorer(recall_score, average='weighted', zero_division=0),
           'crossval_F1':        make_scorer(f1_score, average='weighted', zero_division=0), }

clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=1234))

#
scores = cross_validate(clf, df_X, df_y, scoring=scoring, cv=5)
print('crossval_precision(avg) = ', scores['test_crossval_precision'].mean())
print('crossval_precision(std) = ', scores['test_crossval_precision'].std())
print('crossval_recall(avg) = ', scores['test_crossval_recall'].mean())
print('crossval_recall(std) = ', scores['test_crossval_recall'].std())
print('crossval_F1(avg) = ', scores['test_crossval_F1'].mean())
print('crossval_F1(std) = ', scores['test_crossval_F1'].std())
print('crossval_F1(list) = ', scores['test_crossval_F1'])

crossval_precision(avg) =  0.49054213429591
crossval_precision(std) =  0.020654913615400464
crossval_recall(avg) =  0.3557575757575758
crossval_recall(std) =  0.044349951984687816
crossval_F1(avg) =  0.4016144875836389
crossval_F1(std) =  0.040680275011060564
crossval_F1(list) =  [0.41957226 0.32589047 0.43569006 0.43315814 0.3937615 ]




In [9]:
# Crossvalidation Scores using KNN
scoring = {'crossval_precision': make_scorer(precision_score, average='weighted', zero_division=0),
           'crossval_recall':    make_scorer(recall_score, average='weighted', zero_division=0),
           'crossval_F1':        make_scorer(f1_score, average='weighted', zero_division=0), }

clf = OneVsRestClassifier(KNeighborsClassifier())

#
scores = cross_validate(clf, df_X, df_y, scoring=scoring, cv=5)
print('crossval_precision(avg) = ', scores['test_crossval_precision'].mean())
print('crossval_precision(std) = ', scores['test_crossval_precision'].std())
print('crossval_recall(avg) = ', scores['test_crossval_recall'].mean())
print('crossval_recall(std) = ', scores['test_crossval_recall'].std())
print('crossval_F1(avg) = ', scores['test_crossval_F1'].mean())
print('crossval_F1(std) = ', scores['test_crossval_F1'].std())
print('crossval_F1(list) = ', scores['test_crossval_F1'])

crossval_precision(avg) =  0.4563019112617816
crossval_precision(std) =  0.052651704021395226
crossval_recall(avg) =  0.4631701631701632
crossval_recall(std) =  0.05249963743719669
crossval_F1(avg) =  0.4451320943925313
crossval_F1(std) =  0.05421314568486414
crossval_F1(list) =  [0.43189287 0.48410449 0.52604775 0.41138921 0.37222615]




In [16]:
# Crossvalidation Scores using MLP Classifier
#y = column_or_1d(y, warn=True)
scoring = {'crossval_precision': make_scorer(precision_score, average='weighted', zero_division=0),
           'crossval_recall':    make_scorer(recall_score, average='weighted', zero_division=0),
           'crossval_F1':        make_scorer(f1_score, average='weighted', zero_division=0), }

clf = OneVsRestClassifier(MLPClassifier(random_state=42, max_iter=80000))
#
scores = cross_validate(clf, df_X, df_y, scoring=scoring, cv=5)
print('crossval_precision(avg) = ', scores['test_crossval_precision'].mean())
print('crossval_precision(std) = ', scores['test_crossval_precision'].std())
print('crossval_recall(avg) = ', scores['test_crossval_recall'].mean())
print('crossval_recall(std) = ', scores['test_crossval_recall'].std())
print('crossval_F1(avg) = ', scores['test_crossval_F1'].mean())
print('crossval_F1(std) = ', scores['test_crossval_F1'].std())
print('crossval_F1(list) = ', scores['test_crossval_F1'])



crossval_precision(avg) =  0.5305627685451915
crossval_precision(std) =  0.012714219908741674
crossval_recall(avg) =  0.5213053613053613
crossval_recall(std) =  0.0320200653235833
crossval_F1(avg) =  0.5080487691112261
crossval_F1(std) =  0.014372681223610203
crossval_F1(list) =  [0.51675066 0.52914545 0.4869579  0.49986476 0.50752507]


In [22]:
# Crossvalidation Scores using LGBM
scoring = {'crossval_precision': make_scorer(precision_score, average='binary', zero_division=0),
           'crossval_recall':    make_scorer(recall_score, average='binary', zero_division=0),
           'crossval_F1':        make_scorer(f1_score, average='binary', zero_division=0), }
#
model_crossval = LGBMClassifier(objective='binary', random_state=42)
scores = cross_validate(model_crossval, df_X, df_y, scoring=scoring, cv=5)
print('crossval_precision(avg) = ', scores['test_crossval_precision'].mean())
print('crossval_precision(std) = ', scores['test_crossval_precision'].std())
print('crossval_recall(avg) = ', scores['test_crossval_recall'].mean())
print('crossval_recall(std) = ', scores['test_crossval_recall'].std())
print('crossval_F1(avg) = ', scores['test_crossval_F1'].mean())
print('crossval_F1(std) = ', scores['test_crossval_F1'].std())
print('crossval_F1(list) = ', scores['test_crossval_F1'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 3944, number of negative: 3959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3685
[LightGBM] [Info] Number of data points in the train set: 7903, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499051 -> initscore=-0.003796
[LightGBM] [Info] Start training from score -0.003796


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 3944, number of negative: 3959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3692
[LightGBM] [Info] Number of data points in the train set: 7903, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499051 -> initscore=-0.003796
[LightGBM] [Info] Start training from score -0.003796
[LightGBM] [Info] Number of positive: 3944, number of negative: 3959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3677
[LightGBM] [Info] Number of data points in the train set: 7903, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499051 -> initscore=-0.003796
[LightGBM] [Info] Start training from score -0.003796


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 3944, number of negative: 3959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3681
[LightGBM] [Info] Number of data points in the train set: 7903, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499051 -> initscore=-0.003796
[LightGBM] [Info] Start training from score -0.003796
[LightGBM] [Info] Number of positive: 3944, number of negative: 3960
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001661 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3676
[LightGBM] [Info] Number of data points in the train set: 7904, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498988 -> initscore=-0.004049
[LightGBM] [Info] Start training from score -0.004049
crossval_precision

In [17]:
# Crossvalidation Scores using SVC

scoring = {'crossval_precision': make_scorer(precision_score, average='weighted', zero_division=0),
           'crossval_recall':    make_scorer(recall_score, average='weighted', zero_division=0),
           'crossval_F1':        make_scorer(f1_score, average='weighted', zero_division=0), }

clf = OneVsRestClassifier(SVC(kernel="linear", C=0.025, random_state=42))

#
scores = cross_validate(clf, df_X, df_y, scoring=scoring, cv=5)
print('crossval_precision(avg) = ', scores['test_crossval_precision'].mean())
print('crossval_precision(std) = ', scores['test_crossval_precision'].std())
print('crossval_recall(avg) = ', scores['test_crossval_recall'].mean())
print('crossval_recall(std) = ', scores['test_crossval_recall'].std())
print('crossval_F1(avg) = ', scores['test_crossval_F1'].mean())
print('crossval_F1(std) = ', scores['test_crossval_F1'].std())
print('crossval_F1(list) = ', scores['test_crossval_F1'])

crossval_precision(avg) =  0.39289224841137405
crossval_precision(std) =  0.10413092511469119
crossval_recall(avg) =  0.4600466200466201
crossval_recall(std) =  0.04142358388559937
crossval_F1(avg) =  0.3872534625428079
crossval_F1(std) =  0.06993388696714807
crossval_F1(list) =  [0.39968486 0.25777778 0.46660038 0.38984221 0.42236208]




In [20]:
# Crossvalidation Scores using RandomForest

scoring = {'crossval_precision': make_scorer(precision_score, average='weighted', zero_division=0),
           'crossval_recall':    make_scorer(recall_score, average='weighted', zero_division=0),
           'crossval_F1':        make_scorer(f1_score, average='weighted', zero_division=0), }

clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
#
scores = cross_validate(clf, df_X, df_y, scoring=scoring, cv=5)
print('crossval_precision(avg) = ', scores['test_crossval_precision'].mean())
print('crossval_precision(std) = ', scores['test_crossval_precision'].std())
print('crossval_recall(avg) = ', scores['test_crossval_recall'].mean())
print('crossval_recall(std) = ', scores['test_crossval_recall'].std())
print('crossval_F1(avg) = ', scores['test_crossval_F1'].mean())
print('crossval_F1(std) = ', scores['test_crossval_F1'].std())
print('crossval_F1(list) = ', scores['test_crossval_F1'])



crossval_precision(avg) =  0.5203529568080596
crossval_precision(std) =  0.061223644465197444
crossval_recall(avg) =  0.5432167832167831
crossval_recall(std) =  0.05929542132469306
crossval_F1(avg) =  0.521822991446003
crossval_F1(std) =  0.07000790607593184
crossval_F1(list) =  [0.40012796 0.55342613 0.50071332 0.60958419 0.54526336]


In [26]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

rng = np.random.RandomState(2)

figure = plt.figure(figsize=(12, 4))
i = 1
# iterate over datasets
# preprocess dataset, split into training and test part
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.4, random_state=42
)

x_min, x_max = df_X[:, 0].min() - 0.5, df_X[:, 0].max() + 0.5
y_min, y_max = df_X[:, 1].min() - 0.5, df_X[:, 1].max() + 0.5

# just plot the dataset first
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
if ds_cnt == 0:
    ax.set_title("Input data")
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
# Plot the testing points
ax.scatter(
    X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_xticks(())
ax.set_yticks(())
i += 1

# iterate over classifiers
for name, clf in zip(names, classifiers):
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

    clf = make_pipeline(StandardScaler(), clf)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    DecisionBoundaryDisplay.from_estimator(
        clf, df_X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
    )

    # Plot the training points
    ax.scatter(
        X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
    )
    # Plot the testing points
    ax.scatter(
        X_test[:, 0],
        X_test[:, 1],
        c=y_test,
        cmap=cm_bright,
        edgecolors="k",
        alpha=0.6,
    )

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    if ds_cnt == 0:
        ax.set_title(name)
    ax.text(
        x_max - 0.3,
        y_min + 0.3,
        ("%.2f" % score).lstrip("0"),
        size=15,
        horizontalalignment="right",
    )
    i += 1

plt.tight_layout()
plt.show()

InvalidIndexError: (slice(None, None, None), 0)

<Figure size 1200x400 with 0 Axes>