# Prática de Aprendizado Supervisionado

**Importando bibliotecas e funções**

In [65]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Silence every warning for the rest of the notebook so the sweep output stays readable.
warnings.filterwarnings("ignore")

# Shared 5-fold stratified splitter; the cross-validation code below reads it as `x_val`.
x_val = StratifiedKFold(n_splits=5)

**Lendo o arquivo**

In [36]:
def leitura(dataset, nome):
    """Load one extracted-feature ARFF file into a pandas DataFrame.

    Parameters
    ----------
    dataset : str
        Name of the dataset folder under ``datasets/extracted``.
    nome : str
        Base name of the ``.arff`` file, without the extension.

    Returns
    -------
    pandas.DataFrame
        Frame built from the data records returned by ``loadarff``.
    """
    caminho = 'datasets/extracted/%s/%s.arff' % (dataset, nome)
    # loadarff returns a (data, meta) pair; only the data records are kept.
    registros, _meta = loadarff(caminho)
    return pd.DataFrame(registros)

**Separando em Conjunto de Treino e Teste**

In [37]:
# Splits a feature frame into features (all columns but the last) and integer-encoded classes.
def treinoTeste(df):
    """Build a stratified 80/20 train/test split from a feature DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is excluded from the features and whose
        ``'class'`` column holds the labels.

    Returns
    -------
    list
        The result of ``train_test_split``; callers unpack it as
        ``X_train, X_test, y_train, y_test``.
    """
    # Every column except the last one is used as a feature.
    X = df.iloc[:, 0:-1].values

    # Map each distinct label to a 1-based integer code in order of first
    # appearance. Looking the code up per label (rather than reusing a running
    # counter) keeps the encoding correct when the classes are interleaved.
    codigos = {}
    y = [codigos.setdefault(rotulo, len(codigos) + 1) for rotulo in df['class']]

    # Fixed seed so the same frame always yields the same split.
    return train_test_split(X, y, test_size=0.2, random_state=327, stratify=y)

**Padronizando os dados com Técnicas de Normalização**

In [38]:
def normalizar(X_train, X_test, selectedNormalization):
    """Scale the train and test features with the selected scaler.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices. The scaler is fitted on ``X_train`` only and then
        applied to both, so no test information reaches the fit.
    selectedNormalization : int
        0 = no scaling, 1 = MinMaxScaler, 2 = StandardScaler,
        3 = MaxAbsScaler, 4 = RobustScaler.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` after scaling; the inputs themselves when the
        code is 0.

    Raises
    ------
    ValueError
        If ``selectedNormalization`` is not one of the codes above.
    """
    if selectedNormalization == 0:
        return X_train, X_test

    # Scaler class names, resolved on `preprocessing` only after validation.
    nomes_scalers = {1: 'MinMaxScaler', 2: 'StandardScaler',
                     3: 'MaxAbsScaler', 4: 'RobustScaler'}
    try:
        nome_scaler = nomes_scalers[selectedNormalization]
    except (KeyError, TypeError):
        # Previously an unknown code fell through to an unbound `scaler`.
        raise ValueError(
            "selectedNormalization must be one of 0, 1, 2, 3, 4; got %r"
            % (selectedNormalization,)
        ) from None

    scaler = getattr(preprocessing, nome_scaler)()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test

**Treinando os Classificadores**

In [41]:
def treinarClassificadores(classificador, X_train, y_train,):
    """Fit a classifier and score it with cross-validation on the training set.

    Both metrics are computed in a single ``cross_validate`` pass, so they are
    measured on the same fitted fold models and each fold is trained once.

    Parameters
    ----------
    classificador : estimator
        Object exposing ``fit``; it is fitted on the full training set.
    X_train, y_train : array-like
        Training features and labels.

    Returns
    -------
    tuple of float
        ``(balanced accuracy mean, balanced accuracy std,
        weighted F1 mean, weighted F1 std)`` across the folds of ``x_val``.
    """
    clsfcd = classificador
    # Kept from the original behaviour: the estimator passed in ends up fitted
    # on the whole training set, independently of the cross-validation below.
    clsfcd.fit(X_train, y_train)

    pontuacoes = cross_validate(clsfcd, X_train, y_train, cv=x_val,
                                scoring=('balanced_accuracy', 'f1_weighted'),
                                n_jobs=-1)
    x_val_balanced_accuracy = pontuacoes['test_balanced_accuracy']
    x_val_weighted_f1score = pontuacoes['test_f1_weighted']

    return (x_val_balanced_accuracy.mean(), x_val_balanced_accuracy.std(),
            x_val_weighted_f1score.mean(), x_val_weighted_f1score.std())

**Avaliando as combinações de extrator, normalizador e classificador (validação cruzada no conjunto de treino)**

In [47]:
def grid(dataset):
    """Cross-validate every extractor / normalizer / classifier combination.

    For each feature extractor the ARFF file is read and split once; each
    normalizer is then applied to that split and every classifier is scored
    on the training portion. The held-out test portion is produced but not
    evaluated here.

    Parameters
    ----------
    dataset : str
        Dataset folder name passed on to ``leitura``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, indexed by
        ``('Extrator', 'Normalizador', 'Classificador')``, with the mean and
        standard deviation of balanced accuracy and weighted F1.
    """
    # Normalization code understood by `normalizar` -> label shown in the results.
    normalizadores = {0: 'não aplicado', 1: 'MinMaxScaler', 2: 'StandardScaler',
                      3: 'MaxAbsScaler', 4: 'RobustScaler'}

    extratores = ['FCTH', 'Gabor', 'GCH', 'LBP', 'LCH', 'Moments', 'PHOG', 'Tamura']

    nomes = ['Extrator', 'Normalizador', 'Classificador', 'Acurácia balanceada', 'Desvio acurácia',
             'F1 Score ponderado', 'Desvio F1']

    classificadores = [(GaussianNB(), 'Gaussian Naive Bayes'),
                       (LogisticRegression(), 'Logistic Regression'),
                       (DecisionTreeClassifier(), 'Decision Tree'),
                       (KNeighborsClassifier(n_neighbors = 3), 'KNN'),
                       (LinearDiscriminantAnalysis(), 'Linear Discriminant Analysis'),
                       (SVC(), 'SVM'),
                       (RandomForestClassifier(random_state=42), 'Random Forest'),
                       (MLPClassifier(alpha=1), 'MLP')]

    analise = []

    for extrator in extratores:

        df = leitura(dataset, extrator)

        # The split does not depend on the normalizer, so it is computed once
        # per extractor instead of once per (extractor, normalizer) pair.
        X_train_bruto, X_test_bruto, y_train, _y_test = treinoTeste(df)

        for norm, norm_nome in normalizadores.items():
            X_train, _X_test = normalizar(X_train_bruto, X_test_bruto, norm)

            for cls, nome in classificadores:
                bal_acc_mean, bal_acc_std, w_f1_mean, w_f1_std = treinarClassificadores(cls, X_train, y_train)

                analise.append([extrator, norm_nome, nome,
                                bal_acc_mean, bal_acc_std, w_f1_mean, w_f1_std])

    return pd.DataFrame(analise, columns=nomes).set_index(['Extrator', 'Normalizador', 'Classificador'])

In [52]:
def selecao(resultados):
    """Order the result rows from best to worst combined score.

    The combined score is the plain average of the 'Acurácia balanceada' and
    'F1 Score ponderado' columns.

    Parameters
    ----------
    resultados : pandas.DataFrame
        Frame holding both metric columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows, highest combined score first.
    """
    pontuacao = (resultados['Acurácia balanceada'] + resultados['F1 Score ponderado']) / 2

    # argsort yields ascending positions; reversing them puts the best first.
    ordem_decrescente = pontuacao.argsort().to_numpy()[::-1]
    ordenado = resultados.iloc[ordem_decrescente, :]

    return pd.DataFrame(ordenado)

In [49]:
# Run the full extractor / normalizer / classifier sweep on the 'Shapes' dataset.
shapes = grid('Shapes')

In [53]:
# Rank the Shapes results from best to worst combined score.
melhorShapes = selecao(shapes)

In [270]:
# Keep the ten best-ranked Shapes combinations and display them.
dez_melhores = melhorShapes.head(10)
dez_melhores

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Acurácia balanceada,Desvio acurácia,F1 Score ponderado,Desvio F1
Extrator,Normalizador,Classificador,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LBP,não aplicado,Random Forest,0.9625,0.024296,0.961787,0.024998
LBP,RobustScaler,Random Forest,0.9625,0.024296,0.961787,0.024998
LBP,MaxAbsScaler,Random Forest,0.9625,0.024296,0.961787,0.024998
LBP,MinMaxScaler,Random Forest,0.9625,0.024296,0.961787,0.024998
Tamura,StandardScaler,SVM,0.958333,0.034861,0.957751,0.035833
Tamura,RobustScaler,SVM,0.958333,0.034861,0.957751,0.035833
LBP,StandardScaler,Random Forest,0.958333,0.029463,0.957648,0.03014
LBP,RobustScaler,MLP,0.954167,0.024296,0.954305,0.033168
LBP,MaxAbsScaler,SVM,0.954167,0.054962,0.953765,0.055473
LBP,MinMaxScaler,SVM,0.954167,0.054962,0.953765,0.055473


In [62]:
# Run the full extractor / normalizer / classifier sweep on the 'Fruits' dataset.
fruits = grid('Fruits')

In [63]:
# Rank the Fruits results from best to worst combined score.
melhorFruits = selecao(fruits)

In [261]:
# Display the ten best-ranked Fruits combinations.
melhorFruits.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Acurácia balanceada,Desvio acurácia,F1 Score ponderado,Desvio F1
Extrator,Normalizador,Classificador,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FCTH,não aplicado,Random Forest,0.726509,0.02732,0.785027,0.023909
FCTH,RobustScaler,Random Forest,0.726509,0.02732,0.78501,0.02391
FCTH,MaxAbsScaler,Random Forest,0.724156,0.029858,0.784063,0.024561
FCTH,MinMaxScaler,Random Forest,0.724156,0.029858,0.784063,0.024561
FCTH,StandardScaler,Random Forest,0.717887,0.029615,0.779984,0.025708
FCTH,não aplicado,MLP,0.714004,0.037089,0.772624,0.020765
FCTH,não aplicado,KNN,0.721887,0.009353,0.751062,0.013655
LCH,StandardScaler,MLP,0.72,0.036032,0.751162,0.041936
GCH,StandardScaler,MLP,0.702329,0.033241,0.754994,0.012588
FCTH,StandardScaler,MLP,0.694676,0.019043,0.755149,0.014623


# O porquê
## Fruits

In [274]:
# Load the LCH features of the Fruits dataset.
# NOTE(review): the ranking above and the text below both refer to FCTH as the
# best extractor for Fruits, yet LCH is loaded here -- confirm which was intended.
fruits_caracteristicas = leitura('Fruits', 'LCH')

In [275]:
# Summary statistics per feature; the transpose puts one feature on each row.
# NOTE(review): this rebinds the name that held the loaded frame, so re-running
# only this cell would summarise the summary instead of the data.
fruits_caracteristicas = fruits_caracteristicas.describe().T

In [276]:
# Standard deviation of the per-feature value ranges (max - min).
np.std(fruits_caracteristicas['max'] - fruits_caracteristicas['min'])

4488.968690560177

O FCTH gera características baseadas em cor e textura (192 características), as quais apresentam valores desnormalizados. O dataset Alien vs Predator tem cores e texturas diferentes entre as duas classes. Como observado acima, a variação entre as escalas das características é pequena. Isso pode explicar por que o melhor resultado foi obtido sem a normalização do nosso dataset.

## Shapes

In [43]:
# Load the LBP features of the Shapes dataset.
shapes_caracteristicas = leitura('Shapes', 'LBP')

In [44]:
# Summary statistics per feature; the transpose puts one feature on each row.
# NOTE(review): this rebinds the name that held the loaded frame, so re-running
# only this cell would summarise the summary instead of the data.
shapes_caracteristicas = shapes_caracteristicas.describe().T
shapes_caracteristicas.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,300.0,10.973333,3.863813,3.0,8.0,11.0,13.0,21.0
2,300.0,1.156667,1.3183,0.0,0.0,1.0,2.0,8.0
3,300.0,4.186667,2.909761,0.0,2.0,3.0,6.0,15.0
4,300.0,0.206667,0.466921,0.0,0.0,0.0,0.0,2.0
5,300.0,1.043333,1.259661,0.0,0.0,1.0,2.0,5.0


In [46]:
# Standard deviation of the per-feature value ranges (max - min).
np.std(shapes_caracteristicas['max'] - shapes_caracteristicas['min'])

11.778871169610161

O conjunto não apresenta diferença significativa de cores, uma vez que as imagens são em preto e branco. Assim, o melhor extrator para o dataset foi o LBP, o qual gera características baseadas apenas na textura. Outra característica dos dados é que, após a aplicação do LBP, as características extraídas não apresentam grande variação quanto à escala; assim, a aplicação de um normalizador não foi necessária.