In [None]:
import numpy as np
import pandas as pd

import itertools
from collections import defaultdict

from sklearn.tree import DecisionTreeClassifier

## Funções básicas

Normaliza, divide_treino_teste e calcula moda

In [None]:
def normaliza(array):
    """Standardize values to zero mean and unit standard deviation (z-score).

    Parameters
    ----------
    array : numpy.ndarray or pandas.Series
        Numeric values. The mean and the standard deviation are computed
        over all elements of ``array``.

    Returns
    -------
    Same container type as ``array - mean``
        ``(array - mean) / std``. When the standard deviation is zero
        (constant input) the centered values, i.e. all zeros, are returned
        instead of dividing by zero, which would yield NaN/inf.
    """
    mean = np.average(array)
    std = np.std(array)  # np.std defaults to ddof=0

    centralizado = array - mean
    if std == 0:
        # Constant input: there is no spread to scale by.
        return centralizado

    return centralizado / std

def divide_treino_teste(x, y, tamanho_treino=0.8, seed=None):
    """Randomly split ``x``/``y`` into training and test subsets.

    Parameters
    ----------
    x : array-like, 2-D
        Feature matrix with one sample per row.
    y : array-like
        Targets, indexed along the first axis in step with ``x``.
    tamanho_treino : float, default 0.8
        Fraction of the samples assigned to the training subset; the
        number of training rows is ``int(n * tamanho_treino)``.
    seed : int or None, default None
        Seed for the random permutation. ``None`` keeps the original
        non-deterministic behaviour; pass an integer for a reproducible split.

    Returns
    -------
    x_treino, x_teste, y_treino, y_teste
        Both ``x`` outputs get a leading column of ones prepended, so they
        have one more column than ``x``.

    Raises
    ------
    ValueError
        If ``tamanho_treino`` is outside ``[0, 1]``.
    """
    if not 0 <= tamanho_treino <= 1:
        raise ValueError('tamanho_treino must be between 0 and 1')

    x = np.asarray(x)
    y = np.asarray(y)

    n = x.shape[0]
    q_treino = int(n * tamanho_treino)

    rng = np.random.default_rng(seed)
    indices = rng.permutation(n)
    idx_treino = indices[:q_treino]
    idx_teste = indices[q_treino:]

    # Prepend the constant column of ones to each subset.
    x_treino = np.concatenate([np.ones((q_treino, 1)), x[idx_treino]], axis=1)
    x_teste = np.concatenate([np.ones((n - q_treino, 1)), x[idx_teste]], axis=1)

    return x_treino, x_teste, y[idx_treino], y[idx_teste]

def moda(array):
    """Return the most frequent value found in ``array``.

    ``np.unique`` yields the distinct values in sorted order and ``argmax``
    picks the first maximum, so a tie is resolved in favour of the smallest
    value.
    """
    distintos, frequencias = np.unique(array, return_counts=True)
    return distintos[np.argmax(frequencias)]


### Mostra as coisas bonitinhas

In [None]:
def organiza_texto(texto_dict:dict):
    """Print a small text report for every entry of ``texto_dict``.

    Each key is printed as the parameter description and each value must be
    a mapping with the keys ``'Acuracia'``, ``'Recall'``, ``'Precision'`` and
    ``'F1-Score'``; the stored objects are printed as they are. A separator
    line of 60 dashes precedes every entry.
    """
    separador = '---'*20
    for parametros, metricas in texto_dict.items():
        print(separador)
        print(f'Parâmetros: {parametros}')
        print(f"Acuracia(Media e DP): {metricas['Acuracia']}\nRecall(Media e DP): {metricas['Recall']}\nPrecision(Media e DP): {metricas['Precision']}\nF1-Score(Media e DP): {metricas['F1-Score']}")

## Métricas de avaliação

Accuracy, Recall, Precision e F1-Score

In [None]:
class Metricas:
    """Binary-classification metrics with 1 as the positive label and 0 as the negative label.

    Recall, precision and F1 return ``0.0`` when their denominator is zero
    (for example a fold without positive samples, or a model that never
    predicts the positive class) instead of raising ``ZeroDivisionError``.
    """

    def report_geral(self, y_pred, y_verdadeiro):
        """Return ``[accuracy, recall, precision, f1_score]`` in that order."""
        return [
            self.acuracia_global(y_pred, y_verdadeiro),
            self.recall(y_pred, y_verdadeiro),
            self.precision(y_pred, y_verdadeiro),
            self.f1_score(y_pred, y_verdadeiro),
        ]

    def acuracia_global(self, y_pred, y_verdadeiro):
        """Return the fraction of positions where prediction and truth are equal."""
        resultado = np.where(y_pred == y_verdadeiro, 1, 0)
        acertos = np.count_nonzero(resultado)
        return acertos / resultado.shape[0]

    def recall(self, y_pred, y_verdadeiro):
        """Return TP / (TP + FN), or 0.0 when there are no such samples."""
        vp, _, fn = self._contagens(y_pred, y_verdadeiro)
        return self._razao(vp, vp + fn)

    def precision(self, y_pred, y_verdadeiro):
        """Return TP / (TP + FP), or 0.0 when nothing was predicted positive."""
        vp, fp, _ = self._contagens(y_pred, y_verdadeiro)
        return self._razao(vp, vp + fp)

    def f1_score(self, y_pred, y_verdadeiro):
        """Return the harmonic mean of recall and precision (0.0 if both are 0)."""
        recall = self.recall(y_pred, y_verdadeiro)
        precision = self.precision(y_pred, y_verdadeiro)
        return self._razao(2 * (recall * precision), recall + precision)

    @staticmethod
    def _contagens(y_pred, y_verdadeiro):
        """Count true positives, false positives and false negatives.

        Only labels exactly equal to 1 or 0 are counted; any other value is
        ignored, matching the explicit ``== 1`` / ``== 0`` comparisons.
        """
        y_pred = np.asarray(y_pred).ravel()
        y_verdadeiro = np.asarray(y_verdadeiro).ravel()

        pred_pos = y_pred == 1
        pred_neg = y_pred == 0
        real_pos = y_verdadeiro == 1
        real_neg = y_verdadeiro == 0

        vp = int(np.count_nonzero(pred_pos & real_pos))
        fp = int(np.count_nonzero(pred_pos & real_neg))
        fn = int(np.count_nonzero(pred_neg & real_pos))
        return vp, fp, fn

    @staticmethod
    def _razao(numerador, denominador):
        """Divide, returning 0.0 when the denominator is zero."""
        if denominador == 0:
            return 0.0
        return numerador / denominador

In [None]:
def kfolds(x, y, modelo, params:dict,k=10, seed=None):
    """Grid search evaluated with k-fold cross-validation.

    Every combination of the values in ``params`` is evaluated on the same
    ``k`` folds. For each fold a fresh ``modelo(**combination)`` is fitted on
    the other ``k - 1`` folds and scored on the held-out fold with
    ``Metricas.report_geral``.

    Parameters
    ----------
    x : numpy.ndarray, 2-D
        Feature matrix, one sample per row.
    y : numpy.ndarray, 1-D
        Labels; stacked as the last column next to ``x`` so rows stay
        aligned while shuffling.
    modelo : callable
        Estimator class/factory accepting the parameter names in ``params``
        as keyword arguments and exposing ``fit`` and ``predict``.
    params : dict
        Maps parameter name to an iterable of candidate values.
    k : int, default 10
        Number of folds.
    seed : int or None, default None
        Seed for the shuffle. ``None`` gives a different shuffle on every
        call; pass an integer for reproducible folds. The shuffle uses a
        local ``numpy.random.Generator`` rather than NumPy's global state.

    Returns
    -------
    dict
        Keyed by the comma-joined parameter values of each combination. Each
        value is a dict with the keys ``'Acuracia'``, ``'Recall'``,
        ``'Precision'`` and ``'F1-Score'`` holding ``(mean, std)`` over folds.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of samples.
    """
    # hstack builds a new array, so the in-place shuffle never touches x or y.
    dados = np.hstack([x, y.reshape(-1,1)])
    if not 2 <= k <= dados.shape[0]:
        raise ValueError('k must be between 2 and the number of samples')

    np.random.default_rng(seed).shuffle(dados)

    k_partes = np.array_split(dados, k)
    metricas = Metricas()
    nomes_metricas = ('Acuracia', 'Recall', 'Precision', 'F1-Score')

    resultados = {}
    for valores in itertools.product(*params.values()):
        combs = dict(zip(params.keys(), valores))
        pontuacoes = {nome: [] for nome in nomes_metricas}

        for i in range(k):
            teste = k_partes[i]
            treino = np.vstack(k_partes[:i] + k_partes[i+1:])

            x_treino, y_treino = treino[:, :-1], treino[:, -1]
            x_teste, y_teste = teste[:, :-1], teste[:, -1]

            # A new estimator per fold keeps fitted state from leaking between folds.
            estimador = modelo(**combs)
            estimador.fit(x_treino, y_treino)
            y_pred = estimador.predict(x_teste)

            for nome, valor in zip(nomes_metricas, metricas.report_geral(y_pred, y_teste)):
                pontuacoes[nome].append(valor)

        chave = ','.join(str(v) for v in combs.values())
        resultados[chave] = {nome: (np.mean(vals), np.std(vals)) for nome, vals in pontuacoes.items()}

    return resultados

## Implementa distâncias

Euclidiana e Mahalanobis (implementada no código abaixo como `malahanobis`)

In [None]:
def euclidiana(x_multiarray, x_ponto):
    """Euclidean distance from every query point to every reference point.

    Parameters
    ----------
    x_multiarray : iterable of 1-D arrays
        Reference points.
    x_ponto : iterable of 1-D arrays
        Query points.

    Returns
    -------
    list of list
        One inner list per query point, in order; entry ``j`` of inner list
        ``i`` is the distance between ``x_ponto[i]`` and ``x_multiarray[j]``.
    """
    return [
        [np.sqrt(np.sum((consulta - referencia)**2)) for referencia in x_multiarray]
        for consulta in x_ponto
    ]

def malahanobis(x_multiarray, x_ponto, matriz_inversa_cov):
    """Mahalanobis distance from every query point to every reference point.

    Computes ``sqrt(diff @ matriz_inversa_cov @ diff)`` for each pair. The
    square root is what makes this the distance rather than its square.

    Parameters
    ----------
    x_multiarray : iterable of 1-D arrays
        Reference points.
    x_ponto : iterable of 1-D arrays
        Query points.
    matriz_inversa_cov : numpy.ndarray, 2-D
        Inverse (or pseudo-inverse) of the covariance matrix.

    Returns
    -------
    list of list
        One inner list per query point, in order; entry ``j`` of inner list
        ``i`` is the distance between ``x_ponto[i]`` and ``x_multiarray[j]``.
    """
    distancias = []
    for ponto_ref in x_ponto:
        distancia_parcial = []
        for ponto in x_multiarray:
            diff = ponto - ponto_ref
            forma_quadratica = diff @ matriz_inversa_cov @ diff
            # Round-off with a pseudo-inverse can make the form slightly
            # negative; clamp so the square root does not produce NaN.
            distancia_parcial.append(np.sqrt(max(forma_quadratica, 0.0)))
        distancias.append(distancia_parcial)

    return distancias



In [None]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean or Mahalanobis distance.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote for each prediction.
    distancia : str
        ``'euclidiana'`` or ``'malahanobis'``. The value is checked when
        ``predict`` is called.
    """

    def __init__(self, k, distancia):
        self.k = k
        self.distancia = distancia

    def fit(self, x_treino, y_treino):
        """Store the training data and the pseudo-inverse of its covariance matrix."""
        # Arrays are required later: labels are selected with integer index arrays.
        self.x_treino = np.asarray(x_treino)
        self.y_treino = np.asarray(y_treino)
        self.matriz_inversa_cov = self.__matriz_inversa_cov()

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Returns
        -------
        numpy.ndarray
            Float array holding, for each row of ``x``, the most frequent
            label (as chosen by ``moda``) among its ``k`` nearest training
            samples.

        Raises
        ------
        ValueError
            If ``distancia`` is not one of the supported names.
        """
        if self.distancia == 'malahanobis':
            distancias = malahanobis(self.x_treino, x, self.matriz_inversa_cov)
        elif self.distancia == 'euclidiana':
            distancias = euclidiana(self.x_treino, x)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError('Não existe essa distância')

        k_menores_distancias_indexes = [np.argsort(d)[:self.k] for d in distancias]

        return np.array([float(moda(self.y_treino[linha])) for linha in k_menores_distancias_indexes])

    def __matriz_inversa_cov(self):
        """Return the pseudo-inverse of the feature covariance matrix."""
        # np.cov squeezes a single-feature result to a 0-d array, which pinv
        # rejects; atleast_2d restores the (1, 1) matrix shape.
        m_cov = np.atleast_2d(np.cov(self.x_treino, rowvar=False))

        return np.linalg.pinv(m_cov)

# Questão 1

In [None]:
# Load kc2.csv. header=None makes pandas treat the first line as data and
# label the columns with integers instead of names.
kc2 = pd.read_csv('kc2.csv', header=None)
# Preview the first rows.
kc2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,4.0,1.0,1.0,1.0,4.0,8.0,0.67,1.5,5.33,12.0,...,2.0,0.0,0.0,0.0,3.0,1.0,3.0,1.0,1.0,0.0
1,39.0,4.0,1.0,2.0,105.0,520.19,0.07,13.89,37.44,7227.91,...,29.0,1.0,4.0,2.0,12.0,19.0,61.0,44.0,7.0,0.0
2,1.0,1.0,1.0,1.0,6.0,15.51,0.4,2.5,6.2,38.77,...,0.0,0.0,0.0,0.0,5.0,1.0,5.0,1.0,1.0,0.0
3,15.0,1.0,1.0,1.0,55.0,224.81,0.17,5.73,39.25,1287.55,...,12.0,0.0,1.0,0.0,6.0,11.0,34.0,21.0,1.0,0.0
4,12.0,2.0,1.0,2.0,15.0,45.0,0.17,6.0,7.5,270.0,...,8.0,1.0,0.0,0.0,6.0,2.0,11.0,4.0,3.0,0.0


In [None]:
# Features: every column except the last, standardized column by column.
# NOTE(review): the mean/std are computed on the full dataset before the
# train/test split, so test rows influence the scaling (data leakage).
# Consider computing the statistics on the training rows only.
x = kc2.iloc[:, :-1].apply(normaliza).to_numpy()
# Target: the last column.
y = kc2.iloc[:, -1].to_numpy()

In [None]:
# Random hold-out split with the function's default training fraction.
# divide_treino_teste also prepends a column of ones to both x arrays, so
# the models below see one extra constant feature.
x_treino, x_teste, y_treino, y_teste = divide_treino_teste(x, y)

In [None]:
# Hyperparameter grid for KNN: kfolds evaluates every combination
# (2 distances x 2 values of k) using its default number of folds.
parametros = {'distancia': ['euclidiana', 'malahanobis'],
              'k':[1,5]}

# Cross-validate on the training split only and print mean/std per metric.
organiza_texto(kfolds(x_treino, y_treino, KNN, parametros))

------------------------------------------------------------
Parâmetros: euclidiana,1
Acuracia(Media e DP): (np.float64(0.7660130718954248), np.float64(0.07450980392156861))
Recall(Media e DP): (np.float64(0.7866883116883117), np.float64(0.1370486822113567))
Precision(Media e DP): (np.float64(0.76495115995116), np.float64(0.11670256662966157))
F1-Score(Media e DP): (np.float64(0.7651524399233377), np.float64(0.09091958995337954))
------------------------------------------------------------
Parâmetros: euclidiana,5
Acuracia(Media e DP): (np.float64(0.7839869281045753), np.float64(0.10757989261421755))
Recall(Media e DP): (np.float64(0.7909307359307359), np.float64(0.12626356647763726))
Precision(Media e DP): (np.float64(0.7774603174603175), np.float64(0.11722316256299282))
F1-Score(Media e DP): (np.float64(0.7787342972159232), np.float64(0.10817810650992071))
------------------------------------------------------------
Parâmetros: malahanobis,1
Acuracia(Media e DP): (np.float64(0.725490

In [None]:
# Same cross-validation for scikit-learn's decision tree, comparing the two
# split criteria; all other tree settings stay at their defaults.
parametros = {'criterion':['gini', 'entropy']}
organiza_texto(kfolds(x_treino, y_treino, DecisionTreeClassifier, parametros))

------------------------------------------------------------
Parâmetros: gini
Acuracia(Media e DP): (np.float64(0.7013071895424836), np.float64(0.1009185037839825))
Recall(Media e DP): (np.float64(0.6897186147186147), np.float64(0.12270238988228606))
Precision(Media e DP): (np.float64(0.729047619047619), np.float64(0.16402256357143127))
F1-Score(Media e DP): (np.float64(0.6940538104153664), np.float64(0.10134511870828478))
------------------------------------------------------------
Parâmetros: entropy
Acuracia(Media e DP): (np.float64(0.6892156862745098), np.float64(0.09646281652100426))
Recall(Media e DP): (np.float64(0.7015440115440115), np.float64(0.13345856967718758))
Precision(Media e DP): (np.float64(0.6886835386835386), np.float64(0.1446185977384992))
F1-Score(Media e DP): (np.float64(0.6851904516192442), np.float64(0.11652166084166214))


# Testando o modelo com melhor acurácia

In [None]:
# Final model: KNN with k=5 and Euclidean distance.
# NOTE(review): presumably picked for the best mean cross-validation
# accuracy; confirm against the complete cross-validation output.
knn = KNN(k=5, distancia='euclidiana')

# Fit on the whole training split.
knn.fit(x_treino, y_treino)

In [None]:
# Predict labels for the held-out test split.
y_pred = knn.predict(x_teste)

In [None]:
metricas = Metricas()

# report_geral returns [accuracy, recall, precision, f1_score] in this order.
metricas_score = metricas.report_geral(y_pred, y_teste)

# The stray apostrophe that used to follow the recall value has been removed.
print(f"Acuracia: {metricas_score[0]}\nRecall: {metricas_score[1]}\nPrecision: {metricas_score[2]}\nF1-Score: {metricas_score[3]}")

Acuracia: 0.813953488372093
Recall: 0.8947368421052632'
Precision: 0.7391304347826086
F1-Score: 0.8095238095238095
