# Introdução ao Reconhecimento de Padrões, 2020.2, UFC/DETI
## Trabalho 1

Aluno : Thyago Freitas da Silva <br>
Matrícula : 392035

In [1]:
import numpy as np
import operator
import collections
import pandas
from random import randrange
from sklearn.metrics import confusion_matrix as cm
#from sklearn.datasets import load_iris
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score

### Implementacão do classificador "K Vizinhos mais Proximos (KNN)"

In [2]:
norm_p = lambda x,y,p : abs((x-y))**p
euclidian = lambda list1,list2: np.sqrt(sum(map(norm_p,list1,list2,[2]*len(list1))))
manhatan = lambda list1,list2: sum(map(norm_p,list1,list2,[1]*len(list1)))

metrics = {
    "euclidian" : euclidian,
    "manhatan" : manhatan
}

class KNNClassifier:
    def __init__(self, metric="euclidian", n_neighbors=3):
        if metric not in metrics:
            message = "invalid metric. the acceptable values are :"
            for k in metrics.keys():
                message += " " + k
            raise Exception(message)
        self.n_neighbors = n_neighbors
        self.metric_name = metric
        self.metric_func = metrics[metric]
    def fit(self, x_train,y_train):
        if len(x_train) != len(y_train):
            raise Exception("the size of inputs must be equals")
        self.x_train = x_train
        self.y_train = y_train
    def predict(self,x_test):
        distances = []
        result = []
        for test in x_test:
            for index in range(len(self.x_train)):
                distance = self.metric_func(self.x_train[index],test)
                distances.append((self.y_train[index], distance))
            distances = sorted(distances, key = lambda tup : tup[1])
            classes = collections.Counter(map(lambda x : x[0], distances[:self.n_neighbors]))
            clas = classes.most_common(1)
            result.append(clas[0][0])
            distances.clear()
        return result

### Leitura da base "demartology"

In [3]:
data = pandas.read_csv("./data/dermatology.csv", header=None)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 35 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       366 non-null    int64 
 1   1       366 non-null    int64 
 2   2       366 non-null    int64 
 3   3       366 non-null    int64 
 4   4       366 non-null    int64 
 5   5       366 non-null    int64 
 6   6       366 non-null    int64 
 7   7       366 non-null    int64 
 8   8       366 non-null    int64 
 9   9       366 non-null    int64 
 10  10      366 non-null    int64 
 11  11      366 non-null    int64 
 12  12      366 non-null    int64 
 13  13      366 non-null    int64 
 14  14      366 non-null    int64 
 15  15      366 non-null    int64 
 16  16      366 non-null    int64 
 17  17      366 non-null    int64 
 18  18      366 non-null    int64 
 19  19      366 non-null    int64 
 20  20      366 non-null    int64 
 21  21      366 non-null    int64 
 22  22      366 non-null    in

#### Pre-processamento

Olhando o conteúdo do arquivo, foi notado que algumas linhas da coluna 33 (band-like infiltrate) apresentam valores marcados com "?" que atrapalham o resultado do algoritmo. Para remediar esse problema , decidi trocar os valores faltantes demarcados por "?" pela mediana da coluna 33, pois tal medida é menos sensível a outliers se comparado com a média, por exemplo.

In [4]:
def calc_median(array):
    values = []
    for v in array:
        if v != '?':
            v = int(v)
            values.append(v)
    return np.median(values)

median = calc_median(data.iloc[:,33])
data.iloc[:,33] = list(map(lambda value: median if value == '?' else value, data.iloc[:,33]))
data.iloc[:,33] = data.iloc[:,33].astype(np.int64)

#### Separar dados em atributos e classes de saída

In [5]:
atribbutes = data.iloc[:,:34]
target = data.iloc[:,34]

#### Separar em bases de teste e treino

In [10]:
def train_test_split(data,target,size=0.3):
    test_size = int(len(target)*size)
    numbers = []
    x_train,y_train,x_test,y_test = [],[],[],[]
    while len(numbers) != test_size:
        v = randrange(len(target))
        if v not in numbers:
            numbers.append(v)
            x_test.append(data.iloc[v,:].values)
            y_test.append(target[v])
    for i in range(len(data)):
        if i not in numbers:
            x_train.append(data.iloc[i,:].values)
            y_train.append(target[i])
    return x_train,y_train,x_test,y_test

def accuracy_score(prediction,real_values):
    size = len(real_values)
    corrects = 0
    for index in range(size):
        if prediction[index] == real_values[index]:
            corrects += 1
    return corrects/size

def accuracy_score_per_class(prediction,real_values,classe):
    size = len(real_values)
    corrects = 0
    total = 0
    for index in range(size):
        if real_values[index] == classe:
            total += 1
        if prediction[index] == real_values[index] == classe:
            corrects += 1
    return corrects/total

def confusion_matrix(predict,real_values):
    classes = set(real_values)
    n_predicts = len(predict)
    n_classes = len(classes)
    confusion_m = np.zeros((n_classes,n_classes))
    for cl in classes:
        for index in range(n_predicts):
            if real_values[index] == cl:
                confusion_m[predict[index]-1,real_values[index]-1] += 1
    return confusion_m

#### Taxa média de acerto para 100 rodadas de treinamento/teste.

In [7]:
results = []
size_tests = 100
KNN = KNNClassifier('euclidian',3)
for index in range(size_tests):
    x_train,y_train,x_test,y_test = train_test_split(atribbutes,target,0.3)
    KNN.fit(x_train,y_train)
    predictions = KNN.predict(x_test)
    results.append((predictions,y_test))

In [11]:
accuracies = []
for r in results:
    accuracies.append(accuracy_score(r[0],r[1]))
print(np.mean(accuracies)*100)

87.38532110091742


#### Taxas médias de acerto por classe.

In [12]:
classes = set(target)
for c in classes:
    score = []
    for r in results:
        score.append(accuracy_score_per_class(predictions,y_test,c))
    print("{0} : {1}%".format(c,np.mean(score)*100))
    score.clear()

1 : 97.29729729729729%
2 : 71.4285714285714%
3 : 100.0%
4 : 75.0%
5 : 77.77777777777776%
6 : 100.0%


In [9]:
acc = accuracy_score(results[0][0],results[0][1])
higher_value = acc
lower_value = acc
lower = results[0]
higher = results[0]
for index in range(1,len(results)):
    accuracy = accuracy_score(results[index][0],results[index][1])
    if accuracy > higher_value:
        higher = results[index]
        higher_value = accuracy
    if accuracy < lower_value:
        lower = results[index]
        lower_value = accuracy

print("Accuracy Higher: {:.2f}%".format(accuracy_score(higher[0],higher[1])*100))
print(confusion_matrix(higher[0],higher[1]))
print("Accurac Lower: {:.2f}%".format(accuracy_score(lower[0],lower[1])*100))
print(confusion_matrix(lower[0],lower[1]))

Accuracy Higher: 93.58%
[[34.  0.  0.  0.  0.  0.]
 [ 0. 14.  0.  1.  2.  0.]
 [ 0.  0. 26.  0.  0.  0.]
 [ 0.  1.  0. 13.  1.  0.]
 [ 0.  0.  0.  2. 12.  0.]
 [ 0.  0.  0.  0.  0.  3.]]
Accurac Lower: 79.82%
[[30.  0.  0.  0.  0.  0.]
 [ 0.  9.  0.  2.  0.  0.]
 [ 0.  0. 31.  0.  0.  0.]
 [ 0. 14.  0.  8.  2.  0.]
 [ 0.  0.  0.  0.  6.  0.]
 [ 2.  2.  0.  0.  0.  3.]]
