In [115]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [116]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The result is the square root of the sum of the squared element-wise
    differences, computed with ``np.sum`` and ``np.sqrt``.
    """
    difference = x1 - x2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

# 1. Implementar classe KNN
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Prediction compares every query row with every stored training row, keeps
    the ``k`` closest ones and returns the label that occurs most often among
    them.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        """Store the training data; all computation happens in ``predict``.

        Parameters
        ----------
        X : 2-D array-like or DataFrame
            Training features, one row per sample.
        y : 1-D array-like or Series
            Training labels, one per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : 2-D array-like or DataFrame
            Query features with the same columns as the training data.
            Features must be numeric; NaN values are not handled and
            propagate into the distances.

        Returns
        -------
        list
            One predicted label per row of ``X``.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called (no training data is stored).
        ValueError
            If ``k`` is smaller than 1, the inputs are not 2-D, the feature
            counts differ, or the training set is empty.
        """
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        # When both sides are DataFrames, match the query columns to the
        # training columns by label so that positional arithmetic below
        # compares like with like.
        if hasattr(X, "columns") and hasattr(self.X_train, "columns"):
            X = X[list(self.X_train.columns)]

        train = np.asarray(self.X_train, dtype=float)
        labels = np.asarray(self.y_train)
        queries = np.asarray(X, dtype=float)

        if train.ndim != 2 or queries.ndim != 2:
            raise ValueError("training data and X must both be 2-D (samples x features)")
        if queries.shape[1] != train.shape[1]:
            raise ValueError(
                f"X has {queries.shape[1]} features but the training data has {train.shape[1]}"
            )
        if queries.shape[0] > 0 and train.shape[0] == 0:
            raise ValueError("cannot predict: the training set is empty")

        y_pred = []
        for row in queries:
            # Distances from this query to every training row in one
            # vectorised step instead of a Python loop over rows.
            dists = np.sqrt(((train - row) ** 2).sum(axis=1))
            # Sort (distance, label) pairs: equal distances fall back to
            # label order.
            ranked = sorted(zip(dists, labels))
            k_nearest_labels = [label for _, label in ranked[:self.k]]
            # dict.fromkeys keeps nearest-first order and max() returns the
            # first maximum, so vote ties go to the label whose neighbour is
            # closest instead of depending on set/hash iteration order.
            most_common = max(dict.fromkeys(k_nearest_labels), key=k_nearest_labels.count)
            y_pred.append(most_common)
        return y_pred

In [117]:
# 2. Load the "Credit Approval" dataset (the file is read without a header row,
#    so columns are numbered 0..15)
df = pd.read_csv("../data/raw/crx.data", header=None)
# Per-column count of cells equal to the "?" marker
df.eq("?").sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [118]:
# 3. Drop the rows that contain missing values
# Turn the "?" markers counted above into NaN so that dropna() can remove
# the affected rows; chaining avoids mutating the frame in place.
df = df.replace('?', np.nan).dropna()
# Count the remaining NaN per column. Searching for "?" again would always
# give zero after the replace, so it could not confirm the rows were dropped.
df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [119]:
# 4. Convert the categorical attributes into dummy (one-hot) variables
categorical_features = [0, 3, 4, 5, 6, 8, 9, 11, 12]
transformer = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
    # Always return a dense array. The passthrough columns are strings
    # (including the class label in the last column), which cannot be stacked
    # into a sparse matrix, and the next cell slices the result like a NumPy
    # array. Without this the dense/sparse choice depends on the data density.
    sparse_threshold=0,
)
# Every column is cast to str before encoding; the passthrough columns are
# appended after the encoded ones, so the class label stays last.
df = transformer.fit_transform(df.astype(str))

In [120]:
# 5. Standardise the feature columns; the last column of the array is the
#    class label and is carried over unchanged.
# NOTE(review): the scaler is fitted on the whole dataset before the
# train/test split that follows, so test-set statistics leak into the
# features — consider fitting it on the training rows only.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df[:, :-1]))
scaled_df.columns = range(scaled_df.shape[1])
scaled_df['class'] = df[:, -1]
df = scaled_df

In [121]:
# 6. Split the data into training and test sets (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='class'),  # every column except the label
    df['class'],
    test_size=0.3,
    random_state=42,
)

In [122]:
# 7. Run the KNN classifier on the test set and report its accuracy
knn = KNN(k=3)
knn.fit(X=X_train, y=y_train)
y_pred = knn.predict(X=X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
# The message reports the accuracy as a percentage rounded to two decimals.
print(f"A acurácia do modelo KNN é de {round(acc*100,2)}%.")

A acurácia do modelo KNN é de 83.67%.
