# Questão 1: Considere o conjunto de dados disponível em kc2.csv, organizado em 22 colunas, sendo as 21 primeiras colunas os atributos e a última coluna a saída. Os 21 atributos são referentes à caracterização de códigos-fontes para processamento de dados na NASA. A saída é a indicação de ausência (0) ou existência (1) de defeitos (os dados foram balanceados via subamostragem). Maiores detalhes sobre os dados podem ser conferidos em https://www.openml.org/search?type=data&sort=runs&id=1063&status=active.

In [71]:
!pip install numpy scikit-learn matplotlib pandas Jinja2

Collecting Jinja2
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting MarkupSafe>=2.0 (from Jinja2)
  Using cached MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Using cached jinja2-3.1.6-py3-none-any.whl (134 kB)
Using cached MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)
Installing collected packages: MarkupSafe, Jinja2
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [Jinja2]━━━━[0m [32m1/2[0m [Jinja2]
[1A[2KSuccessfully installed Jinja2-3.1.6 MarkupSafe-3.0.2


In [73]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [18]:
data = np.genfromtxt('kc2.csv', delimiter=',')
x = data[:,:-1]
y = data[:,-1].astype(int)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [43]:
def kfold(X, y, k, random_state=42):
    # Embaralha os dados
    indices = np.arange(X.shape[0])
    np.random.seed(random_state)
    np.random.shuffle(indices)
    
    # Divide os dados em k folds
    folds = np.array_split(indices, k)
    
    for i in range(k):
        # Cria os conjuntos de treino e teste
        test_indices = folds[i]
        train_indices = np.concatenate(folds[:i] + folds[i+1:])
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        
        yield X_train, X_test, y_train, y_test

In [87]:
def normaliza(X):
    # Calculando a média e a distribuição 
    mean = X.sum(axis = 0) / X.shape[0]
    std = np.sqrt(((X-mean)**2).sum(axis=0)/(X.shape[0]-1))
    std[std == 0] = 1  # Evita divisão por zero
    # Normaliza os dados
    return ((X - mean) / std), mean, std

In [84]:
def inner_loop(x, y, model_class, grid, internal_kfold=10, scale_flag=True, verbose=True, random_state=12345):
    param_names = list(grid.keys())
    grid_search = np.meshgrid(*grid.values())
    grid_search = np.hstack([ np.atleast_2d(g.ravel()).T for g in grid_search ], dtype='object')
    
    for i, params in enumerate(grid_search):
        param_dict = dict(zip(param_names, params))

        if verbose:
            print(f"Testing parameters: {param_dict}")
        
        fold_scores = []
        
        for x_train, x_test, y_train, y_test in kfold(x, y, k=internal_kfold, random_state=random_state):
                        
            if scale_flag:
                x_train, mean, std = normaliza(x_train)
                x_test, mean_test, mean_test = normaliza(x_test)
                model = model_class(**param_dict)
            else:
                model = model_class(**param_dict)
            
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test)
            fold_scores.append(accuracy_score(y_test, y_pred))
        
        yield param_dict, fold_scores

In [85]:
def run_nested_cv(x, y, model_class, grid, external_kfold=5, internal_kfold=5, scale_flag=True, verbose=True, random_state=12345):
    metrics = {'acc': [], 'rec': [], 'prec': [], 'f1': []}

    best_model = None
    best_score = -np.inf
    
    for x_train, x_test, y_train, y_test in kfold(x, y, k=external_kfold, random_state=random_state):

        for params, fold_scores in inner_loop(x_train, y_train, model_class, grid, internal_kfold, scale_flag, verbose, random_state):
            mean_score = np.mean(fold_scores)
            std_score = np.std(fold_scores)
            if verbose:
                print(f"Mean accuracy for parameters {params}: {mean_score:.4f}")
                print(f"Standard deviation: {std_score:.4f}")

            if mean_score > best_score:
                best_score = mean_score
                best_model = model_class(**params)
                
                best_model.fit(x_train, y_train)

        y_pred = best_model.predict(x_test)
        metrics['acc'].append(accuracy_score(y_test, y_pred))
        metrics['rec'].append(recall_score(y_test, y_pred, average='macro'))
        metrics['prec'].append(precision_score(y_test, y_pred, average='macro'))
        metrics['f1'].append(f1_score(y_test, y_pred, average='macro'))

                           
    return metrics, best_model

## Questão 1 a) Considerando uma validação cruzada em 10 folds, avalie modelos de classificação binária nos dados em questão. Para tanto, use as abordagens abaixo

### – KNN (escolha k = 1 e k = 5, distância Euclidiana e Mahalonobis, totalizando 4 combinações);
### – Árvore de decisão (você pode usar uma implementação já existente, como a do scikit-learn, com índices de impureza de gini e entropia)

In [20]:
# Função para calcular distância Euclidiana
def euclidean_distance(p1, p2):
    soma = 0
    for i in range(len(p1)):
        soma += (p1[i] - p2[i]) ** 2
    return soma ** 0.5

In [37]:
# Função para calcular a distância de Mahalonobis
def mahalanobis_distance(p1, p2, cov_inv):
    diff = p1 - p2
    return np.sqrt(np.dot(np.dot(diff, cov_inv), diff.T))

In [36]:
# Função para encontrar os K vizinhos mais próximos
def get_neighbors(train_data, train_labels, test_point, k, metric='euclidean'):
    distancias = []
    for i in range(len(train_data)):
        if metric == 'mahalanobis':
            cov_inv = np.linalg.inv(np.cov(train_data.T))
            distancia = mahalanobis_distance(test_point, train_data[i], cov_inv)
        elif metric == 'euclidean':
            distancia = euclidean_distance(test_point, train_data[i])
        distancias.append((distancia, train_labels[i]))
    
    distancias.sort()  # Ordena pela menor distância
    vizinhos = distancias[:k]
    return [label for (_, label) in vizinhos]

In [35]:
# Função para determinar a classe mais comum entre os vizinhos
def vote(vizinhos):
    contagem = {}
    for label in vizinhos:
        contagem[label] = contagem.get(label, 0) + 1
    return max(contagem, key=contagem.get)

In [53]:
# Função principal do KNN
class KNN:
    def __init__(self, k=2, metric='euclidean'):
        self.k = k
        self.metric = metric

    def fit(self, X, y):
        self.train_data = X
        self.train_labels = y

    def predict(self, X):
        return knn_predict(self.train_data, self.train_labels, X, self.k, self.metric)
    
def knn_predict(train_data, train_labels, test_data, k=2, metric='euclidean'):
    predicoes = []
    for ponto in test_data:
        vizinhos = get_neighbors(train_data, train_labels, ponto, k,metric=metric)
        classe = vote(vizinhos)
        predicoes.append(classe)
    return predicoes

In [88]:
# Executando o KNN com os dados de treino e teste
external_kfold = 10
internal_kfold = 10

methods_summary = {'KNN': {'class': KNN, 'scale': True},
                   'DT' : {'class': DecisionTreeClassifier, 'scale': False}}

# KNN
methods_summary['KNN']['grid'] = {'k': [1, 5],         # k - número de vizinhos
                                  'metric': ['euclidean','mahalanobis']}        # metric            
# Decision Tree
methods_summary['DT']['grid'] = {'criterion': ['gini', 'entropy'],                # criterion
                                 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, None]} # max_depth

trained_models = {}
for method, info in methods_summary.items():
    print(f"\n[{method}] Running nested K-fold...")
    metrics, best_model = run_nested_cv(x=x, y=y, model_class=info['class'],
                                        grid=info['grid'], scale_flag=info['scale'],
                                        external_kfold=external_kfold, internal_kfold=internal_kfold, verbose=False, random_state=42)
    trained_models[method] = {'metrics': metrics, 'model': best_model}

# Results
results = { method : info['metrics'] for method, info in trained_models.items() }


[KNN] Running nested K-fold...

[DT] Running nested K-fold...


## Questão 1 b) Para cada modelo criado, reporte valor médio e desvio padrão das métricas de acurácia, revocação, precisão e F1-score.

In [90]:
table = pd.DataFrame(results).T.map(lambda x: f"{np.mean(x):.2%} +- {1.96*np.std(x)/np.sqrt(len(x)):.2%}")
table.columns = ['Accuracy', 'Recall', 'Precision', 'F1-score']
table.index = results.keys()
def extract_from_text(text):
    return float(text.split('%')[0])
table.style.apply(lambda col: [ 'font-weight:bold; color:red' if extract_from_text(x)==col.apply(extract_from_text).max() else '' for x in col ])

Unnamed: 0,Accuracy,Recall,Precision,F1-score
KNN,80.45% +- 8.06%,80.77% +- 7.98%,80.73% +- 7.94%,80.24% +- 8.01%
DT,81.36% +- 6.91%,81.44% +- 7.00%,81.54% +- 6.90%,81.03% +- 6.92%
