# Clasificador de mínima distancia y 1NN

In [1]:
# Importar las librerias
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
def one_nn_classifier(X_train, y_train, X_test):
    """Classify each test sample with the label of its nearest training sample.

    Distances are Euclidean and computed in pure Python, so any sequences of
    numeric sequences are accepted.

    Args:
        X_train: Training samples (iterable of feature sequences).
        y_train: Labels, indexable by the position of each training sample.
        X_test: Samples to classify (iterable of feature sequences).

    Returns:
        A list with one predicted label per test sample. A prediction is
        ``None`` when no training sample lies at a finite distance (for
        example, when ``X_train`` is empty).
    """
    def _distance(a, b):
        # Square root of the summed squared per-feature differences.
        return sum((u - v) ** 2 for u, v in zip(a, b)) ** 0.5

    def _nearest_label(sample):
        best_distance = float('inf')
        best_label = None
        for position, neighbour in enumerate(X_train):
            candidate = _distance(sample, neighbour)
            # Strict comparison: on ties the earliest training sample wins.
            if candidate < best_distance:
                best_distance, best_label = candidate, y_train[position]
        return best_label

    return [_nearest_label(sample) for sample in X_test]
    

In [3]:
def min_distance_classifier(X_train, y_train, X_test):
    """Classify each test sample by its nearest class centroid.

    The centroid of a class is the per-feature mean of its training samples;
    a test sample receives the label whose centroid is closest in Euclidean
    distance.

    Args:
        X_train: Training samples, array-like of shape (n_samples, n_features).
        y_train: Training labels, array-like of length n_samples.
        X_test: Samples to classify, array-like of shape (n_test, n_features).

    Returns:
        numpy.ndarray with one predicted label per test sample.
    """
    # Coerce the inputs so plain Python lists work too: the boolean-mask
    # indexing below requires NumPy arrays. This is a no-op for arrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    # One centroid per class, keyed by label (np.unique yields sorted labels).
    centroids = {
        label: X_train[y_train == label].mean(axis=0)
        for label in np.unique(y_train)
    }

    y_pred = []
    for x in X_test:
        distances = {
            label: np.linalg.norm(x - centroid)
            for label, centroid in centroids.items()
        }
        # On ties, min() keeps the first label in dictionary (sorted) order.
        y_pred.append(min(distances, key=distances.get))
    return np.array(y_pred)

In [4]:
def hold_out_validation(data, r=0.3, seed=None):
    """Run one hold-out split and classify the test part with both classifiers.

    The rows are shuffled uniformly at random (the split is not stratified by
    class); the last ``int(n * r)`` shuffled rows form the test set and the
    rest form the training set.

    Args:
        data: DataFrame whose last column is the label and whose remaining
            columns are the features.
        r: Fraction of rows reserved for testing, between 0 and 1.
        seed: Optional seed for a private random generator, making the split
            reproducible. With ``None`` the module-level ``random`` state is
            used.

    Returns:
        Tuple ``(y_test, y_pred_min_dist, y_pred_1nn)``.

    Raises:
        ValueError: If ``r`` is outside ``[0, 1]``.
    """
    # Outside this range the slicing below would silently build a wrong split
    # (a negative slice bound when the test size exceeds the row count).
    if not 0 <= r <= 1:
        raise ValueError(f"r must be between 0 and 1, got {r!r}")

    # Separate features and labels
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values

    # Determine the test size and shuffle the row positions
    n = len(X)
    test_size = int(n * r)
    indices = list(range(n))
    if seed is None:
        random.shuffle(indices)
    else:
        random.Random(seed).shuffle(indices)

    # Build the training and test subsets
    split_at = n - test_size
    train_indices, test_indices = indices[:split_at], indices[split_at:]
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    # Predict with the minimum-distance and 1NN classifiers
    y_pred_min_dist = min_distance_classifier(X_train, y_train, X_test)
    y_pred_1nn = one_nn_classifier(X_train, y_train, X_test)

    return y_test, y_pred_min_dist, y_pred_1nn

In [5]:
def k_fold_validation(data, k=10):
    """Run k-fold cross-validation with both classifiers.

    The rows are cut, in their existing order, into ``k`` contiguous folds of
    near-equal size (no shuffling or stratification is applied). Each fold is
    used once as the test set while the remaining folds form the training set.

    Args:
        data: DataFrame whose last column is the label and whose remaining
            columns are the features.
        k: Number of folds.

    Returns:
        Tuple ``(y_tests, y_preds_min_dist, y_preds_1nn)`` of lists holding
        the true labels and both classifiers' predictions, accumulated over
        all folds.
    """
    # Separate features and labels once; folds are taken by row position.
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    n_samples = len(data)

    # Split row positions instead of the DataFrame itself: np.array_split on
    # a DataFrame goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate (FutureWarning). The fold membership is unchanged.
    fold_positions = np.array_split(np.arange(n_samples), k)

    # Lists accumulating the results across folds
    y_tests = []
    y_preds_1nn = []
    y_preds_min_dist = []

    for test_positions in fold_positions:
        # Every row outside the current fold is used for training.
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[test_positions] = False

        X_train, y_train = X[train_mask], y[train_mask]
        X_test, y_test = X[test_positions], y[test_positions]

        # Predict with the minimum-distance and 1NN classifiers
        y_pred_min_dist = min_distance_classifier(X_train, y_train, X_test)
        y_pred_1nn = one_nn_classifier(X_train, y_train, X_test)

        # Accumulate true labels and predictions
        y_tests.extend(y_test)
        y_preds_min_dist.extend(y_pred_min_dist)
        y_preds_1nn.extend(y_pred_1nn)

    return y_tests, y_preds_min_dist, y_preds_1nn

In [6]:
def split_data(data, iteracion):
    """Split a dataset into one test row and the remaining training rows.

    Both parts are selected by position, so the split is correct whatever
    the DataFrame's index labels are.

    Args:
        data: DataFrame to split.
        iteracion: Integer position of the row to hold out for testing.

    Returns:
        Tuple ``(train_data, test_data)``: every row except the held-out one,
        and a one-row DataFrame with the held-out row.
    """
    # Boolean mask over row positions: True for rows kept for training.
    keep = np.ones(len(data), dtype=bool)
    keep[iteracion] = False

    test_data = data.iloc[[iteracion]]  # double brackets keep a DataFrame
    train_data = data.iloc[keep]
    return train_data, test_data
    
def leave_one_out_validation(data):
    """Run leave-one-out validation with both classifiers.

    Each row is held out once as a single-sample test set while all other
    rows are used for training.

    Args:
        data: DataFrame whose last column is the label and whose remaining
            columns are the features.

    Returns:
        Tuple ``(y_tests, y_preds_min_dist, y_preds_1nn)`` of lists holding
        the true labels and both classifiers' predictions, one entry per row.
    """
    # Extract the arrays once and slice them by position inside the loop.
    # This avoids building a new DataFrame per iteration and does not depend
    # on the DataFrame's index labels.
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    n_samples = len(data)

    y_tests = []
    y_preds_1nn = []
    y_preds_min_dist = []

    keep = np.ones(n_samples, dtype=bool)
    for i in range(n_samples):
        # Hold out row i for testing; train on everything else.
        keep[i] = False
        X_train, y_train = X[keep], y[keep]
        X_test, y_test = X[i:i + 1], y[i:i + 1]
        keep[i] = True  # restore the mask for the next iteration

        # Predict with the minimum-distance and 1NN classifiers
        y_pred_min_dist = min_distance_classifier(X_train, y_train, X_test)
        y_pred_1nn = one_nn_classifier(X_train, y_train, X_test)

        y_tests.extend(y_test)
        y_preds_min_dist.extend(y_pred_min_dist)
        y_preds_1nn.extend(y_pred_1nn)

    return y_tests, y_preds_min_dist, y_preds_1nn

In [7]:
def evaluate_performance(y_true, y_pred):
    """Print the accuracy and the confusion matrix of a set of predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Compute both metrics before printing anything.
    score, matrix = accuracy_score(y_true, y_pred), confusion_matrix(y_true, y_pred)

    print("Accuracy:", score)
    print("Matriz de Confusión:\n", matrix)

In [8]:
# Load each dataset and evaluate both classifiers under every validation scheme
datasets = ['raisin.csv', 'glass.csv', 'iris.csv']

# One entry per validation scheme:
# (validation function, heading for min-distance results, heading for 1NN results)
# NOTE(review): the headings say "Estratificado", but the validation helpers
# in this file appear to use a plain shuffle / contiguous folds - confirm
# whether stratified splits were intended.
validation_schemes = [
    (
        hold_out_validation,
        "\n---- MIN-DISTANCIA ----\nHold-Out (70/30 Estratificado): ",
        "\n---- 1NN ----\nHold-Out (70/30 Estratificado):",
    ),
    (
        k_fold_validation,
        "\n---- MIN-DISTANCIA ----\n10-Fold Cross-Validation Estratificado:",
        "\n---- 1NN ----\n10-Fold Cross-Validation Estratificado:",
    ),
    (
        leave_one_out_validation,
        "\n---- MIN-DISTANCIA ----\nLeave-One-Out:",
        "\n---- 1NN ----\nLeave-One-Out:",
    ),
]

for dataset in datasets:
    data = pd.read_csv(dataset)

    print("*******************************************************************************************")
    print(f"\n--- Validación en {dataset} ---")

    for validate, min_dist_heading, one_nn_heading in validation_schemes:
        # Each scheme returns the true labels plus both classifiers' predictions.
        y_test, y_preds_min_dist, y_preds_1nn = validate(data)

        print(min_dist_heading)
        evaluate_performance(y_test, y_preds_min_dist)
        print(one_nn_heading)
        evaluate_performance(y_test, y_preds_1nn)

*******************************************************************************************

--- Validación en raisin.csv ---

---- MIN-DISTANCIA ----
Hold-Out (70/30 Estratificado): 
Accuracy: 0.837037037037037
Matriz de Confusión:
 [[107  33]
 [ 11 119]]

---- 1NN ----
Hold-Out (70/30 Estratificado):
Accuracy: 0.7851851851851852
Matriz de Confusión:
 [[112  28]
 [ 30 100]]


  return bound(*args, **kwds)



---- MIN-DISTANCIA ----
10-Fold Cross-Validation Estratificado:
Accuracy: 0.8077777777777778
Matriz de Confusión:
 [[311 139]
 [ 34 416]]

---- 1NN ----
10-Fold Cross-Validation Estratificado:
Accuracy: 0.7577777777777778
Matriz de Confusión:
 [[346 104]
 [114 336]]

---- MIN-DISTANCIA ----
Leave-One-Out:
Accuracy: 0.8088888888888889
Matriz de Confusión:
 [[311 139]
 [ 33 417]]

---- 1NN ----
Leave-One-Out:
Accuracy: 0.7833333333333333
Matriz de Confusión:
 [[352  98]
 [ 97 353]]
*******************************************************************************************

--- Validación en glass.csv ---

---- MIN-DISTANCIA ----
Hold-Out (70/30 Estratificado): 
Accuracy: 0.9206349206349206
Matriz de Confusión:
 [[17  0  0  0  0  0]
 [ 0 23  5  0  0  0]
 [ 0  0  6  0  0  0]
 [ 0  0  0  3  0  0]
 [ 0  0  0  0  1  0]
 [ 0  0  0  0  0  8]]

---- 1NN ----
Hold-Out (70/30 Estratificado):
Accuracy: 1.0
Matriz de Confusión:
 [[17  0  0  0  0  0]
 [ 0 28  0  0  0  0]
 [ 0  0  6  0  0  0]
 [ 0  0

  return bound(*args, **kwds)



---- MIN-DISTANCIA ----
10-Fold Cross-Validation Estratificado:
Accuracy: 0.8450704225352113
Matriz de Confusión:
 [[69  0  0  0  0  0]
 [ 6 53 17  0  0  0]
 [ 0  0 16  1  0  0]
 [ 0  0  0 13  0  0]
 [ 0  0  0  9  0  0]
 [ 0  0  0  0  0 29]]

---- 1NN ----
10-Fold Cross-Validation Estratificado:
Accuracy: 0.8967136150234741
Matriz de Confusión:
 [[69  0  0  0  0  0]
 [ 7 63  6  0  0  0]
 [ 0  0 17  0  0  0]
 [ 0  0  0 13  0  0]
 [ 0  0  0  6  0  3]
 [ 0  0  0  0  0 29]]

---- MIN-DISTANCIA ----
Leave-One-Out:
Accuracy: 0.8873239436619719
Matriz de Confusión:
 [[69  0  0  0  0  0]
 [ 2 59 15  0  0  0]
 [ 0  0 16  1  0  0]
 [ 0  0  0 12  1  0]
 [ 0  0  0  0  9  0]
 [ 0  0  0  0  5 24]]

---- 1NN ----
Leave-One-Out:
Accuracy: 0.9906103286384976
Matriz de Confusión:
 [[69  0  0  0  0  0]
 [ 0 76  0  0  0  0]
 [ 0  0 17  0  0  0]
 [ 0  0  1 11  1  0]
 [ 0  0  0  0  9  0]
 [ 0  0  0  0  0 29]]
*******************************************************************************************

--- V

  return bound(*args, **kwds)



---- MIN-DISTANCIA ----
Leave-One-Out:
Accuracy: 0.9194630872483222
Matriz de Confusión:
 [[49  0  0]
 [ 0 45  5]
 [ 0  7 43]]

---- 1NN ----
Leave-One-Out:
Accuracy: 0.959731543624161
Matriz de Confusión:
 [[49  0  0]
 [ 0 47  3]
 [ 0  3 47]]
