# Clasificador KNN

In [1]:
# Importar las librerías
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score, confusion_matrix
from collections import Counter

In [2]:
# Definir el clasificador KNN
def knn_classifier(X_train, y_train, X_test, k):
    """Predict a label for every test sample with a k-nearest-neighbours vote.

    For each sample in ``X_test`` the Euclidean distance to every training
    sample is computed, the ``k`` closest training samples are selected and
    the most frequent label among them is returned. When several labels are
    tied, the tied label that appears first among the neighbours ordered by
    distance wins (``sorted`` is stable and ``Counter.most_common`` keeps
    first-seen order among equal counts).

    Args:
        X_train: Sequence of training feature vectors.
        y_train: Sequence of training labels, aligned with ``X_train``.
        X_test: Sequence of feature vectors to classify.
        k: Number of neighbours that vote; must be at least 1. If ``k``
            exceeds the number of training samples, all of them vote.

    Returns:
        list: One predicted label per sample in ``X_test``, in order.

    Raises:
        ValueError: If ``k`` is smaller than 1, if the training set is empty,
            or if ``X_train`` and ``y_train`` differ in length.
    """
    # A non-positive k would either fail with an opaque IndexError (k == 0) or
    # silently vote with the wrong neighbours (negative k slices from the end).
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if len(X_train) != len(y_train):
        raise ValueError(
            f"X_train and y_train differ in length: {len(X_train)} != {len(y_train)}"
        )
    if len(X_train) == 0:
        raise ValueError("the training set is empty")

    def euclidean_distance(x1, x2):
        # Square root of the summed squared coordinate differences.
        return sum((xi - yi) ** 2 for xi, yi in zip(x1, x2)) ** 0.5

    y_pred = []  # Predicted label for each test sample, in input order

    for x_test in X_test:
        # Distance from this test sample to every training sample, paired with its label
        distances = [
            (euclidean_distance(x_test, x_train), label)
            for x_train, label in zip(X_train, y_train)
        ]
        # Sort by distance only (never by label) and keep the k closest
        k_nearest_neighbors = sorted(distances, key=lambda pair: pair[0])[:k]
        # Majority vote over the neighbours' labels
        k_nearest_labels = [label for _, label in k_nearest_neighbors]
        most_common_label = Counter(k_nearest_labels).most_common(1)[0][0]
        y_pred.append(most_common_label)

    return y_pred

In [3]:
# Funcion de validacion Hold-Out
def hold_out_validation(data, k, r=0.3, seed=None):
    """Evaluate the KNN classifier on a single random train/test split.

    The last column of ``data`` is used as the label and all other columns as
    features. Row positions are shuffled uniformly at random and the final
    ``int(len(data) * r)`` shuffled positions form the test set. The split is
    not stratified: class proportions are not balanced between the two sets.

    Args:
        data: DataFrame with the features in every column but the last and
            the label in the last column.
        k: Number of neighbours passed to ``knn_classifier``.
        r: Fraction of the rows reserved for testing (default 0.3).
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` (default) the global ``random`` state
            is used, exactly as before this parameter existed.

    Returns:
        tuple: ``(y_test, y_pred)`` with the true test labels (NumPy array)
        and the predicted labels (list), in the same order.
    """
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values

    n = len(X)
    test_size = int(n * r)
    indices = list(range(n))

    # A dedicated generator keeps a seeded split independent of whatever else
    # has consumed the global random state; without a seed, fall back to it.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(indices)

    train_indices = indices[:n - test_size]
    test_indices = indices[n - test_size:]
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    # Predict the held-out rows with KNN
    y_pred_knn = knn_classifier(X_train, y_train, X_test, k=k)

    return y_test, y_pred_knn

In [4]:
# Funcion de validacion K fold cross validation
def k_fold_validation(data, k, num_folds=10):
    """Evaluate the KNN classifier with k-fold cross-validation.

    The last column of ``data`` is used as the label and all other columns as
    features. Rows are partitioned, in their existing order, into
    ``num_folds`` contiguous folds of near-equal size; each fold is used once
    as the test set while the remaining folds form the training set. Rows are
    neither shuffled nor stratified by class.

    Args:
        data: DataFrame with the features in every column but the last and
            the label in the last column.
        k: Number of neighbours passed to ``knn_classifier``.
        num_folds: Number of folds (default 10); must be at least 2.

    Returns:
        tuple: ``(y_tests, y_preds)`` lists with the true and predicted labels
        of every row, concatenated fold by fold.

    Raises:
        ValueError: If ``num_folds`` is smaller than 2.
    """
    if num_folds < 2:
        raise ValueError(f"num_folds must be at least 2, got {num_folds}")

    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values

    # Split row positions instead of the DataFrame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes and emits a
    # FutureWarning on recent pandas. The resulting folds are the same.
    fold_positions = np.array_split(np.arange(len(data)), num_folds)

    y_tests = []
    y_preds_knn = []

    for i in range(num_folds):
        test_positions = fold_positions[i]
        train_positions = np.concatenate(
            [fold_positions[j] for j in range(num_folds) if j != i]
        )

        X_train, y_train = X[train_positions], y[train_positions]
        X_test, y_test = X[test_positions], y[test_positions]

        y_pred_knn = knn_classifier(X_train, y_train, X_test, k=k)

        y_tests.extend(y_test)
        y_preds_knn.extend(y_pred_knn)

    return y_tests, y_preds_knn

In [5]:
def split_data(data, iteracion):
    """Split ``data`` into one held-out row and all remaining rows.

    Both parts are selected by row position, so the result does not depend on
    the DataFrame's index labels (non-default or duplicated indexes are fine).

    Args:
        data: DataFrame to split.
        iteracion: Position of the row to hold out for testing.

    Returns:
        tuple: ``(train_data, test_data)`` where ``test_data`` is a one-row
        DataFrame and ``train_data`` holds every other row in original order.

    Raises:
        IndexError: If ``iteracion`` is outside the range of row positions.
    """
    test_data = data.iloc[[iteracion]]  # The single row used for testing
    # Remove the same *position* from the list of positions; dropping by label
    # would hit the wrong rows whenever the index is not 0..n-1 and unique.
    train_positions = np.delete(np.arange(len(data)), iteracion)
    train_data = data.iloc[train_positions]
    return train_data, test_data

# Funcion de validacion leave one out
def leave_one_out_validation(data, k):
    """Evaluate the KNN classifier with leave-one-out validation.

    Every row position of ``data`` is held out once via ``split_data``; the
    classifier is fitted on the remaining rows and asked to predict the
    held-out one. The last column is the label, the others are features.

    Args:
        data: DataFrame with the features in every column but the last and
            the label in the last column.
        k: Number of neighbours passed to ``knn_classifier``.

    Returns:
        tuple: ``(y_tests, y_preds)`` lists with the true and predicted label
        of each held-out row, in row order.
    """
    true_labels, predicted_labels = [], []

    n_rows = len(data)
    for held_out in range(n_rows):
        # One row for testing, everything else for training
        remaining_rows, single_row = split_data(data, held_out)

        train_features = remaining_rows.iloc[:, :-1].values
        train_labels = remaining_rows.iloc[:, -1].values
        test_features = single_row.iloc[:, :-1].values
        test_labels = single_row.iloc[:, -1].values

        # KNN prediction for the held-out row
        prediction = knn_classifier(train_features, train_labels, test_features, k=k)

        true_labels.extend(test_labels)
        predicted_labels.extend(prediction)

    return true_labels, predicted_labels

In [6]:
# Función para evaluar el desempeño
def evaluate_performance(y_true, y_pred):
    """Print the accuracy and the confusion matrix of a set of predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Compute both metrics before printing anything
    acc, cm = accuracy_score(y_true, y_pred), confusion_matrix(y_true, y_pred)

    print("Accuracy:", acc)
    print("Matriz de Confusión:\n", cm)

In [7]:
# Validation on raisin.csv
data_raisin = pd.read_csv('raisin.csv')
k_raisin = 3

# Seed the global `random` generator that hold_out_validation shuffles with,
# so the hold-out split (and its accuracy) is the same on every fresh run.
random.seed(42)

print("--- Validación en raisin ---")

# NOTE(review): the labels printed below say "Estratificado", but
# hold_out_validation and k_fold_validation as defined in this notebook do not
# stratify by class. Confirm whether the label or the split should change.

# Hold-Out 70/30
y_test, y_pred_knn = hold_out_validation(data_raisin, k_raisin)
print("\nHold-Out (70/30 Estratificado): ")
evaluate_performance(y_test, y_pred_knn)

# 10-Fold Cross-Validation
y_test, y_pred_knn = k_fold_validation(data_raisin, k_raisin)
print("\n10-Fold Cross-Validation Estratificado:")
evaluate_performance(y_test, y_pred_knn)

# Leave-One-Out
y_test, y_pred_knn = leave_one_out_validation(data_raisin, k_raisin)
print("\nLeave-One-Out:")
evaluate_performance(y_test, y_pred_knn)

--- Validación en raisin ---

Hold-Out (70/30 Estratificado): 
Accuracy: 0.8333333333333334
Matriz de Confusión:
 [[104  20]
 [ 25 121]]


  return bound(*args, **kwds)



10-Fold Cross-Validation Estratificado:
Accuracy: 0.7966666666666666
Matriz de Confusión:
 [[351  99]
 [ 84 366]]

Leave-One-Out:
Accuracy: 0.8177777777777778
Matriz de Confusión:
 [[358  92]
 [ 72 378]]


In [8]:
# Validation on glass.csv
# The frame gets its own name so it no longer overwrites the raisin data.
data_glass = pd.read_csv('glass.csv')
k_glass = 5

# Seed the global `random` generator that hold_out_validation shuffles with,
# so the hold-out split (and its accuracy) is the same on every fresh run.
random.seed(42)

print("--- Validación en glass ---")

# NOTE(review): the labels printed below say "Estratificado", but
# hold_out_validation and k_fold_validation as defined in this notebook do not
# stratify by class. Confirm whether the label or the split should change.

# Hold-Out 70/30
y_test, y_pred_knn = hold_out_validation(data_glass, k_glass)
print("\nHold-Out (70/30 Estratificado): ")
evaluate_performance(y_test, y_pred_knn)

# 10-Fold Cross-Validation
y_test, y_pred_knn = k_fold_validation(data_glass, k_glass)
print("\n10-Fold Cross-Validation Estratificado:")
evaluate_performance(y_test, y_pred_knn)

# Leave-One-Out
y_test, y_pred_knn = leave_one_out_validation(data_glass, k_glass)
print("\nLeave-One-Out:")
evaluate_performance(y_test, y_pred_knn)

--- Validación en glass ---

Hold-Out (70/30 Estratificado): 
Accuracy: 0.9682539682539683
Matriz de Confusión:
 [[21  0  0  0  0  0]
 [ 0 18  0  0  0  0]
 [ 0  0  4  0  0  0]
 [ 0  0  1  3  0  0]
 [ 0  0  0  1  3  0]
 [ 0  0  0  0  0 12]]


  return bound(*args, **kwds)



10-Fold Cross-Validation Estratificado:
Accuracy: 0.892018779342723
Matriz de Confusión:
 [[69  0  0  0  0  0]
 [ 7 63  6  0  0  0]
 [ 0  0 16  1  0  0]
 [ 0  0  0 13  0  0]
 [ 0  0  0  6  0  3]
 [ 0  0  0  0  0 29]]

Leave-One-Out:
Accuracy: 0.9765258215962441
Matriz de Confusión:
 [[69  0  0  0  0  0]
 [ 0 75  1  0  0  0]
 [ 0  1 16  0  0  0]
 [ 0  0  1 10  2  0]
 [ 0  0  0  0  9  0]
 [ 0  0  0  0  0 29]]


In [9]:
# Validation on iris.csv
# The frame gets its own name so it no longer overwrites the raisin data.
data_iris = pd.read_csv('iris.csv')
k_iris = 3

# Seed the global `random` generator that hold_out_validation shuffles with,
# so the hold-out split (and its accuracy) is the same on every fresh run.
random.seed(42)

# The header previously said "raisin" (copy-paste slip); this cell evaluates iris.
print("--- Validación en iris ---")

# NOTE(review): the labels printed below say "Estratificado", but
# hold_out_validation and k_fold_validation as defined in this notebook do not
# stratify by class. Confirm whether the label or the split should change.

# Hold-Out 70/30
y_test, y_pred_knn = hold_out_validation(data_iris, k_iris)
print("\nHold-Out (70/30 Estratificado): ")
evaluate_performance(y_test, y_pred_knn)

# 10-Fold Cross-Validation
y_test, y_pred_knn = k_fold_validation(data_iris, k_iris)
print("\n10-Fold Cross-Validation Estratificado:")
evaluate_performance(y_test, y_pred_knn)

# Leave-One-Out
y_test, y_pred_knn = leave_one_out_validation(data_iris, k_iris)
print("\nLeave-One-Out:")
evaluate_performance(y_test, y_pred_knn)

--- Validación en raisin ---

Hold-Out (70/30 Estratificado): 
Accuracy: 0.9318181818181818
Matriz de Confusión:
 [[13  0  0]
 [ 0 16  2]
 [ 0  1 12]]


  return bound(*args, **kwds)



10-Fold Cross-Validation Estratificado:
Accuracy: 0.9463087248322147
Matriz de Confusión:
 [[49  0  0]
 [ 0 46  4]
 [ 0  4 46]]

Leave-One-Out:
Accuracy: 0.959731543624161
Matriz de Confusión:
 [[49  0  0]
 [ 0 47  3]
 [ 0  3 47]]
