In [125]:
# Importar las librerias
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score, confusion_matrix
from collections import Counter

In [127]:
# Función para calcular la distancia Euclidiana
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# Implementación manual de KNN para encontrar los k vecinos más cercanos
def find_k_nearest_neighbors(X_train, x_test, k):
    distances = []
    for i, x_train in enumerate(X_train):
        distance = euclidean_distance(x_train, x_test)
        distances.append((distance, i))  # Guardamos la distancia y el índice
    distances.sort(key=lambda x: x[0])  # Ordenar por distancia
    return [idx for _, idx in distances[:k]]  # Retornar los índices de los k vecinos más cercanos

In [129]:
# Función de SMOTE
def smote(X, y, k_sm=5):
    counter = Counter(y)
    max_count = max(counter.values())
    minority_class = min(counter, key=counter.get)
    majority_class = max(counter, key=counter.get)
    
    X_minority = X[y == minority_class]
    num_samples_to_generate = max_count - counter[minority_class]
    
    synthetic_samples = []
    
    # Generar muestras sintéticas
    for _ in range(num_samples_to_generate):
        # Seleccionar una muestra aleatoria de la clase minoritaria
        idx = np.random.randint(0, X_minority.shape[0])
        point = X_minority[idx]
        
        # Encontrar los k vecinos más cercanos
        neighbors_idx = find_k_nearest_neighbors(X_minority, point, k)
        
        # Seleccionar un vecino aleatorio
        neighbor_idx = np.random.choice(neighbors_idx)
        neighbor = X_minority[neighbor_idx]
        
        # Interpolación aleatoria entre la muestra y su vecino
        alpha = np.random.rand()
        synthetic_sample = point + alpha * (neighbor - point)
        synthetic_samples.append(synthetic_sample)
    
    # Convertir las muestras sintéticas en un array numpy
    X_synthetic = np.array(synthetic_samples)
    y_synthetic = np.full(X_synthetic.shape[0], minority_class)
    
    # Combinar los datos originales con los sintéticos
    X_sm = np.vstack([X, X_synthetic])
    y_sm = np.hstack([y, y_synthetic])
    
    return X_sm, y_sm

In [131]:
def one_nn_classifier(X_train, y_train, X_test):
    def euclidean_distance(x1, x2):
        return sum((xi - yi) ** 2 for xi, yi in zip(x1, x2)) ** 0.5

    y_pred = [] # Lista para almacenar las predicciones

    # Predicción para cada muestra en el conjunto de prueba
    for x_test in X_test:
        # Inicializar la menor distancia como infinita y sin etiqueta
        min_distance = float('inf')
        closest_label = None

        # Buscar el vecino más cercano
        for i, x_train in enumerate(X_train):
            distance = euclidean_distance(x_test, x_train)
            if distance < min_distance:
                min_distance = distance
                closest_label = y_train[i]

        y_pred.append(closest_label) # Guardar la predicción
    return y_pred

In [133]:
# Definir el clasificador KNN
def knn_classifier(X_train, y_train, X_test, k):
    def euclidean_distance(x1, x2):
        return sum((xi - yi) ** 2 for xi, yi in zip(x1, x2)) ** 0.5

    y_pred = []  # Lista para almacenar las predicciones

    for x_test in X_test:
        # Calcular la distancia de x_test a todos los puntos en X_train
        distances = [(euclidean_distance(x_test, x_train), label) for x_train, label in zip(X_train, y_train)]
        # Ordenar por distancia y seleccionar los k vecinos más cercanos
        k_nearest_neighbors = sorted(distances, key=lambda x: x[0])[:k]
        # Obtener las etiquetas de los k vecinos y elegir la más común
        k_nearest_labels = [label for _, label in k_nearest_neighbors]
        most_common_label = Counter(k_nearest_labels).most_common(1)[0][0]
        y_pred.append(most_common_label)
        
    return y_pred

In [135]:
def hold_out_validation(data,k,r=0.3):
    # Separar características y etiquetas
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values

    # Verificar y limpiar etiquetas
    valid_indices = ~pd.isnull(y)
    X = X[valid_indices]
    y = y[valid_indices]
    y = y.astype(int)

    # Determinar el tamaño de prueba y mezclar índices
    n = len(X)
    test_size = int(n * r)
    indices = list(range(n))
    random.shuffle(indices)

    # Crear subconjuntos de entrenamiento y prueba
    train_indices = np.array(indices[:n - test_size])
    test_indices = np.array(indices[n - test_size:])
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    counter = Counter(y_train)
    print('Before', counter)
     # Predicción con el clasificador de knn y 1nn
    y_pred_knn= knn_classifier(X_train, y_train, X_test, k=k)
    y_pred_1nn = one_nn_classifier(X_train, y_train, X_test)

    # Oversampling the train dataset using SMOTE
    X_train_sm, y_train_sm = smote(X_train, y_train,k_sm=5)

    counter = Counter(y_train_sm)
    print('After SMOTE:', counter)

    # Predicción con el clasificador de knn y 1nn
    y_pred_knn_sm= knn_classifier(X_train_sm, y_train_sm, X_test, k=k)
    y_pred_1nn_sm= one_nn_classifier(X_train_sm, y_train_sm, X_test)

    #if y_train == y_train_sm:
    #   print(" Hold out SON IGUALES")

    return y_test, y_pred_knn, y_pred_1nn, y_pred_knn_sm, y_pred_1nn_sm

In [137]:
def k_fold_validation(data,k,num_folds=10):
    # Separar características y etiquetas
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values

    # Dividir los datos en K partes (folds)
    folds = np.array_split(data, num_folds)

    # Listas para acumular los resultados 
    y_tests = []
    y_preds_1nn = []
    y_preds_knn = []
    y_preds_1nn_sm = []
    y_preds_knn_sm = []
    
    
    
    for i in range(num_folds):
        # Crear conjuntos de prueba y entrenamiento
        test_data = folds[i]
        train_data = pd.concat([folds[j] for j in range(num_folds) if j != i])

        X_train = train_data.iloc[:, :-1].values
        y_train = train_data.iloc[:, -1].values
        X_test = test_data.iloc[:, :-1].values
        y_test = test_data.iloc[:, -1].values

        counter = Counter(y_train)
        print('Before', counter)
        
        # Predicciones usando el clasificador de knn y 1nn
        y_pred_knn = knn_classifier(X_train, y_train, X_test,k=k)
        y_pred_1nn = one_nn_classifier(X_train, y_train, X_test)

        
        # oversampling the train dataset using SMOTE
        X_train_sm, y_train_sm = smote(X_train, y_train,k_sm=5)
        
        counter = Counter(y_train_sm)
        print('After', counter)
        
        # Predicciones usando el clasificador de knn y 1nn
        y_pred_knn_sm = knn_classifier(X_train_sm, y_train_sm, X_test,k=k)
        y_pred_1nn_sm = one_nn_classifier(X_train_sm, y_train_sm, X_test)
        
        # Acumular los resultados de prueba y predicción
        y_tests.extend(y_test)
        y_preds_knn.extend(y_pred_knn)
        y_preds_1nn.extend(y_pred_1nn)
        y_preds_knn_sm.extend(y_pred_knn_sm)
        y_preds_1nn_sm.extend(y_pred_1nn_sm)
        

    
    
    return y_tests, y_preds_knn, y_preds_1nn, y_preds_knn_sm, y_preds_1nn_sm

In [139]:
def evaluate_performance(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    print("Accuracy:", accuracy)
    print("Matriz de Confusión:\n", conf_matrix)

In [141]:
# Cargar y validar cada dataset
datasets = ['glass.csv']
k = 1

for dataset in datasets:
    data = pd.read_csv(dataset)

    print(f"*******************************************************************************************")
    print(f"\n--- Validación en {dataset} ---")
    
    # Hold-Out 70/30 Estratificado
    y_test, y_preds_knn, y_preds_1nn, y_preds_knn_sm, y_preds_1nn_sm = hold_out_validation(data,k)
    print("\n---- KNN ----\nHold-Out (70/30 Estratificado):  (SIN SMOTE)")
    evaluate_performance(y_test, y_preds_knn)
    print("\n---- 1NN ----\nHold-Out (70/30 Estratificado): (SIN SMOTE)")
    evaluate_performance(y_test, y_preds_1nn)
    print("\n---- KNN ----\nHold-Out (70/30 Estratificado):  (SMOTE)")
    evaluate_performance(y_test, y_preds_knn_sm)
    print("\n---- 1NN ----\nHold-Out (70/30 Estratificado): (SMOTE)")
    evaluate_performance(y_test, y_preds_1nn_sm)
    # Comparar que no sean las mismas listas 
    if y_preds_knn == y_preds_knn_sm: 
        print("Son iguales")
    
    # 10-Fold Cross-Validation Estratificado
    y_test, y_preds_knn, y_preds_1nn, y_preds_knn_sm, y_preds_1nn_sm = k_fold_validation(data,k)
    print("\n---- KNN ----\n10-Fold Cross-Validation Estratificado: (SIN SMOTE)")
    evaluate_performance(y_test, y_preds_knn)
    print("\n---- 1NN ----\n10-Fold Cross-Validation Estratificado: (SIN SMOTE)")
    evaluate_performance(y_test, y_preds_1nn)
    print("\n---- KNN ----\n10-Fold Cross-Validation Estratificado: (SMOTE)")
    evaluate_performance(y_test, y_preds_knn_sm)
    print("\n---- 1NN ----\n10-Fold Cross-Validation Estratificado: (SMOTE)")
    evaluate_performance(y_test, y_preds_1nn_sm)

*******************************************************************************************

--- Validación en glass.csv ---
Before Counter({2: 55, 1: 49, 7: 19, 5: 11, 3: 10, 6: 6})
After SMOTE: Counter({2: 55, 6: 55, 1: 49, 7: 19, 5: 11, 3: 10})


ValueError: operands could not be broadcast together with shapes (150,) (199,) 