In [1]:
import sys

# Make the sibling ../src directory importable (presumably where the
# preprocess, knn, validation and check modules used by later cells live —
# confirm). The membership check keeps this cell idempotent: re-running it
# does not stack duplicate entries onto sys.path.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
import pandas as pd
from preprocess import min_max_scaler

# Load the Glass dataset. The file is read with header=None, so the column
# names are supplied explicitly here.
glass_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data"
glass_columns = [
    "Id",
    "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe",
    "Type",
]
glass_df = pd.read_csv(glass_url, header=None, names=glass_columns)
glass_target = glass_df["Type"]
glass_features = glass_df.drop(columns=["Id", "Type"])

# Normalize the Glass features: min_max_scaler receives the rows as a plain
# list of lists; the result is wrapped back into a frame with the original
# feature names and the untouched "Type" label appended as the last column.
glass_feature_rows = glass_features.values.tolist()
glass_features_normalized = min_max_scaler(glass_feature_rows)
glass_data_normalized = pd.DataFrame(
    glass_features_normalized,
    columns=glass_features.columns,
).assign(Type=glass_target)

# Load the Iris dataset. The file is read with header=None, so the column
# names are supplied explicitly here.
iris_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris_columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Class"]
iris_df = pd.read_csv(iris_url, names=iris_columns, header=None)
iris_features = iris_df.drop(columns=["Class"])

# Encode the species names as integer class ids.
iris_class_codes = {
    "Iris-setosa": 0,
    "Iris-versicolor": 1,
    "Iris-virginica": 2
}
iris_target = iris_df["Class"].map(iris_class_codes)

# Series.map leaves NaN for every label that is missing from the dict.
# Fail loudly here instead of letting NaN targets reach the classifier.
iris_unmapped_labels = iris_df.loc[iris_target.isna(), "Class"].unique().tolist()
if iris_unmapped_labels:
    raise ValueError(f"Unexpected Iris class labels: {iris_unmapped_labels}")

# Normalize the Iris features and rebuild a frame with the encoded label
# appended as the last column.
iris_features_normalized = min_max_scaler(iris_features.values.tolist())
iris_data_normalized = pd.DataFrame(iris_features_normalized, columns=iris_features.columns)
iris_data_normalized["Class"] = iris_target

# Load the Wine dataset. The class label is the first column of the file,
# followed by the thirteen feature columns named below.
wine_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine_columns = [
    "Class",
    "Alcohol", "MalicAcid", "Ash", "Alcalinity", "Magnesium", "Phenols",
    "Flavanoids", "NonFlavanoids", "Proanthocyanins", "Color", "Hue",
    "Dilution", "Proline",
]
wine_df = pd.read_csv(wine_url, header=None, names=wine_columns)
wine_target = wine_df["Class"]
wine_features = wine_df.drop(columns=["Class"])

# Normalize the Wine features: hand the rows to min_max_scaler as a list of
# lists, then rebuild a frame with the label appended as the last column.
wine_feature_rows = wine_features.values.tolist()
wine_features_normalized = min_max_scaler(wine_feature_rows)
wine_data_normalized = pd.DataFrame(
    wine_features_normalized,
    columns=wine_features.columns,
)
wine_data_normalized["Class"] = wine_target

# Sanity-check the normalized datasets by printing the first rows of each.
# Every heading after the first starts with a newline so the previews are
# separated by a blank line.
for preview_heading, preview_frame in (
    ("Glass Dataset Preprocesado:", glass_data_normalized),
    ("\nIris Dataset Preprocesado:", iris_data_normalized),
    ("\nWine Dataset Preprocesado:", wine_data_normalized),
):
    print(preview_heading)
    print(preview_frame.head())

Glass Dataset Preprocesado:
         RI        Na        Mg        Al        Si         K        Ca   Ba  \
0  0.432836  0.437594  1.000000  0.252336  0.351786  0.009662  0.308550  0.0   
1  0.283582  0.475188  0.801782  0.333333  0.521429  0.077295  0.223048  0.0   
2  0.220808  0.421053  0.790646  0.389408  0.567857  0.062802  0.218401  0.0   
3  0.285777  0.372932  0.821826  0.311526  0.500000  0.091787  0.259294  0.0   
4  0.275241  0.381955  0.806236  0.295950  0.583929  0.088567  0.245353  0.0   

    Fe  Type  
0  0.0     1  
1  0.0     1  
2  0.0     1  
3  0.0     1  
4  0.0     1  

Iris Dataset Preprocesado:
   SepalLength  SepalWidth  PetalLength  PetalWidth  Class
0     0.222222    0.625000     0.067797    0.041667      0
1     0.166667    0.416667     0.067797    0.041667      0
2     0.111111    0.500000     0.050847    0.041667      0
3     0.083333    0.458333     0.084746    0.041667      0
4     0.194444    0.666667     0.067797    0.041667      0

Wine Dataset Prepr

In [6]:
from knn import KNNClassifier
from validation import hold_out, k_fold_cross_validation, leave_one_out
from check import accuracy_score, confusion_matrix_binary
from collections import Counter
# Initial configuration
k_values = [1, 3, 5, 7]
validation_methods = ["hold_out", "k_fold", "leave_one_out"]
results = {}

HOLD_OUT_TEST_SIZE = 0.3  # fraction of samples held out for testing
N_FOLDS = 10              # number of folds for k-fold cross-validation

# Feature rows (as returned by min_max_scaler) paired with their labels.
datasets = {
    "Glass": (glass_features_normalized, glass_target),
    "Iris": (iris_features_normalized, iris_target),
    "Wine": (wine_features_normalized, wine_target),
}


def build_splits(method, X, y):
    """Return the train/validation partitions for one validation method.

    Args:
        method: One of "hold_out", "k_fold" or "leave_one_out".
        X: Feature rows of the dataset.
        y: Labels aligned with ``X``.

    Returns:
        A list of ``(X_train, X_val, y_train, y_val)`` tuples. Hold-out
        yields a single tuple; the other methods yield one tuple per fold.

    Raises:
        ValueError: If ``method`` is not a known validation method (the
            original code fell through and later divided by zero).
    """
    if method == "hold_out":
        return [hold_out(X, y, test_size=HOLD_OUT_TEST_SIZE, stratify=True)]
    if method == "k_fold":
        # list() materializes the folds so they can be reused for every k.
        return list(k_fold_cross_validation(X, y, k=N_FOLDS, stratify=True))
    if method == "leave_one_out":
        return list(leave_one_out(X, y))
    raise ValueError(f"Unknown validation method: {method!r}")


# Evaluate KNN on every dataset with every validation method.
# NOTE(review): hold_out / k_fold_cross_validation are called without a
# visible seed, so results presumably change between runs — confirm whether
# they accept a seed and set one for reproducibility.
# NOTE(review): confusion_matrix_binary is applied to multi-class targets
# here; verify that it handles more than two classes.
for dataset_name, (X, y) in datasets.items():
    print(f"Dataset: {dataset_name}")
    results[dataset_name] = {}

    for method in validation_methods:
        print(f"  Método de Validación: {method}")
        results[dataset_name][method] = []

        # Build the partitions once per (dataset, method) so that every k is
        # scored on exactly the same splits; this keeps the comparison across
        # k values fair and avoids rebuilding the leave-one-out folds per k.
        splits = build_splits(method, X, y)

        for k in k_values:
            knn = KNNClassifier(k=k)
            accuracies = []
            confusion_matrices = []

            for X_train, X_val, y_train, y_val in splits:
                knn.fit(X_train, y_train)
                y_pred = knn.predict(X_val)
                accuracies.append(accuracy_score(y_val, y_pred))
                confusion_matrices.append(confusion_matrix_binary(y_val, y_pred))

            avg_accuracy = sum(accuracies) / len(accuracies)
            results[dataset_name][method].append({
                "k": k,
                "avg_accuracy": avg_accuracy,
                "confusion_matrices": confusion_matrices
            })

            print(f"    k={k}: Avg Accuracy={avg_accuracy}")

# Print the final results: for each dataset and validation method, one pair
# of lines per evaluated k (average accuracy, then the collected confusion
# matrices for that k).
for dataset_name, per_method in results.items():
    print(f"Resultados para el dataset {dataset_name}:")
    for method_name in per_method:
        print(f"  Método: {method_name}")
        for entry in per_method[method_name]:
            k_used = entry["k"]
            mean_accuracy = entry["avg_accuracy"]
            print(f"    k={k_used}, Avg Accuracy={mean_accuracy}")
            print(f"    Confusion Matrices: {entry['confusion_matrices']}")


Dataset: Glass
  Método de Validación: hold_out
    k=1: Avg Accuracy=0.6721311475409836
    k=3: Avg Accuracy=0.7213114754098361
    k=5: Avg Accuracy=0.7049180327868853
    k=7: Avg Accuracy=0.6065573770491803
  Método de Validación: k_fold
    k=1: Avg Accuracy=0.7055555555555555
    k=3: Avg Accuracy=0.7222222222222222
    k=5: Avg Accuracy=0.6444444444444445
    k=7: Avg Accuracy=0.6833333333333333
  Método de Validación: leave_one_out
    k=1: Avg Accuracy=0.7009345794392523
    k=3: Avg Accuracy=0.7009345794392523
    k=5: Avg Accuracy=0.6682242990654206
    k=7: Avg Accuracy=0.6308411214953271
Dataset: Iris
  Método de Validación: hold_out
    k=1: Avg Accuracy=0.9777777777777777
    k=3: Avg Accuracy=0.9555555555555556
    k=5: Avg Accuracy=0.9555555555555556
    k=7: Avg Accuracy=0.9777777777777777
  Método de Validación: k_fold
    k=1: Avg Accuracy=0.9533333333333334
    k=3: Avg Accuracy=0.9466666666666667
    k=5: Avg Accuracy=0.96
    k=7: Avg Accuracy=0.96
  Método de V