In [48]:
import pandas as pd                  # A fundamental package for linear algebra and multidimensional arrays
import numpy as np                   # Data analysis and data manipulating tool
import random                        # Library to generate random numbers
from collections import Counter      # Collection is a Python module that implements specialized container datatypes providing
                                     # alternatives to Python’s general purpose built-in containers, dict, list, set, and tuple.
                                     # Counter is a dict subclass for counting hashable objects
# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# To ignore warnings in the notebook
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import LeaveOneOut, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix

In [49]:
# This is a subset of the original data available at kaggle.
data = pd.read_csv("C:\\Users\\Polar\\Documents\\ESCUELA\\5TO_SEMESTRE\\MAKINITAS\\practica6\\iris\\iris.csv")

data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [52]:
# Codificar la columna 'class' como valores numéricos
label_encoder = LabelEncoder()
data['class'] = label_encoder.fit_transform(data['class'])

# Separar características y etiquetas
X = data.drop(columns=['class']).values
y = data['class'].values

# Función para calcular la distancia Euclidiana
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

# Función para clasificar un nuevo punto usando el algoritmo de distancia mínima
def classify_min_distance(new_point, centroids):
    min_distance = float('inf')
    closest_class = None
    
    for species, centroid in centroids.iterrows():
        distance = euclidean_distance(new_point, centroid)
        if distance < min_distance:
            min_distance = distance
            closest_class = species
    return closest_class

# --- Leave-One-Out Cross-Validation ---
loo = LeaveOneOut()
correct_predictions_loo = 0
y_pred_loo = []

for train_index, test_index in loo.split(X):
    # Dividir datos en entrenamiento y prueba
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['class'] = y_train
    centroids = train_data.groupby('class').mean()
    
    # Clasificar el punto de prueba
    predicted_class = classify_min_distance(X_test[0], centroids)
    y_pred_loo.append(predicted_class)
    
    # Comparar con la clase real
    if predicted_class == y_test[0]:
        correct_predictions_loo += 1

# Calcular y mostrar la precisión de Leave-One-Out
accuracy_loo = correct_predictions_loo / len(X)
print(f"Precisión de Leave-One-Out Cross-Validation: {accuracy_loo:.2f}")

# Imprimir la matriz de confusión para Leave-One-Out
print("Matriz de Confusión Leave-One-Out:")
print(confusion_matrix(y, y_pred_loo))

# --- Hold-Out Validation (70-30 split) ---
# Dividir en entrenamiento y prueba con un 70-30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calcular centroides con el conjunto de entrenamiento
train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
train_data['class'] = y_train
centroids = train_data.groupby('class').mean()

# Validar en el conjunto de prueba
correct_predictions_holdout = 0
y_pred_holdout = []

for i, test_point in enumerate(X_test):
    predicted_class = classify_min_distance(test_point, centroids)
    y_pred_holdout.append(predicted_class)
    if predicted_class == y_test[i]:
        correct_predictions_holdout += 1

# Calcular y mostrar la precisión de Hold-Out
accuracy_holdout = correct_predictions_holdout / len(X_test)
print(f"Precisión de Hold-Out Validation (70-30): {accuracy_holdout:.2f}")

# Imprimir la matriz de confusión para Hold-Out
print("Matriz de Confusión Hold-Out:")
print(confusion_matrix(y_test, y_pred_holdout))

# --- 10-Fold Cross-Validation Estratificado ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
correct_predictions_kfold = 0
total_test_samples = 0
y_pred_kfold = []

for train_index, test_index in skf.split(X, y):
    # Dividir datos en entrenamiento y prueba para el fold actual
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento del fold actual
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['class'] = y_train
    centroids = train_data.groupby('class').mean()
    
    # Validar en el conjunto de prueba
    for i, test_point in enumerate(X_test):
        predicted_class = classify_min_distance(test_point, centroids)
        y_pred_kfold.append(predicted_class)
        if predicted_class == y_test[i]:
            correct_predictions_kfold += 1
    
    # Actualizar el conteo de muestras de prueba
    total_test_samples += len(X_test)

# Calcular y mostrar la precisión de 10-Fold Cross-Validation
accuracy_kfold = correct_predictions_kfold / total_test_samples
print(f"Precisión de 10-Fold Cross-Validation Estratificado: {accuracy_kfold:.2f}")

# Imprimir la matriz de confusión para 10-Fold Cross-Validation
print("Matriz de Confusión 10-Fold Cross-Validation:")
print(confusion_matrix(y, y_pred_kfold))

Precisión de Leave-One-Out Cross-Validation: 0.92
Matriz de Confusión Leave-One-Out:
[[50  0  0]
 [ 0 45  5]
 [ 0  7 43]]
Precisión de Hold-Out Validation (70-30): 0.96
Matriz de Confusión Hold-Out:
[[19  0  0]
 [ 0 11  2]
 [ 0  0 13]]
Precisión de 10-Fold Cross-Validation Estratificado: 0.92
Matriz de Confusión 10-Fold Cross-Validation:
[[20 16 14]
 [15 20 15]
 [15 16 19]]


In [53]:
# This is a subset of the original data available at kaggle.
data = pd.read_csv("C:\\Users\\Polar\\Documents\\ESCUELA\\5TO_SEMESTRE\\MAKINITAS\\practica6\\winequality\\winequality-red.csv", delimiter=";")

data.head()
print(data.columns)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [54]:
# Codificar la columna 'class' como valores numéricos
label_encoder = LabelEncoder()
data['quality'] = label_encoder.fit_transform(data['quality'])

# Separar características y etiquetas
X = data.drop(columns=['quality']).values
y = data['quality'].values

# Función para calcular la distancia Euclidiana
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

# Función para clasificar un nuevo punto usando el algoritmo de distancia mínima
def classify_min_distance(new_point, centroids):
    min_distance = float('inf')
    closest_class = None
    
    for species, centroid in centroids.iterrows():
        distance = euclidean_distance(new_point, centroid)
        if distance < min_distance:
            min_distance = distance
            closest_class = species
    return closest_class

# --- Leave-One-Out Cross-Validation ---
loo = LeaveOneOut()
correct_predictions_loo = 0
y_pred_loo = []

for train_index, test_index in loo.split(X):
    # Dividir datos en entrenamiento y prueba
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['quality'] = y_train
    centroids = train_data.groupby('quality').mean()
    
    # Clasificar el punto de prueba
    predicted_class = classify_min_distance(X_test[0], centroids)
    y_pred_loo.append(predicted_class)
    
    # Comparar con la clase real
    if predicted_class == y_test[0]:
        correct_predictions_loo += 1

# Calcular y mostrar la precisión de Leave-One-Out
accuracy_loo = correct_predictions_loo / len(X)
print(f"Precisión de Leave-One-Out Cross-Validation: {accuracy_loo:.2f}")

# Imprimir la matriz de confusión para Leave-One-Out
print("Matriz de Confusión Leave-One-Out:")
print(confusion_matrix(y, y_pred_loo))

# --- Hold-Out Validation (70-30 split) ---
# Dividir en entrenamiento y prueba con un 70-30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calcular centroides con el conjunto de entrenamiento
train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
train_data['quality'] = y_train
centroids = train_data.groupby('quality').mean()

# Validar en el conjunto de prueba
correct_predictions_holdout = 0
y_pred_holdout = []

for i, test_point in enumerate(X_test):
    predicted_class = classify_min_distance(test_point, centroids)
    y_pred_holdout.append(predicted_class)
    if predicted_class == y_test[i]:
        correct_predictions_holdout += 1

# Calcular y mostrar la precisión de Hold-Out
accuracy_holdout = correct_predictions_holdout / len(X_test)
print(f"Precisión de Hold-Out Validation (70-30): {accuracy_holdout:.2f}")

# Imprimir la matriz de confusión para Hold-Out
print("Matriz de Confusión Hold-Out:")
print(confusion_matrix(y_test, y_pred_holdout))

# --- 10-Fold Cross-Validation Estratificado ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
correct_predictions_kfold = 0
total_test_samples = 0
y_pred_kfold = []

for train_index, test_index in skf.split(X, y):
    # Dividir datos en entrenamiento y prueba para el fold actual
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento del fold actual
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['quality'] = y_train
    centroids = train_data.groupby('quality').mean()
    
    # Validar en el conjunto de prueba
    for i, test_point in enumerate(X_test):
        predicted_class = classify_min_distance(test_point, centroids)
        y_pred_kfold.append(predicted_class)
        if predicted_class == y_test[i]:
            correct_predictions_kfold += 1
    
    # Actualizar el conteo de muestras de prueba
    total_test_samples += len(X_test)

# Calcular y mostrar la precisión de 10-Fold Cross-Validation
accuracy_kfold = correct_predictions_kfold / total_test_samples
print(f"Precisión de 10-Fold Cross-Validation Estratificado: {accuracy_kfold:.2f}")

# Imprimir la matriz de confusión para 10-Fold Cross-Validation
print("Matriz de Confusión 10-Fold Cross-Validation:")
print(confusion_matrix(y, y_pred_kfold))

Precisión de Leave-One-Out Cross-Validation: 0.27
Matriz de Confusión Leave-One-Out:
[[  6   0   2   1   0   1]
 [ 28   0  15   5   2   3]
 [208  43 330  67  12  21]
 [254  32 190  97  24  41]
 [114   6  38  28   4   9]
 [ 11   0   4   1   1   1]]
Precisión de Hold-Out Validation (70-30): 0.24
Matriz de Confusión Hold-Out:
[[ 1  0  0  0  0  0]
 [10  1  2  1  0  3]
 [64 12 96  9  7  7]
 [75 19 58 17 16 15]
 [38  1 12  8  0  2]
 [ 3  0  2  0  0  1]]
Precisión de 10-Fold Cross-Validation Estratificado: 0.28
Matriz de Confusión 10-Fold Cross-Validation:
[[  6   0   3   0   1   0]
 [ 23   2  14   8   2   4]
 [252  32 242  86  34  35]
 [241  39 227  79  26  26]
 [ 67   7  88  22   4  11]
 [  6   1   8   2   0   1]]


In [55]:
# This is a subset of the original data available at kaggle.
data = pd.read_csv("C:\\Users\\Polar\\Documents\\ESCUELA\\5TO_SEMESTRE\\MAKINITAS\\practica6\\winequality\\winequality-white.csv", delimiter=";")

data.head()
print(data.columns)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [56]:
# Codificar la columna 'class' como valores numéricos
label_encoder = LabelEncoder()
data['quality'] = label_encoder.fit_transform(data['quality'])

# Separar características y etiquetas
X = data.drop(columns=['quality']).values
y = data['quality'].values

# Función para calcular la distancia Euclidiana
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

# Función para clasificar un nuevo punto usando el algoritmo de distancia mínima
def classify_min_distance(new_point, centroids):
    min_distance = float('inf')
    closest_class = None
    
    for species, centroid in centroids.iterrows():
        distance = euclidean_distance(new_point, centroid)
        if distance < min_distance:
            min_distance = distance
            closest_class = species
    return closest_class

# --- Leave-One-Out Cross-Validation ---
loo = LeaveOneOut()
correct_predictions_loo = 0
y_pred_loo = []

for train_index, test_index in loo.split(X):
    # Dividir datos en entrenamiento y prueba
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['quality'] = y_train
    centroids = train_data.groupby('quality').mean()
    
    # Clasificar el punto de prueba
    predicted_class = classify_min_distance(X_test[0], centroids)
    y_pred_loo.append(predicted_class)
    
    # Comparar con la clase real
    if predicted_class == y_test[0]:
        correct_predictions_loo += 1

# Calcular y mostrar la precisión de Leave-One-Out
accuracy_loo = correct_predictions_loo / len(X)
print(f"Precisión de Leave-One-Out Cross-Validation: {accuracy_loo:.2f}")

# Imprimir la matriz de confusión para Leave-One-Out
print("Matriz de Confusión Leave-One-Out:")
print(confusion_matrix(y, y_pred_loo))

# --- Hold-Out Validation (70-30 split) ---
# Dividir en entrenamiento y prueba con un 70-30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calcular centroides con el conjunto de entrenamiento
train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
train_data['quality'] = y_train
centroids = train_data.groupby('quality').mean()

# Validar en el conjunto de prueba
correct_predictions_holdout = 0
y_pred_holdout = []

for i, test_point in enumerate(X_test):
    predicted_class = classify_min_distance(test_point, centroids)
    y_pred_holdout.append(predicted_class)
    if predicted_class == y_test[i]:
        correct_predictions_holdout += 1

# Calcular y mostrar la precisión de Hold-Out
accuracy_holdout = correct_predictions_holdout / len(X_test)
print(f"Precisión de Hold-Out Validation (70-30): {accuracy_holdout:.2f}")

# Imprimir la matriz de confusión para Hold-Out
print("Matriz de Confusión Hold-Out:")
print(confusion_matrix(y_test, y_pred_holdout))

# --- 10-Fold Cross-Validation Estratificado ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
correct_predictions_kfold = 0
total_test_samples = 0
y_pred_kfold = []

for train_index, test_index in skf.split(X, y):
    # Dividir datos en entrenamiento y prueba para el fold actual
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Calcular centroides con los datos de entrenamiento del fold actual
    train_data = pd.DataFrame(X_train, columns=data.columns[:-1])
    train_data['quality'] = y_train
    centroids = train_data.groupby('quality').mean()
    
    # Validar en el conjunto de prueba
    for i, test_point in enumerate(X_test):
        predicted_class = classify_min_distance(test_point, centroids)
        y_pred_kfold.append(predicted_class)
        if predicted_class == y_test[i]:
            correct_predictions_kfold += 1
    
    # Actualizar el conteo de muestras de prueba
    total_test_samples += len(X_test)

# Calcular y mostrar la precisión de 10-Fold Cross-Validation
accuracy_kfold = correct_predictions_kfold / total_test_samples
print(f"Precisión de 10-Fold Cross-Validation Estratificado: {accuracy_kfold:.2f}")

# Imprimir la matriz de confusión para 10-Fold Cross-Validation
print("Matriz de Confusión 10-Fold Cross-Validation:")
print(confusion_matrix(y, y_pred_kfold))

Precisión de Leave-One-Out Cross-Validation: 0.10
Matriz de Confusión Leave-One-Out:
[[ 10   3   1   0   0   0   6]
 [ 40  34  17   3   0   1  68]
 [572 182 253 103  15  39 293]
 [607 228 284 169  58  99 753]
 [122  65 103  99  26  74 391]
 [ 24   8  27  10  11  17  78]
 [  0   2   0   1   0   1   1]]
Precisión de Hold-Out Validation (70-30): 0.20
Matriz de Confusión Hold-Out:
[[  0   0   6   0   0   0   1]
 [  0   8  15   3   0   0  14]
 [ 13  56 235  22   7  13  80]
 [ 38  75 251  27  23  28 226]
 [ 22  22  74  15  16  18 113]
 [  2   5   7   1   3   3  28]
 [  0   0   0   0   0   0   0]]
Precisión de 10-Fold Cross-Validation Estratificado: 0.12
Matriz de Confusión 10-Fold Cross-Validation:
[[  5   2   1   3   2   0   7]
 [ 45  28  26   5   6   6  47]
 [359 212 246 117  25  62 436]
 [548 263 385 167  56  99 680]
 [234  80 159  70  27  45 265]
 [ 36  16  34  16   6  11  56]
 [  0   0   0   0   1   1   3]]
