# Práctica 5: Meta aprendizaje
## Dataset: <a src="https://www.kaggle.com/datasets/sooyoungher/smoking-drinking-dataset/">Smoking and Drinking Dataset with body signal</a>

### Contexto

En la practica anterior generamos y analizamos 10 modelos de clasificación, utilizando validación cruzada y calculando su sensibilidad y espesificiadad. Esto con el objetivo de obtener el mejor modelo para clasificar si una persona es bebedora o no. Los 2 mejores modelos fueron los siguientes:

<table>
    <thead>
        <tr>
            <th colspan=4>Sin PCA</th>
        </tr>
        <tr>
            <th>Modelo</th>
            <th>Exactitud</th>
            <th>Sencibilidad</th>
            <th>Especificidad</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>Neural Net</td>
            <td>72.300000</td>
            <td>72.933333</td>
            <td>71.433333</td>
        </tr>
        <tr>
            <td>Gaussian Process</td>
            <td>72.150000</td>
            <td>71.466667</td>
            <td>72.833333</td>
        </tr>
    </tbody>
</table>

### Importando Librerias

In [16]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score
from numpy import mean
from matplotlib import pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

### Pre-procesado de datos

#### Limpieza

In [2]:
df = pd.read_csv(f'smoking_driking_dataset_Ver01.csv')
df = df.drop_duplicates()
df.loc[:, ["hear_left", "hear_right", "urine_protein", "SMK_stat_type_cd"]] = df[
    ["hear_left", "hear_right", "urine_protein", "SMK_stat_type_cd"]].astype("category")
df["DRK_YN"] = df["DRK_YN"].astype("category")
df["sex"] = df["sex"].astype("category")  
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991320 entries, 0 to 991345
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   sex               991320 non-null  category
 1   age               991320 non-null  int64   
 2   height            991320 non-null  int64   
 3   weight            991320 non-null  int64   
 4   waistline         991320 non-null  float64 
 5   sight_left        991320 non-null  float64 
 6   sight_right       991320 non-null  float64 
 7   hear_left         991320 non-null  category
 8   hear_right        991320 non-null  category
 9   SBP               991320 non-null  float64 
 10  DBP               991320 non-null  float64 
 11  BLDS              991320 non-null  float64 
 12  tot_chole         991320 non-null  float64 
 13  HDL_chole         991320 non-null  float64 
 14  LDL_chole         991320 non-null  float64 
 15  triglyceride      991320 non-null  float64 
 16  hemoglo

#### Ingenieria de Caracteristicas

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in ["sex","hear_right","hear_left", "DRK_YN"]:
    df.loc[:, [col]] = le.fit_transform(df[[col]])

from sklearn.preprocessing import OrdinalEncoder
ordinal = OrdinalEncoder()
for col in ["urine_protein","SMK_stat_type_cd"]:
    df.loc[:, [col]] = ordinal.fit_transform(df[[col]])

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for col in ["age","height"]:
    df.loc[:, [col]] = scaler.fit_transform(df[[col]])

from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
columns_to_encode = ["urine_protein", "SMK_stat_type_cd"]
df[columns_to_encode] = encoder.fit_transform(df[columns_to_encode])

from sklearn.preprocessing import RobustScaler
robust = RobustScaler()
columnas = ['weight', 'waistline', 'sight_left', 'sight_right','SBP', 'DBP', 'BLDS', 'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'hemoglobin','serum_creatinine', 'SGOT_AST', 'SGOT_ALT', 'gamma_GTP']
for col in columnas:
    df.loc[:, [col]] = robust.fit_transform(df[[col]])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


#### Separando la variable objetivo

In [4]:
df_Y = df.loc[df["DRK_YN"]==1]
df_Y = df_Y[:3000]
df_N = df.loc[df["DRK_YN"]==0]
df_N = df_N[:3000]
dataset = pd.concat([df_Y, df_N],axis=0)
dataset = dataset.sample(frac=1).reset_index(drop=True)
Y = dataset["DRK_YN"]  # Variable objetivo
X = dataset.drop(columns=["DRK_YN"])  # Características

### Definiendo los algoritmos de inferencia

In [5]:
clf1 = DecisionTreeClassifier()
clf2 = GaussianNB()
clf3 = KNeighborsClassifier(n_neighbors=3)
clf4 = svm.SVC()
lr = LogisticRegression()

In [15]:
nombres = [
    "Gaussian Process",
    "Neural Net",
    "AdaBoost",
    "Random Forest",
    "Boosting",
    "Bagging",
    "Stacking", 
    "Voting"
]

modelos = [
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(n_estimators=10, random_state=0),
    RandomForestClassifier(n_estimators=10, random_state=0),
    GradientBoostingClassifier(n_estimators=10),
    BaggingClassifier(n_estimators=10),
    StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf4], meta_classifier=lr, random_state=42),
    VotingClassifier(estimators=[('dt', clf1), ('nb', clf2), ('kNN', clf3), ('svm', clf4)],voting='hard')
]

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

### Experimento 1

In [17]:
metricas_df = pd.DataFrame(columns=['Modelo', 'Exactitud', 'Sensibilidad', 'Especificidad'])
for nombre, modelo in zip(nombres, modelos):
    scores = []
    sensibilidades = []
    especificidades = []
    matrices_de_confusion = []
    for train_index, test_index in sss.split(X, Y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        modelo.fit(X_train, y_train)
        y_pred = modelo.predict(X_test)
        scores.append(accuracy_score(y_test, y_pred))
        conf_matrix = confusion_matrix(y_test, y_pred)
        matrices_de_confusion.append(conf_matrix)
        # Calcular sensibilidad y especificidad
        sensibilidad = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
        especificidad = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])
        sensibilidades.append(sensibilidad)
        especificidades.append(especificidad)
    
    matriz_confusion_promedio = sum(matrices_de_confusion) // len(matrices_de_confusion)
    sensibilidad_promedio = mean(sensibilidades)
    especificidad_promedio = mean(especificidades)
    exactitud_promedio = mean(scores)
    
    # Agregar métricas al DataFrame
    metrics = {
        'Modelo': nombre,
        'Exactitud': exactitud_promedio * 100,
        'Sensibilidad': sensibilidad_promedio * 100,
        'Especificidad': especificidad_promedio * 100
    }
    
    metricas_df = pd.concat([metricas_df, pd.DataFrame(metrics, index=[0])], ignore_index=True)
    
    average_confusion_np = np.array(matriz_confusion_promedio)
    plt.figure(figsize=(6, 4))
    sns.heatmap(average_confusion_np, annot=True, cmap='Blues', fmt='d', cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Matriz de Confusión Promedio de '+nombre)
    plt.show()

metricas_df

KeyboardInterrupt: 