# Importar librerías y leer datos

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load the dataset; fields in the CSV are separated by ';'
data = pd.read_csv("./archivos-bd/data_uci.csv", sep=";")

# Análisis exploratorio de datos

In [None]:
# First look at the data: display the first rows of the frame
data.head()

In [None]:
# Dimensions of the dataset as (rows, columns)
data.shape

In [None]:
# Class balance of the target column "riesgo": print the number of rows per
# label, then draw the same counts as a bar chart
risk_labels = data["riesgo"]
print(risk_labels.value_counts())

sns.countplot(x=risk_labels)

In [None]:
# Column names of the dataset
data.columns

In [None]:
# Pairwise correlation between the first five columns of the dataset
# NOTE(review): assumes the first five columns are the numeric features — confirm
corr = data.iloc[:, :5].corr()

# Annotated Seaborn heatmap of the correlation matrix with a diverging colour map
diverging_cmap = sns.diverging_palette(220, 20, as_cmap=True)
axis_labels = corr.columns
sns.heatmap(
    corr,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    annot=True,
    cmap=diverging_cmap,
)
plt.tight_layout()

In [None]:
# One histogram per feature, laid out side by side in a single row;
# each panel is titled with the name of the column it shows
hist_columns = ["edad", "td", "par", "fcm", "col"]
fig, ax = plt.subplots(1, len(hist_columns), figsize=(15, 4))
fig.suptitle('Distribucion de los datos')
for panel, column_name in zip(ax, hist_columns):
    sns.histplot(x=data[column_name], ax=panel)
    panel.set_title(column_name)
plt.tight_layout()

In [None]:
# Number of null values in each column
data.isnull().sum()

In [None]:
# Print the distinct values found in each column to spot implausible entries
labelled_columns = (
    ("Edad:", "edad"),
    ("Dolor toraxico:", "td"),
    ("Presion arterial reposo:", "par"),
    ("Colesterol", "col"),
    ("Frecuencia cardiaca maxima:", "fcm"),
    ("Riesgo:", "riesgo"),
)
for label, column_name in labelled_columns:
    print(label, data[column_name].unique())

# Preprocesamiento y normalización de los datos

In [None]:
# Rows whose maximum heart rate (fcm) equals 0
data[data["fcm"] == 0]

In [None]:
# Rows whose resting blood pressure (par) equals -1
data[data["par"] == -1]

In [None]:
# Impute maximum heart rate (fcm): zeros are replaced by the mean of the
# non-zero readings, truncated to an integer
fcm_nonzero = data.loc[data["fcm"] != 0, "fcm"]
mediaFcm = int(fcm_nonzero.mean())

data["fcm"] = data["fcm"].replace(0, mediaFcm)

# Sanity check: selects the rows that still hold a zero
print(data[data["fcm"] == 0])

data["fcm"].unique()

In [None]:
# Impute resting blood pressure (par): entries equal to -1 are replaced by the
# mean of the remaining readings, truncated to an integer.
# The mean is kept in its own variable (mediaPar) so it does not overwrite
# mediaFcm, which holds the heart-rate mean.
mediaPar = int(data[data["par"] != -1]["par"].mean())

data["par"] = data["par"].replace(-1, mediaPar)

# Sanity check: selects the rows that still hold -1
print(data[data["par"] == -1])

data["par"].unique()

In [None]:
# Box plot of the dataset to compare medians and spread across columns and
# to reveal outliers
sns.boxplot(data=data)

## Observación

Encontramos que hay un valor de colesterol que está completamente alejado de los valores posibles, ya que es más del doble del nivel de colesterol aceptado.
La decisión que vamos a tomar es reemplazar ese valor por el colesterol medio, porque pensamos que se debe a un error de medición. También lo pensamos porque los valores de los otros atributos que acompañan a este dato son adecuados y el riesgo de cardiopatía asociado es bajo.

In [None]:
# Locate the outlier: rows whose cholesterol (col) is 500 or more
data[data["col"] >= 500]

In [None]:
# Replace the outlying cholesterol reading (564) with the mean of the other
# readings, truncated to an integer.
# NOTE(review): the outlier value is hard-coded rather than derived from a threshold.
col_without_outlier = data.loc[data["col"] != 564, "col"]
mediaCol = int(col_without_outlier.mean())

data["col"] = data["col"].replace(564, mediaCol)

# Sanity check: selects the rows that still hold the outlier value
print(data[data["col"] == 564])

In [None]:
# Balance the classes by randomly re-sampling rows of the minority class.
# random_state is fixed so the resampled dataset is identical on every run
# (without it the duplicated rows, and every metric computed later, change
# between executions).
# NOTE(review): resampling happens before the train/test split performed later
# in the notebook, so copies of the same row can land in both partitions and
# inflate the test metrics — consider resampling only the training partition.
oversample = RandomOverSampler(sampling_strategy="minority", random_state=1)

# Features: first five columns; target: the "riesgo" column
data_sin_riesgo = data.iloc[:, :5]
data_riesgo = data["riesgo"]

data_sin_riesgo_over, data_riesgo_over = oversample.fit_resample(data_sin_riesgo, data_riesgo)

# Bar chart of the label counts after resampling
sns.countplot(x=data_riesgo_over)

In [None]:
# Min-max scaling: (value - column minimum) / (column maximum - column minimum)
feature_min = data_sin_riesgo_over.min()
feature_range = data_sin_riesgo_over.max() - feature_min
normalized_df = (data_sin_riesgo_over - feature_min) / feature_range

# Box plot of the scaled features
sns.boxplot(normalized_df)

In [None]:
# Display the normalized dataset
normalized_df

In [None]:
# Encode the target labels as integers: 'alto' -> 1, 'bajo' -> 0.
# Series.map turns any value missing from the mapping into NaN, so this cell
# is meant to be run once on the original string labels.
risk_codes = {'alto': 1, 'bajo': 0}
data_riesgo_over = data_riesgo_over.map(risk_codes)
data_riesgo_over

# Entrenamiento del modelo

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from scipy.stats import randint

In [None]:
# Split the dataset: 40% of the samples are held out for testing and the
# seed keeps the partition repeatable
x = normalized_df.copy()
y = data_riesgo_over.copy()

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=1,
)

# Shapes of the full, training and test partitions (features first, then labels)
for full_set, train_part, test_part in ((x, x_train, x_test), (y, y_train, y_test)):
    print(full_set.shape, train_part.shape, test_part.shape)

## Regresión Logística

In [None]:
# Logistic regression with default hyper-parameters (verbose solver output)
model_log_reg = LogisticRegression(verbose=2)

# Fit the model on the training partition
model_log_reg.fit(x_train, y_train)

# Metrics on the training partition.
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# recall_score actually returns precision, so the true labels go first.
x_train_prediction = model_log_reg.predict(x_train)
train_accuracy = accuracy_score(y_train, x_train_prediction)
train_recall = recall_score(y_train, x_train_prediction)
train_f1 = f1_score(y_train, x_train_prediction)
print(f'Entrenamiento - Accuracy: {train_accuracy:.2f}')
print(f'Entrenamiento - Recall: {train_recall:.2f}')
print(f'Entrenamiento - F1-Score: {train_f1:.2f}')

# Metrics on the test partition (same argument order: true labels first)
x_test_prediction = model_log_reg.predict(x_test)
test_accuracy = accuracy_score(y_test, x_test_prediction)
test_recall = recall_score(y_test, x_test_prediction)
test_f1 = f1_score(y_test, x_test_prediction)
print(f'Test - Accuracy: {test_accuracy:.2f}')
print(f'Test - Recall: {test_recall:.2f}')
print(f'Test - F1-Score: {test_f1:.2f}')

## Regresión Logística con regularización Lasso

In [None]:
# Logistic regression with an L1 (Lasso) penalty, C=0.5, fitted with the
# liblinear solver. Reuses the name model_log_reg, replacing any model
# previously bound to it.
model_log_reg = LogisticRegression(penalty="l1", C=0.5, verbose=2, solver="liblinear")

# Fit the model on the training partition
model_log_reg.fit(x_train, y_train)

# Metrics on the training partition.
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# recall_score actually returns precision, so the true labels go first.
x_train_prediction = model_log_reg.predict(x_train)
train_accuracy = accuracy_score(y_train, x_train_prediction)
train_recall = recall_score(y_train, x_train_prediction)
train_f1 = f1_score(y_train, x_train_prediction)
print(f'Entrenamiento - Accuracy: {train_accuracy:.2f}')
print(f'Entrenamiento - Recall: {train_recall:.2f}')
print(f'Entrenamiento - F1-Score: {train_f1:.2f}')

# Metrics on the test partition (same argument order: true labels first)
x_test_prediction = model_log_reg.predict(x_test)
test_accuracy = accuracy_score(y_test, x_test_prediction)
test_recall = recall_score(y_test, x_test_prediction)
test_f1 = f1_score(y_test, x_test_prediction)
print(f'Test - Accuracy: {test_accuracy:.2f}')
print(f'Test - Recall: {test_recall:.2f}')
print(f'Test - F1-Score: {test_f1:.2f}')

## Naive-Bayes

In [None]:
# Gaussian Naive Bayes classifier with default settings
model_NB = GaussianNB()

# Fit the model on the training partition
model_NB.fit(x_train, y_train)

# Metrics on the training partition.
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# recall_score actually returns precision, so the true labels go first.
x_train_prediction = model_NB.predict(x_train)
train_accuracy = accuracy_score(y_train, x_train_prediction)
train_recall = recall_score(y_train, x_train_prediction)
train_f1 = f1_score(y_train, x_train_prediction)
print(f'Entrenamiento - Accuracy: {train_accuracy:.2f}')
print(f'Entrenamiento - Recall: {train_recall:.2f}')
print(f'Entrenamiento - F1-Score: {train_f1:.2f}')

# Metrics on the test partition (same argument order: true labels first)
x_test_prediction = model_NB.predict(x_test)
test_accuracy = accuracy_score(y_test, x_test_prediction)
test_recall = recall_score(y_test, x_test_prediction)
test_f1 = f1_score(y_test, x_test_prediction)
print(f'Test - Accuracy: {test_accuracy:.2f}')
print(f'Test - Recall: {test_recall:.2f}')
print(f'Test - F1-Score: {test_f1:.2f}')

## K-Nearest-Neighbors

In [None]:
# Candidate neighbourhood sizes to try: k = 1 .. 20
k_range = list(range(1, 21))
param_grid = {"n_neighbors": k_range}

# Exhaustive search over k with 7-fold cross-validation, scored by recall
grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    cv=7,
    scoring="recall",
)

# fit() returns the fitted search object itself, so both names refer to it
grid.fit(x_train, y_train)
grid_search = grid

test_score = grid_search.score(x_test, y_test)
print("Score en el conjunto de tests: {:.2f}".format(test_score))
print("Mejores parámetros: {}".format(grid_search.best_params_))
print("Mejor Score de validacion Cruzada: {:.2f}".format(grid_search.best_score_))

In [None]:
# K-nearest-neighbours classifier using the k selected by the grid search
model_KNN = KNeighborsClassifier(n_neighbors=grid_search.best_params_["n_neighbors"])

# Fit the model on the training partition
model_KNN.fit(x_train, y_train)

# Metrics on the training partition.
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# recall_score actually returns precision, so the true labels go first.
x_train_prediction = model_KNN.predict(x_train)
train_accuracy = accuracy_score(y_train, x_train_prediction)
train_recall = recall_score(y_train, x_train_prediction)
train_f1 = f1_score(y_train, x_train_prediction)
print(f'Entrenamiento - Accuracy: {train_accuracy:.2f}')
print(f'Entrenamiento - Recall: {train_recall:.2f}')
print(f'Entrenamiento - F1-Score: {train_f1:.2f}')

# Metrics on the test partition (same argument order: true labels first)
x_test_prediction = model_KNN.predict(x_test)
test_accuracy = accuracy_score(y_test, x_test_prediction)
test_recall = recall_score(y_test, x_test_prediction)
test_f1 = f1_score(y_test, x_test_prediction)
print(f'Test - Accuracy: {test_accuracy:.2f}')
print(f'Test - Recall: {test_recall:.2f}')
print(f'Test - F1-Score: {test_f1:.2f}')

## Random Forest Classifier

In [None]:
# Random forest classifier with 150 trees.
# NOTE(review): no random_state is set, so results vary between runs.
model_RFC = RandomForestClassifier(n_estimators=150)

# Fit the model on the training partition
model_RFC.fit(x_train, y_train)

# Metrics on the training partition.
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# recall_score actually returns precision, so the true labels go first.
x_train_prediction = model_RFC.predict(x_train)
train_accuracy = accuracy_score(y_train, x_train_prediction)
train_recall = recall_score(y_train, x_train_prediction)
train_f1 = f1_score(y_train, x_train_prediction)
print(f'Entrenamiento - Accuracy: {train_accuracy:.2f}')
print(f'Entrenamiento - Recall: {train_recall:.2f}')
print(f'Entrenamiento - F1-Score: {train_f1:.2f}')

# Metrics on the test partition (same argument order: true labels first)
x_test_prediction = model_RFC.predict(x_test)
test_accuracy = accuracy_score(y_test, x_test_prediction)
test_recall = recall_score(y_test, x_test_prediction)
test_f1 = f1_score(y_test, x_test_prediction)
print(f'Test - Accuracy: {test_accuracy:.2f}')
print(f'Test - Recall: {test_recall:.2f}')
print(f'Test - F1-Score: {test_f1:.2f}')

In [None]:
# Search space for the random forest: number of trees and maximum tree depth,
# each drawn from a discrete uniform distribution
param_dist = {'n_estimators': randint(50, 500),
              'max_depth': randint(1, 20)}

# Randomised search: 5 sampled configurations, each evaluated with 5-fold
# cross-validation
rand_search = RandomizedSearchCV(RandomForestClassifier(),
                                 param_distributions=param_dist,
                                 n_iter=5,
                                 cv=5)

# Run the search on the training partition
rand_search.fit(x_train, y_train)

# Best model found. RandomizedSearchCV refits it on the whole training
# partition by default (refit=True), so no additional fit() call is needed;
# fitting again would only train a different, unseeded forest.
best_rf = rand_search.best_estimator_

# Show the selected hyper-parameters
print('Best hyperparameters:',  rand_search.best_params_)

# Metrics on the training partition.
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# recall_score actually returns precision, so the true labels go first.
x_train_prediction = best_rf.predict(x_train)
train_accuracy = accuracy_score(y_train, x_train_prediction)
train_recall = recall_score(y_train, x_train_prediction)
train_f1 = f1_score(y_train, x_train_prediction)
print(f'Entrenamiento - Accuracy: {train_accuracy:.2f}')
print(f'Entrenamiento - Recall: {train_recall:.2f}')
print(f'Entrenamiento - F1-Score: {train_f1:.2f}')

# Metrics on the test partition (same argument order: true labels first)
x_test_prediction = best_rf.predict(x_test)
test_accuracy = accuracy_score(y_test, x_test_prediction)
test_recall = recall_score(y_test, x_test_prediction)
test_f1 = f1_score(y_test, x_test_prediction)
print(f'Test - Accuracy: {test_accuracy:.2f}')
print(f'Test - Recall: {test_recall:.2f}')
print(f'Test - F1-Score: {test_f1:.2f}')

## Perceptrón Multicapa

In [None]:
# Multi-layer perceptron with three hidden layers (50, 100, 50), ReLU
# activations and the L-BFGS solver; the seed makes the weight
# initialisation repeatable.
# NOTE(review): per the scikit-learn docs, learning_rate is only used by the
# 'sgd' solver, so "adaptive" presumably has no effect here — confirm.
model_MLP = MLPClassifier(
    hidden_layer_sizes=(50, 100, 50),
    activation="relu",
    max_iter=1500,
    alpha=2,
    solver="lbfgs",
    verbose=0,
    random_state=2,
    tol=1e-9,
    learning_rate="adaptive",
)

# Fit the model on the training partition
model_MLP.fit(x_train, y_train)

# Metrics on the training partition.
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# recall_score actually returns precision, so the true labels go first.
x_train_prediction = model_MLP.predict(x_train)
train_accuracy = accuracy_score(y_train, x_train_prediction)
train_recall = recall_score(y_train, x_train_prediction)
train_f1 = f1_score(y_train, x_train_prediction)
print(f'Entrenamiento - Accuracy: {train_accuracy:.2f}')
print(f'Entrenamiento - Recall: {train_recall:.2f}')
print(f'Entrenamiento - F1-Score: {train_f1:.2f}')

# Metrics on the test partition (same argument order: true labels first)
x_test_prediction = model_MLP.predict(x_test)
test_accuracy = accuracy_score(y_test, x_test_prediction)
test_recall = recall_score(y_test, x_test_prediction)
test_f1 = f1_score(y_test, x_test_prediction)
print(f'Test - Accuracy: {test_accuracy:.2f}')
print(f'Test - Recall: {test_recall:.2f}')
print(f'Test - F1-Score: {test_f1:.2f}')