In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Métodos de Validación
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Modelos de Clasificación
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Métricas para Clasificación
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("Heart Attack.csv")

In [4]:
df

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.80,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.060,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,negative
1315,66,1,84,125,55,149.0,1.33,0.172,positive
1316,45,1,85,168,104,96.0,1.24,4.250,positive
1317,54,1,58,117,68,443.0,5.80,0.359,positive


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1319 non-null   int64  
 1   gender         1319 non-null   int64  
 2   impluse        1319 non-null   int64  
 3   pressurehight  1319 non-null   int64  
 4   pressurelow    1319 non-null   int64  
 5   glucose        1319 non-null   float64
 6   kcm            1319 non-null   float64
 7   troponin       1319 non-null   float64
 8   class          1319 non-null   object 
dtypes: float64(3), int64(5), object(1)
memory usage: 92.9+ KB


In [6]:
df.describe()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,56.191812,0.659591,78.336619,127.170584,72.269143,146.634344,15.274306,0.360942
std,13.647315,0.474027,51.63027,26.12272,14.033924,74.923045,46.327083,1.154568
min,14.0,0.0,20.0,42.0,38.0,35.0,0.321,0.001
25%,47.0,0.0,64.0,110.0,62.0,98.0,1.655,0.006
50%,58.0,1.0,74.0,124.0,72.0,116.0,2.85,0.014
75%,65.0,1.0,85.0,143.0,81.0,169.5,5.805,0.0855
max,103.0,1.0,1111.0,223.0,154.0,541.0,300.0,10.3


In [7]:
class_labelEncoding = LabelEncoder()
clase = class_labelEncoding.fit_transform(df["class"].values)
df["class"] = clase

In [8]:
df

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.80,0.012,0
1,21,1,94,98,46,296.0,6.75,1.060,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,0
1315,66,1,84,125,55,149.0,1.33,0.172,1
1316,45,1,85,168,104,96.0,1.24,4.250,1
1317,54,1,58,117,68,443.0,5.80,0.359,1


In [9]:
df["class"].value_counts()

1    810
0    509
Name: class, dtype: int64

In [10]:
X = df.drop("class", axis=1)
y = np.array(df["class"])

In [11]:
X.shape, y.shape

((1319, 8), (1319,))

In [12]:
x_scaler = MinMaxScaler()

X = x_scaler.fit_transform(X)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 45)

print(f"Conjunto de Train: {X_train.shape, y_train.shape}")
print(f"Conjunto de Test: {X_test.shape, y_test.shape}")

Conjunto de Train: ((989, 8), (989,))
Conjunto de Test: ((330, 8), (330,))


In [14]:
models = [KNeighborsClassifier(), NearestCentroid(), GaussianNB(), LogisticRegression(),
         DecisionTreeClassifier(), RandomForestClassifier(), SVC(), GradientBoostingClassifier(), AdaBoostClassifier()]

In [15]:
datos_metricas = list()

for model in models:
    model.fit(X_train, y_train.ravel())
    
    yhat = model.predict(X_test)
    
    Jaccard_Index = jaccard_score(y_test, yhat, average = "macro")
    Exactitud = accuracy_score(y_test, yhat)
    Precision = precision_score(y_test, yhat, average = "macro")
    Sensibilidad = recall_score(y_test, yhat, average = "macro")
    F1_score = f1_score(y_test, yhat, average = "macro")
    Roc_auc = roc_auc_score(y_test, yhat, average = "macro")
    
    datos_metricas.append([model, str(model).rstrip("()"), Jaccard_Index, Exactitud, Precision, Sensibilidad, F1_score, Roc_auc])
    
df_metricas = pd.DataFrame(data = datos_metricas, columns = ["Model", "Modelo", "Jaccard_Index", "Exactitud", "Precision", 
                                                             "Sensibilidad", "F1_score", "Roc_auc"])

In [16]:
mejor_modelo = df_metricas.sort_values("F1_score", ascending = False)["Model"].iloc[0]

mejor_modelo

In [17]:
df_metricas.drop("Model", axis = 1, inplace = True)
df_metricas.sort_values("F1_score", ascending = False)

Unnamed: 0,Modelo,Jaccard_Index,Exactitud,Precision,Sensibilidad,F1_score,Roc_auc
7,GradientBoostingClassifier,0.987041,0.993939,0.993475,0.993475,0.993475,0.993475
8,AdaBoostClassifier,0.987041,0.993939,0.993475,0.993475,0.993475,0.993475
5,RandomForestClassifier,0.980662,0.990909,0.989399,0.991083,0.99023,0.991083
4,DecisionTreeClassifier,0.974173,0.987879,0.988689,0.985211,0.986905,0.985211
6,SVC,0.548387,0.736364,0.719663,0.692692,0.699922,0.692692
3,LogisticRegression,0.53548,0.736364,0.731921,0.677033,0.685235,0.677033
2,GaussianNB,0.495226,0.663636,0.760776,0.73445,0.661994,0.73445
0,KNeighborsClassifier,0.484624,0.666667,0.646311,0.651588,0.647997,0.651588
1,NearestCentroid,0.390611,0.593939,0.551046,0.547194,0.547223,0.547194


In [18]:
# GridSearch 

model = mejor_modelo

params = {"loss" : ["log_loss", "exponencial"],
          "criterion" : ["friedman_mse", "squared_error", "mse", "mae"],
          "max_features" : ["auto", "sqrt", "log2"],
         "n_estimators": [100, 150, 200, 250, 300]} 

scorers = {"f1_macro", "accuracy", "recall_macro", "roc_auc" }

grid_solver = GridSearchCV(estimator  = model     , 
                           param_grid = params    , 
                           scoring    = scorers   ,
                           cv         = 5         ,
                           refit      = "accuracy",
                           n_jobs     = -1        ,
                           verbose    = 3)

model_result = grid_solver.fit(X, y.ravel())

print("*"*100)

print(model_result.best_score_)
print(model_result.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
****************************************************************************************************
0.9886306026039865
{'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_features': 'log2', 'n_estimators': 300}


In [19]:
# Stratified k-Fold

skfold = StratifiedKFold(n_splits = 10)
y_test_real, yhat = list(), list()


for train_index, test_index in skfold.split(X, y): 
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Modelo
    modelo = mejor_modelo
    modelo.fit(X_train, y_train)
    
    # Prediccion
    yhat1 = modelo.predict(X_test)
    yhat.extend(yhat1)
    
    # Valores reales
    y_test_real.extend(y_test)

print("Accuracy:", accuracy_score(y_test_real, yhat)) 

Accuracy: 0.9855951478392722
