In [1]:
import sys

import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [39]:
def carrega_dados(file_name, separador=',', drop_cols=None):
    data = pd.read_csv(f"../data/{file_name}", sep=separador)
    print("Reorganizando os dados...")
    data = data.sample(frac=1)

    try:
        data = data if not drop_cols else data.drop(drop_cols, axis=1)
    except KeyError:
        print(f"Colunas {drop_cols} não encontradas no DataFrame.")
        sys.exit(1)

    print("Dados carregados")
    return data

def encoding_cols(data, cols):
    print("Transformando colunas categóricas em numéricas...")

    if not data.columns.isin(cols).any():
        print(f"Colunas {cols} não encontradas no DataFrame.")
        raise KeyError(f"Colunas {cols} não encontradas no DataFrame.")
    

    label_encoder = LabelEncoder()

    for col in cols:
        if data[col].dtype == 'object':
            data[col] = label_encoder.fit_transform(data[col])
        else:
            print(f"A coluna {col} não é categórica, não será transformada.")
            raise ValueError(f"A coluna {col} não é categórica, não será transformada.")
        
    print("Colunas categóricas transformadas em numéricas")
    
    return data

def standardize_data(X_train, X_test):
    print("Normalizandos os dados com z-score")
    scaler = StandardScaler()

    if not isinstance(X_train, pd.DataFrame) or not isinstance(X_test, pd.DataFrame):
        raise ValueError("X_train e X_test devem ser DataFrames do pandas.")
    
    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError("X_train e X_test devem ter o mesmo número de colunas.")
    
    if X_train.isnull().values.any() or X_test.isnull().values.any():
        raise ValueError("X_train e X_test não podem conter valores nulos.")

    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    print("Dados normalizados com z-score")

    scaler_filename = 'scaler.pkl'
    with open(f"../artifacts/{scaler_filename}", 'wb') as file:
        pickle.dump(scaler, file)
    return X_train, X_test

def standardize_data_from_file(data, scaler_filename):
    print("Normalizando os dados com z-score")
    scaler = pickle.load(open(f"../artifacts/{scaler_filename}", 'rb'))

    if not isinstance(data, pd.DataFrame):
        raise ValueError("data devem ser DataFrames do pandas.")
    
    if data.isnull().values.any():
        raise ValueError("data não pode conter valores nulos.")

    data = scaler.transform(data)
    print("Dados normalizados com z-score")

    return data

def save_model(model, filename):
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    print("Modelo salvo")

def load_model(filename):
    with open(filename, 'rb') as file:
        model = pickle.load(file)
    return model

def get_x_y(data, y_label, x_cols=None):
    print("Preparando amostras de treino e validação")
    try:
        X = data.drop(y_label, axis=1)
        X = X if not x_cols else X[x_cols]
        y = data[y_label]
    except KeyError:
        print(f"Colunas {y_label} ou {x_cols} não encontradas no DataFrame.")
        sys.exit(1)

    print("Amostras de treino e validação preparadas")

    return X, y


#### Carregamento

In [18]:
data = carrega_dados("heart_disease_dataset.csv", drop_cols=["Gender"])
data_train, data_test = train_test_split(data, test_size=0.1)

Reorganizando os dados...
Dados carregados


#### Split de treinamento

In [19]:
X, y = get_x_y(data_train, "Heart Disease")

Preparando amostras de treino e validação
Amostras de treino e validação preparadas


In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

#### Pré-processamento

In [21]:
X_train = encoding_cols(X_train, X_train.select_dtypes(include=['object']).columns.tolist())
X_val = encoding_cols(X_val, X_val.select_dtypes(include=['object']).columns.tolist())

X_train, X_val = standardize_data(X_train, X_val)

Transformando colunas categóricas em numéricas...
Colunas categóricas transformadas em numéricas
Transformando colunas categóricas em numéricas...
Colunas categóricas transformadas em numéricas
Normalizandos os dados com z-score
Dados normalizados com z-score


#### Treinamento

In [25]:
arvore = tree.DecisionTreeClassifier()
arvore.fit(X_train, y_train)
arvore

In [26]:
knn = KNeighborsClassifier(metric="euclidean", n_neighbors=3)
knn.fit(X_train, y_train)
knn

In [27]:
log_reg = LogisticRegression(solver="sag")
log_reg.fit(X_train, y_train)
log_reg

#### Validação

In [28]:
y_pred_arvore = arvore.predict(X_val)
y_pred_knn = knn.predict(X_val)
y_pred_logreg = log_reg.predict(X_val)

In [34]:
print(confusion_matrix(y_val, y_pred_arvore))
print(classification_report(y_val, y_pred_arvore))

[[120   0]
 [  0  60]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       120
           1       1.00      1.00      1.00        60

    accuracy                           1.00       180
   macro avg       1.00      1.00      1.00       180
weighted avg       1.00      1.00      1.00       180



In [35]:
print(confusion_matrix(y_val, y_pred_knn))
print(classification_report(y_val, y_pred_knn))

[[101  19]
 [ 16  44]]
              precision    recall  f1-score   support

           0       0.86      0.84      0.85       120
           1       0.70      0.73      0.72        60

    accuracy                           0.81       180
   macro avg       0.78      0.79      0.78       180
weighted avg       0.81      0.81      0.81       180



In [36]:
print(confusion_matrix(y_val, y_pred_logreg))
print(classification_report(y_val, y_pred_logreg))

[[111   9]
 [ 16  44]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       120
           1       0.83      0.73      0.78        60

    accuracy                           0.86       180
   macro avg       0.85      0.83      0.84       180
weighted avg       0.86      0.86      0.86       180



#### Comparação e análise de erros

In [41]:
data_test_transformed = encoding_cols(data_test, data_test.select_dtypes(include=['object']).columns.tolist())
data_test_transformed = data_test_transformed.drop("Heart Disease", axis=1)
data_test_transformed = standardize_data_from_file(data_test_transformed, "scaler.pkl")
y_test = data_test["Heart Disease"]

Transformando colunas categóricas em numéricas...
Colunas categóricas transformadas em numéricas
Normalizando os dados com z-score
Dados normalizados com z-score


In [43]:
y_arvore_teste = arvore.predict(data_test_transformed)
y_knn_teste = knn.predict(data_test_transformed)
y_logreg_test = log_reg.predict(data_test_transformed)

In [48]:
data_test["Respostas Arvore"] = y_arvore_teste
data_test["Respostas KNN"] = y_knn_teste
#data_test["Respostas LogReg"] = y_logreg_test

In [49]:
data_test.head(20)

Unnamed: 0,Age,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease,Respostas LogReg,Respostas Arvore,Respostas KNN
617,31,204,165,75,2,1,7,0,0,1,5,192,1,2,0,0,0,0
490,32,262,123,99,2,1,0,0,0,0,2,100,0,0,0,0,0,0
813,25,304,177,82,0,1,5,1,0,0,9,170,1,0,0,0,0,0
540,62,278,137,61,0,0,7,0,1,1,7,101,0,0,1,1,1,0
332,44,320,99,82,2,0,5,0,0,0,8,101,0,1,0,0,0,1
781,49,201,172,75,0,0,2,1,0,1,5,162,0,0,0,0,0,1
254,77,212,161,98,1,1,7,1,0,1,1,128,1,2,1,1,1,0
732,40,323,117,80,2,0,9,1,1,0,1,187,0,2,0,0,0,0
673,55,321,167,88,1,1,0,0,0,0,5,83,1,0,1,1,1,0
32,41,234,98,71,2,1,6,0,0,0,6,73,1,1,0,0,0,0


In [54]:
data_test.to_excel("../data/comparativo_modelos_heart_disease.xlsx")

In [53]:
!pip install openpyxl

Collecting openpyxl
  Obtaining dependency information for openpyxl from https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl.metadata
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Obtaining dependency information for et-xmlfile from https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl.metadata
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A