In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix

import torch
from torch import nn
from torch.utils.data import DataLoader,TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns



PyTorch Lightning, é um framework leve para treinamento que visa simplificar e acelerar o processo de treinamento.

In [2]:
# Configuração para garantir a reprodutibilidade dos resultados
SEED = 2
# Definem a semente aleatória para as bibliotecas NumPy e PyTorch
np.random.seed(SEED)
torch.manual_seed(SEED) # CPU
torch.cuda.manual_seed(SEED) # GPU
torch.cuda.manual_seed_all(SEED) # GPUs

# Configuração para garantir que a biblioteca cuDNN do PyTorch gere resultados determinísticos (usado para aceleração em GPU)
torch.backends.cudnn.deterministic = True

# Verifica se há uma GPU disponível e define o dispositivo para "cuda" (GPU) ou "cpu" (CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Carrega o Dataset
dataset = pd.read_csv("Datasets\dataset_SP_Obitos_Scaler.csv")
dataset

Unnamed: 0,CS_SEXO,NU_IDADE_N,CS_GESTANT,NOSOCOMIAL,FEBRE,TOSSE,GARGANTA,DISPNEIA,DESC_RESP,SATURACAO,...,TOMO_RES,RES_AN,VACINA_COV,QTD_DIAS,DIAS_INTERNA,SINT_ATE_NOTIF,PCR_EVOLUCAO,DIAS_DOSE2,DIAS_DOSE1,EVOLUCAO
0,1.016317,-1.525360,-0.841847,-0.247347,-0.681876,-0.533104,-0.318904,0.130633,-0.028625,0.090063,...,0.612697,0.575506,-0.370379,-0.448566,-0.606642,-0.333757,-0.245809,-0.035479,-0.069846,0
1,-0.983945,0.288959,0.663434,-0.247347,2.634754,3.009587,-0.940007,2.830029,2.417626,-0.566145,...,0.612697,-1.068704,-0.370379,0.265732,0.567583,-0.333757,-0.505612,-0.007870,-0.016540,0
2,1.016317,-0.618201,-0.841847,-0.247347,-0.018550,-0.533104,-0.940007,-0.544216,-0.640188,0.090063,...,0.612697,1.123576,-0.370379,-0.364531,-0.537570,-0.292220,-0.505612,-0.029875,-0.043009,0
3,-0.983945,-0.247090,0.663434,-0.247347,-0.681876,-0.533104,-0.318904,-0.544216,-0.640188,-0.566145,...,0.612697,1.671646,1.135630,0.307750,0.636655,0.123142,0.966608,-0.043817,-0.121681,1
4,1.016317,-0.906842,-0.841847,-0.247347,-0.018550,-0.533104,-0.940007,0.130633,-0.028625,0.090063,...,-1.847184,0.575506,-0.370379,-0.364531,-0.468498,-0.416829,-0.505612,-0.027962,-0.048156,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25205,-0.983945,0.412663,0.663434,-0.247347,-0.018550,-0.533104,-0.940007,-0.544216,-0.640188,-0.566145,...,-1.847184,0.575506,-0.370379,-0.406549,-0.606642,0.040070,-0.505612,0.020014,0.057720,1
25206,1.016317,0.825008,-0.841847,-0.247347,-0.681876,0.175434,-0.318904,-0.544216,-0.640188,-0.566145,...,-1.355208,-1.068704,-0.370379,-0.196461,-0.330354,0.123142,-0.505612,0.019604,0.059190,1
25207,1.016317,1.072415,-0.841847,-0.247347,-0.018550,-0.533104,-0.318904,-0.544216,-0.640188,-0.566145,...,-1.355208,-1.068704,-0.370379,-0.490584,1.396447,0.870795,-0.505612,0.025208,0.069484,1
25208,1.016317,0.866242,0.663434,-0.247347,-0.018550,-0.533104,-0.318904,-0.544216,-0.028625,0.090063,...,-1.355208,-1.068704,-0.370379,-0.406549,-0.537570,-0.209148,-0.245809,0.000058,0.008091,1


Divisão do dataset em três conjuntos distintos: treinamento, validação e teste. Sendo 60% para treinamento, 20% para validação e 20% para teste.


In [4]:
# Cria um array para armazenar os índices do dataset original. Para embaralhar os índices e, posteriormente dividir os dados aleatoriamente.
index = np.array(dataset.index)

# Embaralha os índices de forma aleatória
np.random.shuffle(index)
# Número total de amostras no dataset
n = len(index)

# Seleciona os índices das primeiras 60% amostras embaralhadas para o conjunto de treinamento.
train_index = index[0:int(0.6*n)]
# As amostras da posição 60% até a posição 80% para o conjunto de validação
valid_index = index[int(0.6*n):int(0.8*n)]
# As amostras da posição 80% até o final para o conjunto de teste
test_index = index[int(0.8*n):]

# Cria um dataset para armazenar cada conjunto, treino, validação e teste, respectivamente e reindexa os índices
train_dset = dataset.loc[train_index].reset_index(drop=True)
valid_dset = dataset.loc[valid_index].reset_index(drop=True)
test_dset = dataset.loc[test_index].reset_index(drop=True)

In [5]:
# Obtendo os atributos (features) do dataset, excluindo a coluna 'EVOLUCAO'
input_features = dataset.columns.drop('EVOLUCAO').tolist()

# Obtendo os rótulos (target) do dataset
target = 'EVOLUCAO'
labels = dataset[target].tolist()

In [6]:
# Os dados são convertidos em tensores PyTorch
train_tensor_dset = TensorDataset(
    # converte os dados das colunas de entradas em tensores de ponto flutuante
    torch.tensor(train_dset[input_features].values, dtype=torch.float),
    # converte os dados da coluna de rótulos em tensores de ponto flutuante com formato de matriz
    torch.tensor(train_dset[target].values.reshape(-1,1), dtype=torch.float)
)

valid_tensor_dset = TensorDataset(
    torch.tensor(valid_dset[input_features].values, dtype=torch.float),
    torch.tensor(valid_dset[target].values.reshape(-1,1), dtype=torch.float)
)

test_tensor_dset = TensorDataset(
    torch.tensor(test_dset[input_features].values, dtype=torch.float),
    torch.tensor(test_dset[target].values.reshape(-1,1), dtype=torch.float) 
)

No contexto da biblioteca PyTorch, um tensor é uma estrutura de dados similar a um array NumPy, mas com suporte para aceleração de hardware através de GPUs.

A abordagem com tensores no PyTorch permite representar e manipular os dados de entrada e saída do modelo de forma eficiente, além de oferecer suporte para aceleração de hardware em GPUs, o que é essencial para realizar cálculos complexos em redes neurais profundas com grandes volumes de dados.

In [7]:
class CNN_1D_Khan(pl.LightningModule):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Camadas da arquitetura do artigo
        self.conv1 = nn.Conv1d(input_dim, 600, kernel_size=10, stride=1, padding=0)
        self.conv2 = nn.Conv1d(600, 120, kernel_size=10, stride=1, padding=0)
        self.conv3 = nn.Conv1d(120, 60, kernel_size=10, stride=1, padding=0)
        self.fc1 = nn.Linear(60, 200)
        self.fc2 = nn.Linear(200, output_dim)

        self.loss = nn.BCEWithLogitsLoss()


    def forward(self, x):
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        x = nn.functional.relu(self.conv3(x))
        x = x.max(dim=2)[0]  # Global Max Pooling
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    # Métricas
    def roc_auc(self, y_pred, y_true):
        # Calcula o ROC AUC usando sklearn.metrics.roc_auc_score
        roc_auc = roc_auc_score(y_true.cpu(), torch.sigmoid(y_pred).cpu())
        return torch.tensor(roc_auc)

    def accuracy(self, y_pred, y_true):
        # Arredonda as probabilidades previstas para obter as previsões binárias (0 ou 1)
        y_pred_labels = torch.round(torch.sigmoid(y_pred))
        # Calcula a acurácia usando sklearn.metrics.accuracy_score
        acc = accuracy_score(y_true.cpu(), y_pred_labels.cpu())
        return torch.tensor(acc)

    def f1(self, y_pred, y_true):
        # Arredonda as probabilidades previstas para obter as previsões binárias (0 ou 1)
        y_pred_labels = torch.round(torch.sigmoid(y_pred))
        # Calcula o F1-score usando sklearn.metrics.f1_score
        f1score = f1_score(y_true.cpu(), y_pred_labels.cpu())
        return torch.tensor(f1score)
    
    def calculate_confusion_matrix(self, y_pred, y_true):
        # Arredonda as probabilidades previstas para obter as previsões binárias (0 ou 1)
        y_pred_labels = torch.round(torch.sigmoid(y_pred))
        # Calcula a matriz de confusão usando as previsões e os rótulos verdadeiros
        cm = confusion_matrix(y_true.cpu(), y_pred_labels.cpu())
        return cm
    
    def plot_confusion_matrix(self, cm):
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['0', '1'], yticklabels=['0', '1'])
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.title("Confusion Matrix")
        plt.show()
        

    # define os passos de treinamento do modelo.
    def training_step(self, batch, batch_idx):
        X, y = batch
        y_hat = self.forward(X)
        loss = self.loss(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        y_hat = self.forward(X)
        loss = self.loss(y_hat, y)
        
        # Calcular métricas durante a validação
        valid_auc = self.roc_auc(y_hat, y)
        valid_acc = self.accuracy(y_hat, y)
        valid_f1 = self.f1(y_hat, y)
        
        self.log('valid_loss', loss)
        self.log('valid_auc', valid_auc, prog_bar=True)
        self.log('valid_acc', valid_acc, prog_bar=True)
        self.log('valid_f1', valid_f1, prog_bar=True)
        
    def test_step(self, batch, batch_idx):
        X, y = batch
        y_logit = self.forward(X)
        y_probs = torch.sigmoid(y_logit).detach().cpu().numpy()
        loss = self.loss(y_logit, y)

        # Calcular métricas
        roc_auc = self.roc_auc(y_logit, y)
        acc = self.accuracy(y_logit, y)
        f1score = self.f1(y_logit, y)
        cm = self.calculate_confusion_matrix(y_logit, y)
        
        self.plot_confusion_matrix(cm) # Plotar e salvar a matriz de confusão
        self.log('test_loss', loss)
        self.log('test_auc', roc_auc)
        self.log('test_acc', acc, prog_bar=True)
        self.log('test_f1', f1score, prog_bar=True)

    # ADAM
    def configure_optimizers(self):
    # Cria um otimizador Adam
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = {
            'scheduler': ReduceLROnPlateau(
                optimizer, 
                mode="min", 
                factor=0.5, 
                patience=5, 
                min_lr=1e-6),
            'interval': 'epoch',
            'frequency': 1,
            'reduce_on_plateau': True,
            'monitor': 'valid_loss',
        }
        return [optimizer], [scheduler]



In [8]:
model = CNN_1D_Khan(
    input_dim=len(input_features), 
    output_dim=1
)
print("\nNúmero de atributos (input_dim):", len(input_features))

# interromper o treinamento prematuramente se a métrica de validação não melhorar, após x épocas
early_stop_callback = EarlyStopping(
   monitor='valid_loss',
   min_delta=.0,
   patience=21,
   verbose=True,
   mode='min'
)

trainer = pl.Trainer(
    callbacks=[early_stop_callback],
    min_epochs=10, 
    max_epochs=200, 
    accelerator='gpu') # gpus=1

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs



Número de atributos (input_dim): 40




In [9]:
trainer.fit(
    model, 
    DataLoader(train_tensor_dset, batch_size=64, shuffle=True, num_workers=4),
    DataLoader(valid_tensor_dset, batch_size=64, shuffle=False, num_workers=4)
)

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type              | Params
--------------------------------------------
0 | conv1 | Conv1d            | 240 K 
1 | conv2 | Conv1d            | 720 K 
2 | conv3 | Conv1d            | 72.1 K
3 | fc1   | Linear            | 12.2 K
4 | fc2   | Linear            | 201   
5 | loss  | BCEWithLogitsLoss | 0     
--------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.181     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

RuntimeError: Given groups=1, weight of size [600, 40, 10], expected input[1, 64, 40] to have 40 channels, but got 64 channels instead

In [None]:
# Validation dataset
trainer.test(model, DataLoader(valid_tensor_dset, batch_size=64, shuffle=False, num_workers=4))

In [None]:
# Test dataset
trainer.test(model, DataLoader(test_tensor_dset, batch_size=2048, shuffle=False, num_workers=4))