In [95]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import random
from glob import glob
from tqdm import tqdm
from scipy.io import loadmat

In [73]:
data_dir = "/media/mountHDD2/khoibaocon"
print(os.listdir(data_dir))

['TrainingSet3', 'Label.csv', 'alldata', 'TrainingSet1', 'single_label.csv', 'TrainingSet2']


In [None]:
main_df = pd.read_csv(data_dir + "/Label.csv")

In [124]:
class CustomModel(nn.Module):
    def __init__(self, num_classes):
        super(CustomModel, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=24, stride=2, padding=12)
        self.conv4 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv6 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=24, stride=2, padding=12)
        self.conv7 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv8 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv9 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=24, stride=2, padding=12)
        self.conv10 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv11 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv12 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=24, stride=2, padding=12)
        self.conv13 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv14 = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, padding=1)
        self.conv15 = nn.Conv1d(in_channels=12, out_channels=48, kernel_size=24, stride=2, padding=12)
        
        self.biGRU = nn.GRU(input_size=12, hidden_size=12, batch_first=True, bidirectional=True)
#        self.attention = AttentionWithContext()
        self.batchnorm = nn.BatchNorm1d(2250)
        self.dense = nn.Linear(95, num_classes)
        
    def forward(self, x):
        x = nn.LeakyReLU()(self.conv1(x))
        x = nn.LeakyReLU()(self.conv2(x))
        x = nn.LeakyReLU()(self.conv3(x))
        x = nn.Dropout(p=0.2)(x)
        x = nn.LeakyReLU()(self.conv4(x))
        x = nn.LeakyReLU()(self.conv5(x))
        x = nn.LeakyReLU()(self.conv6(x))
        x = nn.Dropout(p=0.2)(x)
        x = nn.LeakyReLU()(self.conv7(x))
        x = nn.LeakyReLU()(self.conv8(x))
        x = nn.LeakyReLU()(self.conv9(x))
        x = nn.Dropout(p=0.2)(x)
        x = nn.LeakyReLU()(self.conv10(x))
        x = nn.LeakyReLU()(self.conv11(x))
        x = nn.LeakyReLU()(self.conv12(x))
        x = nn.Dropout(p=0.2)(x)
        x = nn.LeakyReLU()(self.conv13(x))
        x = nn.LeakyReLU()(self.conv14(x))
        x = nn.LeakyReLU()(self.conv15(x))
        x = nn.Dropout(p=0.2)(x)
        
#        x, _ = self.biGRU(cnnout)
        x = nn.LeakyReLU()(x)
        x = nn.Dropout(p=0.2)(x)
#        x = self.attention(x)
#        x = self.batchnorm(x)
        x = nn.LeakyReLU()(x)
        x = nn.Dropout(p=0.2)(x)
        main_output = self.dense(x)
        
        return torch.sigmoid(main_output)
    
model = CustomModel(num_classes=9)

In [125]:
x = torch.randn(32, 12, 3000)
x = model(x)

In [76]:
# List vấn đề tồn đọng trong file train và hướng giải quyết kèm lý do

In [79]:
single_main_df = pd.read_csv(data_dir + "/single_label.csv")
single_fns = single_main_df["Recording"].values.tolist()
print(len(single_fns))
single_mat_paths = [data_dir + f"/alldata/{x}.mat" for x in single_fns]

6400


In [85]:
class ECG(Dataset):
    def __init__(self, data_paths, label_df):
        self.data_paths = data_paths
        random.shuffle(self.data_paths)
        self.label_df = label_df

    def __getitem__(self, idx):
        data_path = self.data_paths[idx]        
        data = loadmat(data_path)['ECG'][0][0][2]
        clip_data = data[:, 300:3000]

        filename = data_path.split("/")[-1].split(".")[0]
        label = self.label_df[self.label_df["Recording"] == filename]["First_label"].values.item()

        torch_data = torch.from_numpy(clip_data)

        return torch_data.float(), label-1

    def __len__(self):
        return len(self.data_paths)

In [87]:
data_dict = {
    idx : [] for idx in range(9)
}

for data_path in single_mat_paths:
    filename = data_path.split("/")[-1].split(".")[0]
    _cls = single_main_df[single_main_df["Recording"] == filename]["First_label"].values.item()

    data_dict[_cls-1].append(data_path)

for key in data_dict:
    print(f"{key}->{len(data_dict[key])}")

0->918
1->976
2->686
3->179
4->1533
5->532
6->607
7->784
8->185


In [88]:
train_data_dict = {
    _cls : data_dict[_cls][:int(0.9*len(data_dict[_cls]))] for _cls in data_dict
}

valid_data_dict = {
    _cls : data_dict[_cls][int(0.9*len(data_dict[_cls])):] for _cls in data_dict
}

for key in train_data_dict:
    print(f"{key}->{len(train_data_dict[key])}--{len(valid_data_dict[key])}")

0->826--92
1->878--98
2->617--69
3->161--18
4->1379--154
5->478--54
6->546--61
7->705--79
8->166--19


In [89]:
train_data_paths = []
for key in train_data_dict:
    train_data_paths.extend(train_data_dict[key])
valid_data_paths = []
for key in valid_data_dict:
    valid_data_paths.extend(valid_data_dict[key])
print(len(train_data_paths))
print(len(valid_data_paths))

5756
644


In [91]:
train_ds = ECG(train_data_paths, single_main_df)
valid_ds = ECG(valid_data_paths, single_main_df)

traindl = DataLoader(
    train_ds,
    batch_size=32, 
    shuffle=True, 
    pin_memory=True, 
    num_workers=os.cpu_count()//2
)

validdl = DataLoader(
    valid_ds,
    batch_size=1, 
    shuffle=True, 
    pin_memory=True, 
    num_workers=os.cpu_count()//2
)

In [120]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index = 1)
epoch = 100
lr = 0.0001

model.to(device)
optimizer = Adam(model.parameters(), lr=lr)
scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=epoch)
loss_fn = nn.CrossEntropyLoss()

In [111]:
for e in range(epoch):
    model.train()
    print(f"Epoch: {e}")
    batch_cnt = 0
    total_loss = 0
    correct = 0
    for batch, (train_sig, train_label) in tqdm(enumerate(traindl)):
        batch_cnt = batch
        train_sig = train_sig.to(device)
        train_label = train_label.to(device)
        
        pred = model(train_sig)
        print
        loss = loss_fn(pred, train_label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        scheduler.step()
        
        total_loss += loss.item()
        correct += (pred.argmax(1) == train_label).type(torch.float).sum().item()
    
    total_loss /= batch_cnt
    correct /= len(traindl.dataset)
    
    print(f"train loss: {total_loss} - train acc: {100*correct}")
    
    batch_cnt = 0
    val_total_loss = 0
    val_correct = 0
    model.eval()
    with torch.no_grad():
        for batch, (valid_sig, valid_label) in tqdm(enumerate(validdl)):
            batch_cnt = batch
            valid_sig = valid_sig.to(device)
            valid_label = valid_label.to(device)
            
            pred = model(valid_sig)
            loss = loss_fn(pred, valid_label)
            
            val_total_loss += loss.item()
            val_correct += (pred.argmax(1) == valid_label).type(torch.float).sum().item()
    
        val_total_loss /= batch_cnt
        val_correct /= len(validdl.dataset)
        
        print(f"valid loss: {val_total_loss} - valid acc: {100*val_correct}")

Epoch: 0


0it [00:00, ?it/s]


RuntimeError: Expected target size [32, 9], got [32]