In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import random
from glob import glob
from tqdm import tqdm
from scipy.io import loadmat

import torch
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import os, sys
from typing import *
import torch
import random
import copy

In [2]:
def detach(
    batch_dict: Dict[int, List[torch.Tensor]] = None,
    k_shot: int = None,
    k_query: int = None
    ) -> Tuple[Dict[int, List[torch.Tensor]], Dict[int, List[torch.Tensor]]]:
    """Split every class entry of ``batch_dict`` into a support and a query part.

    For each key, the first ``k_shot`` samples go to the support dict and all
    remaining samples go to the query dict.

    Args:
        batch_dict: mapping ``class id -> sequence of samples``. Anything that
            supports ``len()`` and slicing works as the per-class value.
        k_shot: number of support samples per class.
        k_query: number of query samples per class.

    Returns:
        ``(support_dct, query_dct)``, both keyed like ``batch_dict``.

    Raises:
        ValueError: if ``batch_dict`` is empty/None, if ``k_shot``/``k_query``
            are missing or negative, or if ``k_shot + k_query`` exceeds the
            number of samples of the first class in ``batch_dict``.

    Warns:
        UserWarning: if ``k_shot + k_query`` is smaller than the number of
            available samples; the surplus samples end up in the query part.
    """
    # Function-scope import: `warnings` is not among the notebook's imports.
    import warnings

    if not batch_dict:
        raise ValueError("batch_dict must be a non-empty dict of class id -> samples.")
    if k_shot is None or k_query is None:
        raise ValueError("k_shot and k_query must both be given.")
    if k_shot < 0 or k_query < 0:
        raise ValueError(f"k_shot and k_query must be non-negative, got {k_shot} and {k_query}.")

    # As before, the first class is taken as the reference for the batch size.
    sample_len = len(next(iter(batch_dict.values())))
    requested = k_shot + k_query

    if requested > sample_len:
        raise ValueError(
            f"Too many samples requested: k_shot + k_query = {requested}, but only "
            f"{sample_len} samples are available per class in this batch."
        )
    if requested < sample_len:
        # The message describes a fallback, not a failure, so this is a warning.
        warnings.warn(
            f"k_shot + k_query = {requested} is less than the {sample_len} samples "
            "available per class in the batch; the surplus samples are placed in "
            "the query set.",
            UserWarning,
            stacklevel=2,
        )

    support_dct = {_cls: samples[:k_shot] for _cls, samples in batch_dict.items()}
    query_dct = {_cls: samples[k_shot:] for _cls, samples in batch_dict.items()}

    return (support_dct, query_dct)


def maml_detach(
    batch_dict: Dict[int, List[torch.Tensor]] = None,
    k_shot: int = None,
    k_query: int = None,
    task: int = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Build a one-vs-rest support/query split for ``task``.

    Samples of every class in ``batch_dict`` are concatenated (in key order);
    samples of class ``task`` are labelled 1.0 and all others 0.0.

    Args:
        batch_dict: mapping ``class id -> sequence of tensors``.
        k_shot: number of support samples per class.
        k_query: number of query samples per class.
        task: the class id treated as the positive class.

    Returns:
        ``(support_x, support_y, query_x, query_y)`` where the ``*_x`` tensors
        are stacked samples and the ``*_y`` tensors are float labels with one
        entry per stacked sample.

    Raises:
        ValueError: if ``task`` is not an ``int`` (or on the conditions listed
            for ``detach``).
        KeyError: if ``task`` is not a key of ``batch_dict``.
    """
    support_dct, query_dct = detach(
        batch_dict=batch_dict,
        k_shot=k_shot,
        k_query=k_query
    )

    if not isinstance(task, int):
        raise ValueError(f"task arg must be integer type but found {type(task)} instead")
    if task not in batch_dict:
        raise KeyError(f"Found no task {task} in batch dict")

    support_x, support_y, query_x, query_y = [], [], [], []
    for _task in batch_dict:
        label = 1 if _task == task else 0
        support_x.extend(support_dct[_task])
        query_x.extend(query_dct[_task])
        # Label counts follow the real slice sizes so x and y always line up,
        # even when a class contributes more or fewer than k_shot/k_query samples.
        support_y.extend([label] * len(support_dct[_task]))
        query_y.extend([label] * len(query_dct[_task]))

    support_x = torch.stack(support_x)
    support_y = torch.tensor(support_y, dtype=torch.float32)
    query_x = torch.stack(query_x)
    query_y = torch.tensor(query_y, dtype=torch.float32)

    return (support_x, support_y, query_x, query_y)


def single_task_detach(
    batch_dict: Dict[int, List[torch.Tensor]] = None,
    k_shot: int = None,
    k_query: int = None,
    task: int = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Return the support/query split of a single class.

    Only the samples stored under ``task`` are used; every label equals
    ``task`` itself.

    Args:
        batch_dict: mapping ``class id -> sequence of tensors``.
        k_shot: number of support samples.
        k_query: number of query samples.
        task: the class id to extract.

    Returns:
        ``(support_x, support_y, query_x, query_y)`` with stacked samples and
        int64 label tensors.

    Raises:
        ValueError: if ``task`` is not an ``int`` (or on the conditions listed
            for ``detach``).
        KeyError: if ``task`` is not a key of ``batch_dict``.
    """
    support_dct, query_dct = detach(
        batch_dict=batch_dict,
        k_shot=k_shot,
        k_query=k_query
    )

    if not isinstance(task, int):
        raise ValueError(f"task arg must be integer type but found {type(task)} instead")
    if task not in batch_dict:
        raise KeyError(f"Found no task {task} in batch dict")

    support_samples = list(support_dct[task])
    query_samples = list(query_dct[task])

    support_x = torch.stack(support_samples)
    support_y = torch.tensor([task] * len(support_samples), dtype=torch.long)
    query_x = torch.stack(query_samples)
    query_y = torch.tensor([task] * len(query_samples), dtype=torch.long)

    return (support_x, support_y, query_x, query_y)


In [3]:
# Print the starting directory, then move the working directory three levels
# up and anchor the "Data set" folder there.
# NOTE: this changes the process-wide cwd, so every re-run of this cell climbs
# three more levels.
print(os.getcwd())

for i in range(3):
    os.chdir(os.pardir)

main_data_dir = os.getcwd() + "/Data set"

/home/thaobeo/git/HeartResearch/Experiment/Approach/Model_signal


In [4]:
# Root folder of the recordings and label files (machine-specific absolute path).
data_dir = "/media/mountHDD2/khoibaocon"
print(os.listdir(data_dir))

['TrainingSet3', 'Label.csv', 'alldata', 'TrainingSet1', 'single_label.csv', 'TrainingSet2']


In [5]:
# Load the label table for all recordings and show its (rows, columns) shape.
main_df = pd.read_csv(data_dir + "/Label.csv")
main_df.shape

(6877, 4)

In [6]:
# Preview the first rows of the label table.
main_df.head()

Unnamed: 0,Recording,First_label,Second_label,Third_label
0,A0001,5,,
1,A0002,1,,
2,A0003,2,,
3,A0004,2,,
4,A0005,7,,


In [7]:
# Number of recordings per value of First_label.
main_df["First_label"].value_counts()

First_label
5    1695
2    1098
1     918
8     826
3     704
7     653
6     574
4     207
9     202
Name: count, dtype: int64

In [8]:
# Number of recordings per value of Second_label (rows without one are not counted).
main_df["Second_label"].value_counts()

Second_label
5.0    162
2.0    123
7.0     47
6.0     42
8.0     41
4.0     28
3.0     18
9.0     16
Name: count, dtype: int64

In [9]:
# Number of recordings per value of Third_label (rows without one are not counted).
main_df["Third_label"].value_counts()

Third_label
9.0    2
8.0    2
6.0    1
4.0    1
Name: count, dtype: int64

In [10]:
# Keep only the recordings with no Second_label, i.e. the single-label rows.
single_main_df = main_df[main_df["Second_label"].isnull()]
single_main_df.shape

(6400, 4)

In [11]:
# Save the single-label subset under main_data_dir (set by the cwd cell above).
# `index` is left at its default, so the DataFrame index is written as a column too.
single_main_df.to_csv(main_data_dir + "/single_label.csv")

In [12]:
# List every entry in the alldata folder and count them.
mat_files = glob(data_dir + "/alldata/*")
print(len(mat_files))

6877


In [13]:
# Recording names of the single-label rows, as a plain Python list.
single_fns = single_main_df["Recording"].values.tolist()
print(len(single_fns))

6400


In [14]:
# Build the .mat path of each single-label recording and check that the first one exists.
single_mat_paths = [data_dir + f"/alldata/{x}.mat" for x in single_fns]
print(os.path.exists(single_mat_paths[0]))

True


In [15]:
# Load one recording to inspect which keys the .mat file contains.
sample_data = loadmat(single_mat_paths[0])
sample_data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'ECG'])

In [16]:
# Element [0][0][2] of the 'ECG' entry is the 2-D signal array; the same access
# path is used everywhere else in this notebook.
sample_signal_data = sample_data['ECG'][0][0][2]
sample_signal_data.shape

(12, 7500)

In [17]:
# Number of single-label recording paths.
print(len(single_mat_paths))

6400


In [18]:
# plt.plot(sample_signal_data[0])

In [19]:
# Sanity check: a kernel-3 / stride-1 / padding-1 Conv1d leaves the
# (batch, channels, length) shape of a random 12-channel input unchanged.
sample_sig = torch.randn(1, 12, 32)
conv_test = nn.Conv1d(in_channels=12, out_channels=12, kernel_size=3, stride=1, padding=1)
print(conv_test(sample_sig).shape)

torch.Size([1, 12, 32])


In [20]:
# Length (second axis of the signal array) of every single-label recording,
# followed by the max / min / mean over all of them.
# Every .mat file is read once here, so this cell is slow on the full dataset.
len_lst = [
    signal.shape[1]
    for signal in (loadmat(x)['ECG'][0][0][2] for x in single_mat_paths)
]
print(f"MAX: {max(len_lst)}")
print(f"MIN: {min(len_lst)}")
print(f"AVG: {sum(len_lst)/len(len_lst)}")

MAX: 72000
MIN: 3000
AVG: 7946.03703125


In [21]:
class BasicBlock(nn.Module):
    def __init__(self, channel_num):
        super(BasicBlock, self).__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(channel_num, channel_num, 3, padding=1),
            nn.BatchNorm1d(channel_num),
            nn.ReLU())
        
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(channel_num, channel_num, 3, padding=1),
            nn.BatchNorm1d(channel_num),
        )
        self.relu = nn.ReLU()
        torch.nn.init.kaiming_normal_(self.conv_block1[0].weight)
        torch.nn.init.kaiming_normal_(self.conv_block2[0].weight)
        
    def forward(self, x):
        residual = x
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = x + residual
        out = self.relu(x)
        return out

In [22]:
# Smoke test: a 2-channel BasicBlock must return a tensor shaped like its input.
test_basic_block = BasicBlock(2)
sample_sig = torch.randn(1, 2, 32)
print(test_basic_block(sample_sig).shape)

torch.Size([1, 2, 32])


In [23]:
class ResNet(nn.Module):
    """1-D ResNet-style classifier assembled from ``BasicBlock`` stages.

    Layout: stem Conv1d (kernel 7, stride 2) -> MaxPool1d -> for each stage a
    run of ``BasicBlock``s followed (except after the last stage) by an
    unpadded kernel-3 Conv1d + BatchNorm1d that widens the channels ->
    adaptive average pool to length 1 -> linear layer.

    Args:
        in_channels: number of input channels.
        type: key into ``struc_dict`` selecting the stage layout; only 18 is defined.
        num_classes: size of the output logits vector.
    """

    def __init__(self, in_channels = 12, type = 18, num_classes = 9):
        super().__init__()
        self.struc_dict = {
            18: {
                "num_channels" : [64, 128, 256, 512],
                "counts" : [2, 2, 2, 2]
            }
        }

        # Stem.
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=64, kernel_size=7, stride=2)
        torch.nn.init.kaiming_normal_(self.conv1.weight)
        self.max1 = nn.MaxPool1d(kernel_size=3, stride=2)

        # Residual stages; submodule names are part of the state_dict keys.
        self.main = nn.Sequential()
        widths = self.struc_dict[type]["num_channels"]
        depths = self.struc_dict[type]["counts"]
        final_stage = len(widths) - 1
        for stage, (width, depth) in enumerate(zip(widths, depths)):
            for rep in range(depth):
                self.main.add_module(f"conv{stage+1}_{rep}", BasicBlock(width))
            if stage < final_stage:
                # Transition to the next stage's channel count.
                next_width = widths[stage + 1]
                self.main.add_module(f"ext_{stage}", nn.Conv1d(width, next_width, 3, 1))
                self.main.add_module(f"extbn_{stage}", nn.BatchNorm1d(next_width))

        # Head.
        self.avg = torch.nn.AdaptiveAvgPool1d(1)
        self.lin = nn.Linear(widths[-1], num_classes)
        torch.nn.init.kaiming_normal_(self.lin.weight)

    def forward(self, x):
        """Return the class logits, one row per batch element."""
        features = self.main(self.max1(self.conv1(x)))
        pooled = self.avg(features)
        flat = pooled.reshape(pooled.shape[0], -1)
        return self.lin(flat)

In [24]:
# Instantiate the network with its defaults and check the logits shape on a
# random (1, 12, 3000) input. This `model` object is the one moved to the
# device and trained further down.
model = ResNet()
sample_sig = torch.randn(1, 12, 3000)
model(sample_sig).shape

torch.Size([1, 9])

In [25]:
# class ECG(Dataset):
#     def __init__(self, data_paths, label_df):
#         self.data_paths = data_paths
#         random.shuffle(self.data_paths)
#         self.label_df = label_df

#     def __getitem__(self, idx):
#         data_path = self.data_paths[idx]        
#         data = loadmat(data_path)['ECG'][0][0][2]
#         clip_data = data[:, 300:3000]

#         filename = data_path.split("/")[-1].split(".")[0]
#         label = self.label_df[self.label_df["Recording"] == filename]["First_label"].values.item()

#         torch_data = torch.from_numpy(clip_data)

#         return torch_data.float(), label-1

#     def __len__(self):
#         return len(self.data_paths)

In [26]:
# Positional train / validation / test split of the single-label paths.
# The list is cut in its current order; nothing is shuffled here.
ratio = [0.8, 0.1, 0.1]

train_index = int(len(single_mat_paths)*ratio[0])
valid_index = int(len(single_mat_paths)*(ratio[0]+ratio[1]))

train_image_paths, valid_image_paths, test_image_paths = (
    single_mat_paths[:train_index],
    single_mat_paths[train_index:valid_index],
    single_mat_paths[valid_index:],
)

In [27]:
# Scratch experiment: one empty list per class index 0..8, then a dummy value in class 1.
sig_dict = {i : [] for i in range(9)} 
print(sig_dict)
sig_dict[1].append(1)

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: []}


In [28]:
# Scratch experiment: size of the largest class list in the toy sig_dict.
max_sample = max([len(sig_dict[i]) for i in range(9)])
print(max_sample)

1


In [29]:
# Scratch experiment: grow class 1 by one randomly drawn existing element.
# Not idempotent: each re-run appends another element.
sig_dict[1] = sig_dict[1] + random.sample(sig_dict[1], k = 1)

In [30]:
# Show the toy dict after the oversampling step.
print(sig_dict)

{0: [], 1: [1, 1], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: []}


In [31]:
ks = 32
kq = 32

def set_dataset(mat_path, k_shot=None, k_query=None, label_df=None, num_classes=9):
    """Load recordings, group them by class and oversample every class to the same size.

    Each file is read with ``loadmat``; columns 300:3000 of the signal array are
    kept and appended (as a NumPy array) to the list of class ``First_label - 1``.
    Afterwards every class list is extended, by whole repetitions plus a random
    draw without replacement, to a common length: the size of the largest class
    rounded up to a multiple of ``k_shot + k_query``.

    Args:
        mat_path: iterable of ``.mat`` file paths using ``/`` separators.
        k_shot: support samples per episode; defaults to the module-level ``ks``.
        k_query: query samples per episode; defaults to the module-level ``kq``.
        label_df: DataFrame with ``Recording`` and ``First_label`` columns;
            defaults to the module-level ``single_main_df``.
        num_classes: number of classes; labels are expected in ``1..num_classes``.

    Returns:
        dict ``class index (0-based) -> list of arrays``, all lists equally long.

    Raises:
        ValueError: if ``k_shot + k_query`` is not positive or a class has no
            recording (an empty class cannot be oversampled).
        KeyError: if a file name has no row in ``label_df``.
    """
    k_shot = ks if k_shot is None else k_shot
    k_query = kq if k_query is None else k_query
    label_df = single_main_df if label_df is None else label_df

    episode_size = k_shot + k_query
    if episode_size <= 0:
        raise ValueError(f"k_shot + k_query must be positive, got {episode_size}.")

    # One pass over the label table instead of a boolean-mask scan per file.
    label_of = dict(zip(label_df["Recording"], label_df["First_label"]))

    sig_dict = {i: [] for i in range(num_classes)}
    for data_path in mat_path:
        filename = data_path.split("/")[-1].split(".")[0]
        if filename not in label_of:
            raise KeyError(f"No label found for recording '{filename}'")
        _cls = int(label_of[filename])
        data = loadmat(data_path)['ECG'][0][0][2]
        # Labels are 1-based in the table; class indices here are 0-based.
        sig_dict[_cls - 1].append(data[:, 300:3000])

    empty_classes = [_cls for _cls, samples in sig_dict.items() if not samples]
    if empty_classes:
        raise ValueError(
            f"No recordings found for class index(es) {empty_classes}; "
            "an empty class cannot be oversampled."
        )

    max_sample = max(len(samples) for samples in sig_dict.values())
    # Round up to a multiple of the episode size. An exact multiple is kept
    # as-is rather than padded with a whole extra episode.
    sample_cls_cnt = -(-max_sample // episode_size) * episode_size

    for _cls, samples in sig_dict.items():
        # Whole repetitions first, then top up with distinct random samples.
        repeats, remainder = divmod(sample_cls_cnt, len(samples))
        sig_dict[_cls] = samples * repeats + random.sample(samples, k=remainder)

    return sig_dict

In [32]:
class HeartData(Dataset):
    """Map-style dataset yielding ``(signal, label)`` for one recording per index.

    Args:
        data_path: sequence of ``.mat`` file paths using ``/`` separators.
        label_df: DataFrame with ``Recording`` and ``First_label`` columns;
            defaults to the module-level ``single_main_df``.
    """

    def __init__(self, data_path, label_df=None):
        self.data_path = data_path
        label_df = single_main_df if label_df is None else label_df
        # Resolve labels once instead of scanning the DataFrame per item.
        self._label_of = dict(zip(label_df["Recording"], label_df["First_label"]))

    def __len__(self):
        """Number of recordings."""
        return len(self.data_path)

    def __getitem__(self, index):
        """Return the clipped signal array and the 0-based class label of item ``index``."""
        path = self.data_path[index]
        filename = path.split("/")[-1].split(".")[0]
        # Shift the 1-based table label to 0-based, the same convention
        # set_dataset uses for the training classes.
        label = int(self._label_of[filename]) - 1
        data = loadmat(path)['ECG'][0][0][2]
        signal = data[:, 300:3000]

        return signal, label

In [33]:
# Build the class-balanced training dict; this reads every training .mat file.
train_dataset = set_dataset(train_image_paths)
# valid_dataset = HearData(valid_image_paths)

In [34]:
# Validation set over the validation slice of the paths (files are read per item).
valid_dataset = HeartData(valid_image_paths)

In [35]:
class ECG(Dataset):
    """Dataset whose item ``idx`` holds the ``idx``-th sample of every class.

    Args:
        dict_ds: mapping ``class id -> list of samples``.
    """

    def __init__(self, dict_ds):
        self.dict_ds = dict_ds

    def __getitem__(self, idx):
        """Return ``{class id: dict_ds[class id][idx]}`` for every class in the dict."""
        return {_cls: samples[idx] for _cls, samples in self.dict_ds.items()}

    def __len__(self):
        """Length of the shortest class list, so every index is valid for all classes."""
        return min((len(samples) for samples in self.dict_ds.values()), default=0)

In [36]:
# check_ds = ECG(data_paths=single_mat_paths, label_df=single_main_df)
# sample, lbl = check_ds[0]
# print(sample.shape, lbl)

In [37]:
# model(sample.unsqueeze(dim=0)).shape

In [38]:
# data_dict = {
#     idx : [] for idx in range(9)
# }

# for data_path in single_mat_paths:
#     filename = data_path.split("/")[-1].split(".")[0]
#     _cls = single_main_df[single_main_df["Recording"] == filename]["First_label"].values.item()

#     data_dict[_cls-1].append(data_path)

# for key in data_dict:
#     print(f"{key}->{len(data_dict[key])}")

In [39]:
# train_data_dict = {
#     _cls : data_dict[_cls][:int(0.9*len(data_dict[_cls]))] for _cls in data_dict
# }

# valid_data_dict = {
#     _cls : data_dict[_cls][int(0.9*len(data_dict[_cls])):] for _cls in data_dict
# }

# for key in train_data_dict:
#     print(f"{key}->{len(train_data_dict[key])}--{len(valid_data_dict[key])}")

In [40]:
# train_data_paths = []
# for key in train_data_dict:
#     train_data_paths.extend(train_data_dict[key])
# valid_data_paths = []
# for key in valid_data_dict:
#     valid_data_paths.extend(valid_data_dict[key])
# print(len(train_data_paths))
# print(len(valid_data_paths))

In [41]:
# sig_dict = {
#     idx : [] for idx in range(9)
# }    

# for data_path in single_mat_paths:
#     filename = data_path.split("/")[-1].split(".")[0]
#     _cls = single_main_df[single_main_df["Recording"] == filename]["First_label"].values.item()
#     data = loadmat(data_path)['ECG'][0][0][2]
#     sig_dict[_cls-1].append(data)

In [42]:
# print(sig_dict[0][0].shape)

In [43]:
# Wrap the balanced class dict so that one item holds one sample of every class.
train_ds = ECG(train_dataset)
# valid_ds = ECG(valid_dataset)

print(len(train_ds))
# print(len(valid_ds))

1280


In [45]:
# Device selection: keep using GPU 1 when a second GPU exists, otherwise fall
# back to GPU 0, and to the CPU (without a device index) when CUDA is missing.
if torch.cuda.is_available():
    device = torch.device("cuda", 1 if torch.cuda.device_count() > 1 else 0)
else:
    device = torch.device("cpu")

# One training batch holds 64 samples per class, which the training loop
# splits into 32 support + 32 query samples.
batch_size = 64

# os.cpu_count() may return None; fall back to a single worker in that case.
num_workers = (os.cpu_count() or 2) // 2
# Pinned memory only helps host-to-GPU copies.
pin_memory = torch.cuda.is_available()

traindl = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=pin_memory,
    num_workers=num_workers
)

# Validation metrics do not depend on sample order, so no shuffling is needed.
validdl = DataLoader(
    valid_dataset,
    batch_size=1,
    shuffle=False,
    pin_memory=pin_memory,
    num_workers=num_workers
)

print(len(traindl))
print(len(validdl))

20
640


In [46]:
# Hyper-parameters: outer epochs, learning rate, and the number of inner
# adaptation steps run per task.
epoch = 100
lr = 1e-4
inner_epoch = 100

# Module.to() moves the parameters in place and returns the same module.
model = model.to(device)
optimizer = Adam(params=model.parameters(), lr=lr)
scheduler = CosineAnnealingLR(optimizer, T_max=epoch)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Meta-training loop.
# For every batch and every class ("task") in it:
#   1. clone the global model,
#   2. adapt the clone on the task's support samples for `inner_epoch` steps,
#   3. compute the query loss of the adapted clone and add its gradients to
#      the global model's gradients.
# The global optimizer then takes one step per batch using the summed gradients.
# After each epoch the global model is evaluated on the validation loader.
for e in range(epoch):
    model.train()
    print(f"Epoch: {e}")
    metaloss = 0.0   # sum of query losses over every task of the epoch
    num_task = 0     # number of (batch, task) pairs processed in the epoch

    for train_idx, train_data_dict in enumerate(traindl):
        optimizer.zero_grad()

        for task in train_data_dict:
            task_model = copy.deepcopy(model)
            task_optimizer = Adam(task_model.parameters(), lr=lr, weight_decay=1e-4)
            sp_x, sp_y, qr_x, qr_y = single_task_detach(batch_dict=train_data_dict,
                                                        k_shot=ks,
                                                        k_query=kq,
                                                        task=task)

            # Move the task data once; it does not change inside the inner loop.
            sp_x, sp_y = sp_x.to(device, dtype = torch.float), sp_y.to(device)
            qr_x, qr_y = qr_x.to(device, dtype = torch.float), qr_y.to(device)

            for in_e in range(inner_epoch):
                task_optimizer.zero_grad()
                sp_loss = loss_fn(task_model(sp_x), sp_y)
                sp_loss.backward()
                task_optimizer.step()

            # Clear the gradients left by the last support step so that only
            # the query loss contributes to the meta-gradient.
            task_optimizer.zero_grad()
            qr_loss = loss_fn(task_model(qr_x), qr_y)
            qr_loss.backward()
            metaloss += qr_loss.item()
            num_task += 1

            for w_global, w_local in zip(model.parameters(), task_model.parameters()):
                if w_local.grad is None:
                    continue
                if w_global.grad is None:
                    # Clone so the global gradient does not alias the task model's tensor.
                    w_global.grad = w_local.grad.detach().clone()
                else:
                    w_global.grad += w_local.grad

        optimizer.step()

    # The cosine schedule was created with T_max=epoch, i.e. one step per epoch.
    scheduler.step()

    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    batch_count = 0
    with torch.no_grad():
        for test_sigs, test_labels in validdl:
            test_sigs = test_sigs.to(device, dtype = torch.float)
            test_labels = test_labels.to(device)
            test_logits = model(test_sigs)

            test_loss += loss_fn(test_logits, test_labels).item()
            _, predicted = test_logits.max(1)
            total += test_labels.size(0)
            correct += predicted.eq(test_labels).sum().item()
            batch_count += 1

    # max(..., 1) guards the averages against empty loaders.
    avg_meta_loss = metaloss / max(num_task, 1)
    avg_test_loss = test_loss / max(batch_count, 1)
    test_acc = 100 * correct / max(total, 1)
    print(f"Epoch: {e} - MetaLoss: {avg_meta_loss} - Test Loss: {avg_test_loss} - Test Acc: {test_acc}%")
    

Epoch: 0


In [None]:
#         batch_cnt = batch
#         train_sig = train_sig.to(device)
#         train_label = train_label.to(device)
        
#         pred = model(train_sig)
#         loss = loss_fn(pred, train_label)
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#         scheduler.step()
        
#         total_loss += loss.item()
#         correct += (pred.argmax(1) == train_label).type(torch.float).sum().item()
    
#     total_loss /= batch_cnt
#     correct /= len(traindl.dataset)
    
#     print(f"train loss: {total_loss} - train acc: {100*correct}")
    
#     batch_cnt = 0
#     val_total_loss = 0
#     val_correct = 0
#     model.eval()
#     with torch.no_grad():
#         for batch, (valid_sig, valid_label) in tqdm(enumerate(validdl)):
#             batch_cnt = batch
#             valid_sig = valid_sig.to(device)
#             valid_label = valid_label.to(device)
            
#             pred = model(valid_sig)
#             loss = loss_fn(pred, valid_label)
            
#             val_total_loss += loss.item()
#             val_correct += (pred.argmax(1) == valid_label).type(torch.float).sum().item()
    
#         val_total_loss /= batch_cnt
#         val_correct /= len(validdl.dataset)
        
#         print(f"valid loss: {val_total_loss} - valid acc: {100*val_correct}")