### Load package

In [1]:
import glob
import json
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from joblib import Parallel, delayed
from sklearn.metrics import r2_score, mean_squared_error
from torch.cuda import device
from torch.onnx.symbolic_opset11 import unsqueeze
from torch.utils.data import DataLoader, random_split

In [2]:
N_DATA = [4,6]
OFFSET = None
DEBUG = False
MODEL_TYPES = [1]  # , 2, 3, 4, 11, 12, 13, 14]#, 15, 16]
NUM_EPOCHS = 10
BATCH_SIZE = 32
LEARNING_RATE = 0.001
PLOT = False
SAVE_CAM = True  # Save CAM images
CAM_DIR = 'cam_images'  # Directory to save CAM images
os.makedirs(CAM_DIR, exist_ok=True)
dtype = 1

##### analysis.py load_and_preprocess_data

In [3]:
from data_source_utils import load_dataset, process_dataset
from preprocess_utils import preprocess_dataset
from analysis import load_json_file

In [4]:
data_list_path = "F:\homes\data_list.txt"

chunk_file_name_list = []
dataset_name = []

with open(data_list_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 각 줄을 공백으로 분리
        parts = line.strip().split()
        if len(parts) == 2:  # 앞부분과 뒷부분이 존재하는 경우
            chunk_file_name_list.append(parts[0])  # 앞부분 저장
            dataset_name.append(int(parts[1]))  # 뒷부분 저장

# n_data에서 dataset_name과 중복된 항목 제거
n_data = [n for n in N_DATA if n not in dataset_name]

preprocessed_dataset = []
dataset = load_dataset(offset=OFFSET, n_data=n_data)
if len(dataset) > 0:
    for data in dataset:
        preprocessed_dataset.append(preprocess_dataset(data['data'], dtype, data['fs'], plot=PLOT, debug=DEBUG))

In [5]:
all_files = []
for chunk_file_name in chunk_file_name_list:
    file_pattern = "F:/homes/preprocessed_data/"+ str(dataset_name)+"/preprocessed_data_" + chunk_file_name + "_*.json"
    file_list = glob.glob(file_pattern)
    all_files.extend(file_list)  # 모든 파일을 리스트에 추가

In [6]:
# all_files

In [7]:
# 병렬로 파일 로드
if all_files:
    loaded_data = Parallel(n_jobs=-1)(delayed(load_json_file)(file_name) for file_name in all_files)

    # 병렬 로드된 데이터를 하나의 리스트로 확장
    for data in loaded_data:
        preprocessed_dataset.extend(data)

In [8]:
len(preprocessed_dataset)

66989

In [9]:
def get_min_max_data_length(preprocessed_dataset):
    # 'data' 필드의 길이를 저장할 리스트
    data_lengths = [len(item['data']) for item in preprocessed_dataset if 'data' in item]

    if data_lengths:
        min_length = min(data_lengths)
        max_length = max(data_lengths)
        return min_length, max_length
    else:
        raise ValueError("The dataset does not contain 'data' field or is empty.")

In [10]:
preprocessed_dataset[0]

{'data': [[81, 255, 0],
  [16, 255, 0],
  [38, 255, 0],
  [29, 255, 0],
  [42, 255, 0],
  [36, 255, 0],
  [49, 255, 0],
  [52, 255, 0],
  [76, 255, 0],
  [74, 255, 0],
  [72, 255, 0],
  [87, 255, 0],
  [87, 255, 0],
  [40, 255, 0],
  [25, 255, 0],
  [18, 255, 0],
  [26, 255, 0],
  [17, 255, 0],
  [24, 255, 0],
  [24, 255, 0],
  [25, 255, 0],
  [121, 255, 0],
  [220, 255, 0],
  [200, 255, 0],
  [66, 255, 0],
  [22, 255, 0],
  [19, 255, 0],
  [29, 255, 0],
  [6, 255, 0],
  [33, 255, 0],
  [9, 255, 0],
  [21, 255, 0],
  [32, 255, 0],
  [16, 255, 0],
  [17, 255, 0],
  [20, 255, 0],
  [23, 255, 0],
  [25, 255, 0],
  [28, 255, 0],
  [29, 255, 0],
  [25, 255, 0],
  [39, 255, 0],
  [34, 255, 0],
  [50, 255, 0],
  [51, 255, 0],
  [49, 255, 0],
  [67, 255, 0],
  [97, 255, 0],
  [83, 255, 0],
  [113, 255, 0],
  [131, 255, 0],
  [124, 255, 0],
  [118, 255, 0],
  [93, 255, 0],
  [86, 255, 0],
  [75, 255, 0],
  [64, 255, 0],
  [34, 255, 0],
  [26, 255, 0],
  [34, 255, 0],
  [29, 255, 0],
  [30, 255,

In [11]:
min, max = get_min_max_data_length(preprocessed_dataset)

In [12]:
min,max

(1000, 1000)

In [13]:
def segment_ecg(data, sampling_rate=100, window_duration=3, overlap_duration=0.5):
    """
    ECG 데이터를 주어진 윈도우 크기와 겹침을 고려하여 분할하는 함수.
    
    :param data: ECG 데이터 (1D array 또는 list)
    :param sampling_rate: 샘플링 속도 (Hz)
    :param window_duration: 윈도우 크기 (초) - 최소 3개의 ECG 파형을 포함할 수 있는 크기
    :param overlap_duration: 겹침 크기 (초) - 0.5초씩 겹침
    :return: 분할된 ECG 데이터 세그먼트 리스트
    """
    # 윈도우 크기와 겹침 크기를 샘플 단위로 변환
    window_size = int(window_duration * sampling_rate)  # 3초에 해당하는 샘플 개수
    overlap_size = int(overlap_duration * sampling_rate)  # 0.5초에 해당하는 샘플 개수

    # 분할된 데이터를 저장할 리스트
    segmented_data = []

    # 슬라이딩 윈도우로 데이터 분할
    start = 0
    while start + window_size <= len(data):
        end = start + window_size
        segmented_data.append(data[start:end])
        start += window_size - overlap_size  # 겹침을 고려한 다음 윈도우의 시작 지점

    return segmented_data

In [14]:
for item in preprocessed_dataset:
    ecg_data = item['data']  # ECG 데이터 가져오기
    segmented_ecg = segment_ecg(ecg_data)  # 분할된 데이터
    item['segmented_data'] = segmented_ecg  # 원래 데이터에 'segmented_data' 추가

In [15]:
len(preprocessed_dataset[0]['segmented_data'])

3

In [16]:
preprocessed_dataset[0]['label']

'N'

In [42]:
import torch
from torch.utils.data import Dataset

class ECGDataset(Dataset):
    def __init__(self, preprocessed_dataset):
        """
        ECG 데이터를 PyTorch Dataset으로 변환하는 클래스.
        각 dict의 'data'와 'label' 필드를 쌍으로 반환.
        
        :param preprocessed_dataset: 분할된 ECG 데이터가 포함된 dict 리스트
        """
        self.data_label_pairs = []

        # 각 데이터에 대해 분할된 segment와 label을 쌍으로 저장
        for item in preprocessed_dataset:
            ecg_data = item['data']  # ECG 데이터
            label = item['label']    # 레이블 (예: 정상, 비정상 등)
            segmented_ecg = segment_ecg(ecg_data)  # ECG 데이터를 분할
            segmented_ecg = np.asarray(segmented_ecg)
            # 분할된 segment와 label을 쌍으로 저장
            for segment in segmented_ecg:
                self.data_label_pairs.append((segment[:,0], label))

    def __len__(self):
        """데이터셋의 전체 길이를 반환"""
        return len(self.data_label_pairs)

    def __getitem__(self, idx):
        """
        인덱스에 해당하는 (segment, label) 쌍을 반환. PyTorch Dataloader가 이 메서드를 호출함.
        데이터를 텐서로 변환하여 반환.
        """
        ecg_segment, label = self.data_label_pairs[idx]
        
        ecg_tensor = torch.tensor(ecg_segment, dtype=torch.float32).unsqueeze(0)
        if label == 'N':
            label = 0
        elif label ==  'S':
            label = 1
        elif label == 'V':
            label = 2
        elif label ==  'Q' :
            label = 3
        else:
            label = 0
        
        label_tensor = torch.tensor(label, dtype=torch.long)  # 레이블은 보통 정수형
        return ecg_tensor, label_tensor


In [43]:
ecg_dataset = ECGDataset(preprocessed_dataset)
dataset_size = len(ecg_dataset)
train_size = int(0.8 * dataset_size)  # 80% for training
eval_size = dataset_size - train_size  # 20% for evaluation
train_dataset, eval_dataset = random_split(ecg_dataset, [train_size, eval_size])


# DataLoader로 변환하여 배치 단위로 데이터를 가져오도록 설정
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=64, shuffle=False)


In [57]:
import torch.nn.functional as F
input_length = 342
class ECGCNN(nn.Module):
    def __init__(self, num_classes=4):
        super(ECGCNN, self).__init__()
        # 1D Convolutional layers
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=7, stride=1, padding=3)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2)
        
        # Pooling layers
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        
        # Fully connected layers
        self.fc1 = nn.Linear(64 * (input_length // 8), 128)  # input_length는 각 segment의 길이
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        # 1D Convolution + Activation + Pooling
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        
        # Flatten
        x = x.view(x.size(0), -1)  # Flatten to (batch_size, num_features)
        
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [58]:
def calculate_class_weights(dataloader, num_classes):
    # Initialize a count for each class
    class_counts = [0] * num_classes

    # Iterate over the dataset to count the occurrences of each class
    for _, labels in dataloader:
        for label in labels:
            class_counts[label.item()] += 1

    # Convert class counts to class weights: the inverse of the frequency
    total_samples = sum(class_counts)
    class_weights = [total_samples / count if count > 0 else 0 for count in class_counts]

    # Normalize the weights to sum to 1 (optional but commonly done)
    class_weights = np.array(class_weights)
    class_weights = class_weights / class_weights.sum()

    # Convert to a tensor and return
    return torch.tensor(class_weights, dtype=torch.float32)

In [59]:
num_classes = 4  # Example: 3 classes
class_weights = calculate_class_weights(train_dataloader, num_classes)
# print(calculate_class_weights(train_dataloader, num_classes))
# class_weights = calculate_class_weights(train_dataloader, num_classes).to(device)


In [60]:
class_weights

tensor([0.0061, 0.0035, 0.1812, 0.8092])

In [61]:
LEARNING_RATE = 0.0005
ecgcnn = ECGCNN().to('cuda')
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(ecgcnn.parameters(), lr=LEARNING_RATE)


In [62]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_epochs = 100

for epoch in range(num_epochs):
    # Training phase
    ecgcnn.train()  # Set model to training mode
    for ecg_segments, labels in train_dataloader:
        ecg_segments, labels = ecg_segments.to(device), labels.to(device)
        
        # Forward pass
        outputs = ecgcnn(ecg_segments)
        loss = loss_fn(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation phase
    ecgcnn.eval()  # Set model to evaluation mode
    eval_loss = 0.0
    correct = [0 for _ in range(num_classes)]
    total = [0 for _ in range(num_classes)]

    with torch.no_grad():  # No need to compute gradients during evaluation
        for ecg_segments, labels in eval_dataloader:
            ecg_segments, labels = ecg_segments.to(device), labels.to(device)
            
            # Forward pass
            outputs = ecgcnn(ecg_segments)
            loss = loss_fn(outputs, labels)
            eval_loss += loss.item()
            
            # Assuming classification task
            _, predicted = torch.max(outputs.data, 1)
            for i in range(num_classes):
              class_correct = (predicted == i) & (labels == i)
              correct[i] += class_correct.sum().item()
              total[i] += (labels == i).sum().item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    # Calculate and display accuracy for each class
    for i in range(num_classes):
        if total[i] > 0:
            accuracy = 100 * correct[i] / total[i]
            print(f"Class {i} Accuracy: {accuracy:.2f}%")
        else:
            print(f"Class {i} Accuracy: No samples")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x2368 and 2688x128)