In [None]:
# Inspired by
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
%matplotlib inline

# Training a Temporal Convolutional Network on ISRUC-Sleep Dataset

In this tutorial, we will guide you through the process of training a Temporal Convolutional Network (TCN) for time series classification using the **ISRUC-Sleep** dataset. The steps outlined here focus on preparing the dataset, building the model, and training and evaluating the network.

### Prerequisites
Before starting, ensure that you have the following:
- A P100 GPU (or any other GPU) for model training.
- Python 3.x installed along with PyTorch, torchvision, and pandas libraries.

### Overview
We will go through the following steps:
1. **Load and Normalize the ISRUC Processed Dataset**: Use PyTorch utilities to handle data loading and preprocessing.
2. **Define the Temporal Convolutional Neural Network (TCN)**: Create a custom neural network for time series classification.
3. **Define the Loss Function**: Set up a loss function appropriate for classification.
4. **Train the Network**: Train the TCN on the training dataset.
5. **Evaluate the Network**: Test the trained model on the test dataset.

---

## 1. Loading and Normalizing the ISRUC Processed Dataset

### Dataset Overview

The **ISRUC-Sleep** dataset consists of time series data that has been preprocessed and converted into `.csv` files, which are ideal for time series classification tasks. You can find the dataset on the following platforms:

- [Original ISRUC-Sleep Dataset](https://sleeptight.isr.uc.pt/)
- [Kaggle Datacard](https://www.kaggle.com/datasets/rishitjakharia/isruc-sg1)
- [Processed Dataset on Kaggle](https://www.kaggle.com/datasets/rishitjakharia/isruc-processed)

For this tutorial, we will use the **processed version** of the dataset.

![Polysomnography for Apnea](https://www.cancertherapyadvisor.com/wp-content/uploads/sites/12/2019/01/ch2606.fig4_.png)

### Loading Data with Pandas

First, load the processed dataset into a Pandas DataFrame. Assume the data is stored in CSV format.


In [None]:
import pandas as pd

# Load one example recording from the `Events/oa` folder of the processed dataset.
# Only this single CSV is read here; no separate test file is loaded in this cell.
train_data = pd.read_csv('/kaggle/input/isruc-processed/dataset/Events/oa/S1_p100_1_Stagen1_Event4_Session1.csv')

# Show the first rows to inspect the column names and value ranges
print(train_data.head())

### Converting Data to PyTorch Tensors

To feed the data into the neural network, we need to convert the Pandas DataFrames into PyTorch tensors. We'll use `torch.tensor()` for this conversion.

In [None]:
import torch

# Convert the example DataFrame to a float32 tensor.
# Every column of the CSV is included, so the tensor has one row per CSV row
# and one column per CSV column.
train_tensor = torch.tensor(train_data.values, dtype=torch.float32)

### Normalizing the Data

For neural networks to perform optimally, data normalization is essential. We'll use the mean and standard deviation of the training data to normalize both the training and testing datasets.

In [None]:
# Normalize the data by subtracting the mean and dividing by the standard deviation.
# `.mean()` / `.std()` are called without a `dim`, so each is a single scalar
# computed over every element of the tensor (all rows and all columns together),
# not a per-column statistic.
mean = train_tensor.mean()
std = train_tensor.std()

# NOTE(review): there is no guard for std == 0; a constant tensor would yield
# NaN/inf here. The name `train_tensor` is rebound to the normalized values.
train_tensor = (train_tensor - mean) / std

### Creating a Custom Dataset Class

We will create a custom PyTorch `Dataset` class that reads one signal file per sample, so that a `DataLoader` can serve the data in batches. A simplified version of the Dataset class implemented in the `.ipynb` file is shown below.

In [None]:
import os
import re
import numpy as np
from torch.utils.data import Dataset, DataLoader

class EventClassificationDataset(Dataset):
    """Folder-labelled dataset of CSV signal files.

    Expected layout under ``root_dir``::

        Events/<label>/<file>         -> sample labelled ``<label>``
        Non_Events/<subfolder>/<file> -> sample labelled ``'no_event'``

    Every item is a ``(signal, label)`` pair:

    * ``signal`` is a float32 tensor holding the selected CSV columns with the
      same orientation as the CSV, i.e. ``(rows, columns)`` (presumably
      ``(time steps, channels)``).
    * ``label`` is a ``torch.long`` tensor: a one-hot vector over the class
      list, or the row of ``label_mapping`` selected by the class index.
    """

    # Class order used for the one-hot encoding when `included_classes` is not given.
    _DEFAULT_CLASSES = ['ar', 'awake', 'ca', 'ch', 'l on', 'l out',
                        'lm', 'mchg', 'mh', 'oa', 'oh', 'plm', 'rem',
                        'no_event']

    # Alternative column-name sets; the first one fully present in a file is used.
    _POSSIBLE_COLUMNS = [
        ['X6', 'X7', 'X8', 'SaO2'],
        ['X6', 'X7', 'X8', 'SpO2'],
        ['29', '30', '31', 'SaO2'],
        ['29', '30', '31', 'SpO2']
    ]

    def __init__(self, root_dir, transform=None, label_mapping=None, included_classes=None):
        """
        Args:
            root_dir (str): Root directory containing all folders and signal files.
            transform (callable or iterable of callables, optional): Transform(s)
                applied in order to the signal. Transforms receive the signal
                transposed, i.e. as ``(columns, rows)``.
            label_mapping (sequence, optional): Indexed by the position of the
                sample's class in the class list; the selected entry replaces
                the one-hot label (e.g. to merge several classes into one).
            included_classes (list, optional): List of classes to include. If None, include all classes.
                Its order also defines the class indices.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.included_classes = included_classes
        self.label_mapping = label_mapping
        self.data_info = self._prepare_file_list()

    def _prepare_file_list(self):
        """Scan the dataset directory and return a list of ``(file_path, label)`` tuples.

        Directory listings are sorted so that sample indices are identical on
        every machine (``os.listdir`` returns entries in arbitrary order).
        """
        data_info = []

        event_path = os.path.join(self.root_dir, 'Events')
        if os.path.isdir(event_path):
            for label_folder in sorted(os.listdir(event_path)):
                if self.included_classes and label_folder not in self.included_classes:
                    continue

                label_path = os.path.join(event_path, label_folder)
                if os.path.isdir(label_path):
                    data_info.extend(
                        (os.path.join(label_path, file_name), label_folder)
                        for file_name in sorted(os.listdir(label_path))
                    )

        # Non-event files all share the label "no_event"; skip the scan
        # entirely when that class has been filtered out.
        wants_non_events = not self.included_classes or 'no_event' in self.included_classes
        non_event_path = os.path.join(self.root_dir, 'Non_Events')
        if wants_non_events and os.path.isdir(non_event_path):
            for subfolder_name in sorted(os.listdir(non_event_path)):
                subfolder_path = os.path.join(non_event_path, subfolder_name)
                if os.path.isdir(subfolder_path):
                    data_info.extend(
                        (os.path.join(subfolder_path, file_name), 'no_event')
                        for file_name in sorted(os.listdir(subfolder_path))
                    )

        return data_info

    def _transform_steps(self):
        """Return the transforms to apply, as an iterable of callables."""
        if not self.transform:
            return ()
        # A bare callable (function, transform object) is wrapped so it can be
        # used directly; anything iterable is applied element by element.
        if callable(self.transform) and not hasattr(self.transform, '__iter__'):
            return (self.transform,)
        return self.transform

    def __len__(self):
        return len(self.data_info)

    def __getitem__(self, idx):
        """Load sample ``idx`` from disk and return ``(signal, label)``.

        Raises:
            KeyError: If the sample's label is not part of the class list.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_path, label_name = self.data_info[idx]

        # Position of the label in the class list defines the one-hot index.
        class_names = self.included_classes if self.included_classes else self._DEFAULT_CLASSES
        class_index = {name: position for position, name in enumerate(class_names)}[label_name]

        if self.label_mapping:
            target = self.label_mapping[class_index]
        else:
            target = [1 if position == class_index else 0 for position in range(len(class_names))]
        label = torch.tensor(target, dtype=torch.long)

        signal = pd.read_csv(file_path)

        # Keep the first known column set that is fully present. If none
        # matches, all columns of the file are kept unchanged.
        for columns in self._POSSIBLE_COLUMNS:
            if all(col in signal.columns for col in columns):
                signal = signal[columns]
                break

        # Transforms operate on the transposed signal, (columns, rows).
        signal = torch.tensor(signal.values, dtype=torch.float32).T
        for transform in self._transform_steps():
            signal = transform(signal)

        # Transpose back so the returned signal is (rows, columns).
        return signal.T, label

The label mapping defined below maps the six included classes (five types of apnea and hypopnea events, plus `no_event`) onto two classes, namely `A/H Event` and `Normal Breathing`.

Furthermore, the index of each class is its position in `included_classes`, and the entry of `label_mapping` at that index is the label used for the class. Hence, if we wished to convert the six-class output into three classes, namely `Apnea`, `Hypopnea` and `Normal Breathing`, we would use code similar to the snippet shown below.

```python
label_mapping = [[1,0,0], [1,0,0], 
                 [0,1,0], [0,1,0], [0,1,0],
                 [0,0,1]
                ]
```

In [None]:
# Location of the processed dataset (contains the `Events` and `Non_Events` folders
# that EventClassificationDataset scans).
root_dir = '/kaggle/input/isruc-processed/dataset'
# Classes to keep; the position of a class in this list is its class index.
included_classes = ['ca', 'oa', 'oh', 'mh', 'ch', 'no_event']

# One row per entry of `included_classes`, in the same order: the five event
# classes map to [1, 0] and `no_event` maps to [0, 1].
label_mapping = [[1,0], [1,0], 
                 [1,0], [1,0], [1,0],
                 [0,1]
                ]

event_classification_dataset = EventClassificationDataset(root_dir=root_dir, label_mapping=label_mapping, included_classes=included_classes)
# Loader over the complete (un-split) dataset.
event_classification_loader = DataLoader(event_classification_dataset, batch_size=64, shuffle=True)

### Creating a Train Test Split

Since we are dealing with medical data, we must ensure that there is no data leakage between the train and test subsets of the dataset. To achieve this, we group the samples using the patient information stored in each file name, so that all samples sharing the same patient key end up in the same subset.
> The file-name structure is specified in the description of the dataset.

In [None]:
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Group sample indices by the first two underscore-separated tokens of the
# file name, so that every file sharing that prefix lands in the same subset.
patient_session_groups = defaultdict(list)

for idx, (file_path, _) in enumerate(event_classification_dataset.data_info):
    filename = os.path.basename(file_path)
    patient_session_key = "_".join(filename.split("_")[:2])  # e.g., "S1_p100"
    patient_session_groups[patient_session_key].append(idx)

# Sort the keys so the seeded split does not depend on the order in which
# files happened to be listed from disk.
group_keys = sorted(patient_session_groups)
train_group_keys, test_group_keys = train_test_split(
    group_keys, test_size=0.2, random_state=14
)

# Get train and test indices from groups
train_idx = [idx for key in train_group_keys for idx in patient_session_groups[key]]
test_idx = [idx for key in test_group_keys for idx in patient_session_groups[key]]

# Create train and test subsets
train_subset = torch.utils.data.Subset(event_classification_dataset, train_idx)
test_subset = torch.utils.data.Subset(event_classification_dataset, test_idx)

In [None]:
# Batch size shared by both loaders
batch_size = 64

# Training batches are reshuffled every epoch
train_loader = DataLoader(
    train_subset,
    batch_size=batch_size,
    shuffle=True,
)
# Evaluation keeps the subset order
test_loader = DataLoader(
    test_subset,
    batch_size=batch_size,
    shuffle=False,
)

n_train_samples = len(train_subset)
n_test_samples = len(test_subset)
print(f"Number of training samples: {n_train_samples}")
print(f"Number of testing samples: {n_test_samples}")

---

## 2. Defining the Temporal Convolutional Network (TCN)

The Temporal Convolutional Network (TCN) is an effective architecture for time-series data, using dilated convolutions to model long-range dependencies. 

Below is a simplified version of the complete Temporal Conv Net used for the classification.

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class TCNBlock(nn.Module):
    """Residual block: dilated 1-D convolution -> ReLU -> batch norm, added to a skip path.

    Args:
        in_channels (int): Number of channels of the input.
        out_channels (int): Number of channels produced by the block.
        kernel_size (int): Kernel size of the dilated convolution.
        dilation_rate (int): Dilation of the convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation_rate):
        super().__init__()

        # 'same' padding keeps the sequence length unchanged, so the skip
        # path can be added element-wise.
        self.conv1 = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation_rate,
            padding='same',
        )
        self.batch_norm = nn.BatchNorm1d(num_features=out_channels)

        # A 1x1 convolution aligns the channel count of the skip path when
        # the block changes it; otherwise the input is reused directly.
        needs_projection = in_channels != out_channels
        self.shortcut = (
            nn.Conv1d(in_channels, out_channels, kernel_size=1)
            if needs_projection
            else None
        )

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(batch, in_channels, length)``."""
        residual = x if self.shortcut is None else self.shortcut(x)

        activated = F.relu(self.conv1(x))
        normalised = self.batch_norm(activated)

        return F.relu(normalised + residual)

class TCNModel(nn.Module):
    """Two stacked TCN blocks followed by global average pooling and a small MLP head.

    Args:
        n_length: Nominal sequence length. It is not used to size any layer;
            global average pooling collapses the time axis whatever its length.
        n_features (int): Number of input channels per time step.
        n_outputs (int): Number of output classes.
    """

    def __init__(self, n_length, n_features, n_outputs):
        super().__init__()

        self.tcn_blocks = nn.Sequential(
            TCNBlock(n_features, 32, kernel_size=3, dilation_rate=1),
            TCNBlock(32, 64, kernel_size=3, dilation_rate=2)
        )

        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, n_outputs)

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return raw class scores (logits) of shape ``(batch_size, n_outputs)``.

        Args:
            x: Tensor of shape ``(batch_size, n_length, n_features)``.

        The scores are deliberately NOT passed through softmax:
        ``nn.CrossEntropyLoss`` applies log-softmax internally, so feeding it
        probabilities would apply softmax twice and flatten the gradients.
        Use ``F.softmax(logits, dim=1)`` on the result when probabilities are
        needed; ``argmax`` predictions are identical either way.
        """
        x = x.permute(0, 2, 1)  # Convert to (batch_size, n_features, n_length)
        x = self.tcn_blocks(x)

        x = self.global_avg_pool(x).squeeze(-1)  # Global average pooling over time

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

---

## 3. Defining the Loss Function

For this demonstration, we will use **cross-entropy loss**, which is common for multi-class classification problems. However, we have used a custom loss function in the final code.

In [None]:
# Define the loss function (cross entropy for classification).
# nn.CrossEntropyLoss applies log-softmax to its input itself, so it is meant
# to receive raw, unnormalized class scores (logits) from the model.
criterion = nn.CrossEntropyLoss()

---

## 4. Training the Network

We'll now proceed to train the network. During training, we will use an optimizer (e.g., Adam) to update the model parameters.

In [None]:
import torch.optim as optim

# Initialize the model and optimizer (the loss function is defined separately)
# Samples per window: seconds per epoch * sampling frequency = 375.
# Cast to int because 30 * 12.5 evaluates to the float 375.0 and this is a length.
n_timesteps = int(30 * 12.5)
n_features = 4 # No. of features used
n_outputs = 2 # No. of output classes

# Create the model
model = TCNModel(n_length=n_timesteps, n_features=n_features, n_outputs=n_outputs)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Training loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()

    # Accumulate a sample-weighted loss so the reported value describes the
    # whole epoch rather than only its final batch.
    running_loss = 0.0
    num_samples = 0

    for batch_data, batch_labels in train_loader:
        # Floating-point targets with the same shape as the model output are
        # interpreted by nn.CrossEntropyLoss as per-class probabilities.
        batch_labels = batch_labels.float()

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # The criterion returns the batch mean, so weight it by batch size.
        batch_count = batch_labels.size(0)
        running_loss += loss.item() * batch_count
        num_samples += batch_count

    # max(..., 1) keeps this well-defined when the loader yields no batches.
    epoch_loss = running_loss / max(num_samples, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

---

## 5. Evaluating the Network

After training, it's crucial to evaluate the model on unseen data. We will check the model's performance on the test set.

In [None]:
# Evaluate the model on the test set
model.eval()  # Set model to evaluation mode (disables dropout, uses running batch-norm stats)
correct = 0
total = 0

with torch.no_grad():
    for batch_data, batch_labels in test_loader:
        # Labels arrive as one row per sample (e.g. [1, 0] / [0, 1]); reduce
        # them to class indices so they can be compared with the predicted
        # indices. Comparing a (batch,) tensor with a (batch, n_classes)
        # tensor directly would fail to broadcast or miscount.
        if batch_labels.dim() > 1:
            true_classes = batch_labels.argmax(dim=1)
        else:
            true_classes = batch_labels

        # Forward pass; the predicted class is the index of the largest score
        outputs = model(batch_data)
        predicted = outputs.argmax(dim=1)

        # Calculate accuracy
        total += true_classes.size(0)
        correct += (predicted == true_classes).sum().item()

# Guard against an empty test set
accuracy = 100 * correct / total if total else 0.0
print(f"Test Accuracy: {accuracy:.2f}%")

---

## Conclusion

In this tutorial, we demonstrated how to:
- Load and preprocess the ISRUC-Sleep dataset.
- Define a Temporal Convolutional Network (TCN) for time series classification.
- Train the network and evaluate its performance.

By following these steps, you can use PyTorch to build and train time-series classifiers on similar datasets.