In [1]:
# the folder train eegs has a lot of parquet files. Read each of them and store the results in a dataframe
import pandas as pd
import pyarrow.parquet as pq
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import polars as pl
import dask
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd
from dask.distributed import Client
from tqdm import tqdm

tqdm.pandas()
# Read data
train = pd.read_csv('../data/raw/train.csv')

In [2]:
train.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [3]:
# For each row in train, read the corresponding eeg file and extract the 50*200 samples from each eeg_label offset using groupby
# and apply
import pickle

def get_eegs(x, all_eegs, moving_max):
    eeg = pq.read_table(f"../data/raw/train_eegs/{x.eeg_id.iloc[0]}.parquet").to_pandas()
    all_eegs[x.eeg_id.iloc[0]] = eeg
    moving_max[0] = max(len(eeg), moving_max[0])



all_eegs = dict()
moving_max = np.array([0])
# read the spectrgormas per eeg id and extract all the 50*200 samples starting from each eeg label offset
# If the pickle file exists, load it, otherwise, create it
if os.path.exists('all_eegs.pkl'):
    all_eegs = pickle.load(open('all_eegs.pkl', 'rb'))
else:
    train.groupby('eeg_id').progress_apply(lambda x: get_eegs(x, all_eegs, moving_max))
print(moving_max)



100%|██████████| 17089/17089 [01:03<00:00, 267.76it/s]

[684400]





In [4]:
# Make a pytorch dataset
from scipy.signal import decimate
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import time

class EEGDataset(Dataset):
    def __init__(self, all_eegs, metadata):
        self.all_eegs = all_eegs
        self.metadata = metadata
        self.column_names = None
        self.label_names = None

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):

        # get the eeg id from the idx in the metadata
        eeg_id = self.metadata.iloc[idx]['eeg_id']
        eeg_label_offset_seconds = int(self.metadata.iloc[idx]['eeg_label_offset_seconds'])
        eeg = self.all_eegs[eeg_id]
        eeg = eeg.iloc[eeg_label_offset_seconds*200:eeg_label_offset_seconds*200 + 50 * 200, :]
        self.column_names = eeg.columns
        # set nans in eegto 0 if there are any
        eeg = eeg.fillna(0)
        self.label_names = self.metadata.columns[-6:]
        labels = self.metadata.iloc[idx, -6:]
        labels /= sum(labels)
        eeg_arr = eeg.to_numpy(dtype=np.float32)
        # normlize the eegs
        eeg_arr -= np.mean(eeg_arr, axis=0)
        eeg_arr /= (np.std(eeg_arr, axis=0) + 1e-8)
        labels_arr = labels.to_numpy(dtype=np.float32)
        return torch.from_numpy(eeg_arr), torch.from_numpy(labels_arr)

In [5]:
eeg_dataset = EEGDataset(all_eegs, train)

In [13]:
# make a simple 1d cnn 
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN1DClassifier(nn.Module):
    def __init__(self, num_classes):
        super(CNN1DClassifier, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=20, out_channels=64, kernel_size=5, stride=1, padding=2)
        self.pool = nn.MaxPool1d(kernel_size=4, stride=4)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=5, stride=1, padding=2)
        self.fc1 = nn.Linear(256 * 156, 1024)  # Adjust the input size according to your architecture
        self.fc2 = nn.Linear(1024, num_classes)
        
    def forward(self, x):
        x1 = self.pool(F.relu(self.conv1(x)))
        x2 = self.pool(F.relu(self.conv2(x1)))
        x3 = self.pool(F.relu(self.conv3(x2)))
        x4 = x3.view(-1, 256 * 156)  # Flatten the output for the fully connected layer
        x5 = F.relu(self.fc1(x4))
        x6 = self.fc2(x5)
        x6 = torch.softmax(x6, dim=1)
        return x6

# Example usage
num_classes = 6  # For example, 10 classes for classification
model = CNN1DClassifier(num_classes=num_classes)
model.to('cuda')  # Move the model to the GPU
# Print the model
print(model)
model = nn.DataParallel(model)

CNN1DClassifier(
  (conv1): Conv1d(20, 64, kernel_size=(5,), stride=(1,), padding=(2,))
  (pool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv3): Conv1d(128, 256, kernel_size=(5,), stride=(1,), padding=(2,))
  (fc1): Linear(in_features=39936, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=6, bias=True)
)


In [14]:
from typing import Optional
def kl_divergence_pytorch(solution: torch.Tensor, submission: torch.Tensor, epsilon: float = 10**-15, sample_weights: Optional[torch.Tensor]=None) -> torch.Tensor:
    # Ensure inputs are float tensors
    solution = solution.float()
    submission = submission.float()
    
    # Clip submission predictions to avoid division by zero or log of zero
    submission = torch.clamp(submission, epsilon, 1 - epsilon)
    
    # Calculate KL divergence only where solution is non-zero
    y_nonzero_indices = solution != 0
    kl_div = torch.zeros_like(solution)
    kl_div[y_nonzero_indices] = solution[y_nonzero_indices] * torch.log(solution[y_nonzero_indices] / submission[y_nonzero_indices])
    
    # If sample_weights is provided, compute weighted average; else, compute mean

    return torch.mean(torch.sum(kl_div, dim=1))

from kaggle_metric import score

In [44]:
# Train the model
import torch.optim as optim
from torch.utils.data import DataLoader
import time
import copy

# Hyperparameters
num_epochs = 5
learning_rate = 0.05
batch_size = 512

# Loss and optimizer
criterion = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Data loader
train_loader = DataLoader(dataset=eeg_dataset, batch_size=batch_size, shuffle=True)

# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    losses = []
    kaggle_losses = []
    preds = []
    pbar = tqdm(train_loader, unit="batch", desc=f"Epoch {epoch} Train")
    for batch in pbar:
        X_batch, y_batch = batch
        X_batch = X_batch.to('cuda').float()
        y_batch = y_batch.to('cuda').float()

        # Forward pass
        y_pred = model(X_batch.permute(0,2,1)).squeeze(1)
        preds.extend(torch.clamp(y_pred, min=10**-15, max=1-10**-15).cpu().detach().numpy())
        loss = criterion(torch.log(torch.clamp(y_pred, min=10**-15, max=1-10**-15)), y_batch)
        kaggle_loss = kl_divergence_pytorch(y_batch, y_pred, epsilon=10**-15) 
        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()

        # Print tqdm
        losses.append(loss.item())
        kaggle_losses.append(kaggle_loss.item())
        # Set the precision o the postfix to 4 decimals
        pbar.set_postfix(loss="{:.4f}".format(sum(losses) / len(losses)), kaggle_loss="{:.4f}".format(sum(kaggle_losses) / len(kaggle_losses)))
    
    preds_df = pd.DataFrame(preds, columns=eeg_dataset.label_names)
    # Calculate the score
    solution = train.iloc[:, -6:].reset_index(drop=True)
    solution_numpy = np.copy(solution.to_numpy(dtype=np.float32))
    solution_numpy_norm = np.copy(solution_numpy) / np.sum(solution_numpy, axis=1)[:, np.newaxis]
    print(kl_divergence_pytorch(torch.from_numpy(solution_numpy_norm), torch.from_numpy(preds_df.to_numpy(dtype=np.float32)), epsilon=10**-15))
    # Add dummy columns for the kaggle code
    preds_df['id'] = range(len(preds_df))
    solution['id'] = range(len(solution))
    id_col = solution.pop('id')
    solution = solution.div(solution.sum(axis=1), axis=0)
    solution['id'] = id_col
    print(score(copy.deepcopy(solution), preds_df, row_id_column_name='id', micro_average=True))
    # Remove the cuda cache
    torch.cuda.empty_cache()



Epoch 0 Train: 100%|██████████| 209/209 [03:47<00:00,  1.09s/batch, kaggle_loss=27.2236, loss=27.2236]


tensor(26.9383)
26.938251501884967


Epoch 1 Train: 100%|██████████| 209/209 [03:46<00:00,  1.09s/batch, kaggle_loss=27.2239, loss=27.2239]


tensor(26.9382)
26.938206295903846


Epoch 2 Train: 100%|██████████| 209/209 [03:47<00:00,  1.09s/batch, kaggle_loss=27.2246, loss=27.2246]


tensor(26.9386)
26.938612326007505


Epoch 3 Train: 100%|██████████| 209/209 [03:49<00:00,  1.10s/batch, kaggle_loss=27.2219, loss=27.2219]


tensor(26.9547)
26.954692815407228


Epoch 4 Train: 100%|██████████| 209/209 [03:50<00:00,  1.10s/batch, kaggle_loss=27.2226, loss=27.2226]


tensor(26.9570)
26.956994153684626
