In [1]:
import os
import math
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedGroupKFold
import pyarrow.parquet as pq
import keras_cv
from keras_cv.layers import MixUp, RandomCutout
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.losses import KLDivergence
from keras.callbacks import LearningRateScheduler, ModelCheckpoint


2024-04-05 17:41:32.060663: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-05 17:41:32.060761: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-05 17:41:32.194154: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Set random seed
tf.random.set_seed(42)

In [3]:
# Configuration
class Config:
    verbose = 1
    seed = 42
    preset = "efficientnetv2_b2_imagenet"
    image_size = [400, 300]
    epochs = 13
    batch_size = 64
    lr_mode = "cos"
    drop_remainder = True
    num_classes = 6
    fold = 0
    class_names = ['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other']
    label2name = dict(enumerate(class_names))
    name2label = {v: k for k, v in label2name.items()}

In [4]:
# Dataset Path
BASE_PATH = "/kaggle/input/hms-harmful-brain-activity-classification"
SPEC_DIR = "/tmp/dataset/hms-hbac"
os.makedirs(SPEC_DIR + '/train_spectrograms', exist_ok=True)
os.makedirs(SPEC_DIR + '/test_spectrograms', exist_ok=True)

In [5]:
# Load Metadata
df = pd.read_csv(f'{BASE_PATH}/train.csv')
df['eeg_path'] = f'{BASE_PATH}/train_eegs/' + df['eeg_id'].astype(str) + '.parquet'
df['spec_path'] = f'{BASE_PATH}/train_spectrograms/' + df['spectrogram_id'].astype(str) + '.parquet'
df['spec2_path'] = f'{SPEC_DIR}/train_spectrograms/' + df['spectrogram_id'].astype(str) + '.npy'
df['class_name'] = df.expert_consensus.copy()
df['class_label'] = df.expert_consensus.map(Config.name2label)


In [6]:
# Load Test Metadata
test_df = pd.read_csv(f'{BASE_PATH}/test.csv')
test_df['eeg_path'] = f'{BASE_PATH}/test_eegs/' + test_df['eeg_id'].astype(str) + '.parquet'
test_df['spec_path'] = f'{BASE_PATH}/test_spectrograms/' + test_df['spectrogram_id'].astype(str) + '.parquet'
test_df['spec2_path'] = f'{SPEC_DIR}/test_spectrograms/' + test_df['spectrogram_id'].astype(str) + '.npy'


In [7]:
# Function to process a single spectrogram
def process_spec(spec_id, split="train"):
    spec_path = f"{BASE_PATH}/{split}_spectrograms/{spec_id}.parquet"
    spec = pd.read_parquet(spec_path).fillna(0).values[:, 1:].T.astype("float32")
    np.save(f"{SPEC_DIR}/{split}_spectrograms/{spec_id}.npy", spec)


In [8]:
# Process train spectrograms in parallel
spec_ids = df["spectrogram_id"].unique()
joblib.Parallel(n_jobs=-1, backend="loky")(
    joblib.delayed(process_spec)(spec_id, "train") for spec_id in tqdm(spec_ids, total=len(spec_ids))
)

  0%|          | 0/11138 [00:00<?, ?it/s]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [9]:
# Process test spectrograms in parallel
test_spec_ids = test_df["spectrogram_id"].unique()
joblib.Parallel(n_jobs=-1, backend="loky")(
    joblib.delayed(process_spec)(spec_id, "test") for spec_id in tqdm(test_spec_ids, total=len(test_spec_ids))
)


  0%|          | 0/1 [00:00<?, ?it/s]

[None]

In [10]:
# Function to build augmenter
def build_augmenter(dim=Config.image_size):
    augmenters = [
        MixUp(alpha=2.0),
        RandomCutout(height_factor=(1.0, 1.0), width_factor=(0.06, 0.1)),  # freq-masking
        RandomCutout(height_factor=(0.06, 0.1), width_factor=(1.0, 1.0))  # time-masking
    ]
    
    def augment(img, label):
        data = {"images": img, "labels": label}
        for augmenter in augmenters:
            if tf.random.uniform([]) < 0.5:
                data = augmenter(data, training=True)
        return data["images"], data["labels"]
    
    return augment

In [11]:
# Function to build dataset
def build_dataset(paths, offsets=None, labels=None, batch_size=32, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=False, repeat=True, shuffle=1024, 
                  cache_dir="", drop_remainder=False):
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)
    
    if decode_fn is None:
        decode_fn = build_decoder(labels is not None)
    
    if augment_fn is None:
        augment_fn = build_augmenter()
    
    AUTO = tf.data.experimental.AUTOTUNE
    slices = (paths, offsets) if labels is None else (paths, offsets, labels)
    
    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.map(decode_fn, num_parallel_calls=AUTO)
    ds = ds.cache(cache_dir) if cache else ds
    ds = ds.repeat() if repeat else ds
    if shuffle: 
        ds = ds.shuffle(shuffle, seed=Config.seed)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.map(augment_fn, num_parallel_calls=AUTO) if augment else ds
    ds = ds.prefetch(AUTO)
    return ds

In [12]:
# Function to decode with labels
def build_decoder(with_labels=True, target_size=Config.image_size, dtype=32):
    def decode_signal(path, offset=None):
        file_bytes = tf.io.read_file(path)
        sig = tf.io.decode_raw(file_bytes, tf.float32)
        sig = sig[1024//dtype:]  # Remove header tag
        sig = tf.reshape(sig, [400, -1])
        
        if offset is not None: 
            offset = offset // 2
            sig = sig[:, offset:offset+300]
            pad_size = tf.math.maximum(0, 300 - tf.shape(sig)[1])
            sig = tf.pad(sig, [[0, 0], [0, pad_size]])
            sig = tf.reshape(sig, [400, 300])
        
        sig = tf.clip_by_value(sig, tf.math.exp(-4.0), tf.math.exp(8.0))
        sig = tf.math.log(sig)
        sig -= tf.math.reduce_mean(sig)
        sig /= tf.math.reduce_std(sig) + 1e-6
        sig = tf.tile(sig[..., None], [1, 1, 3])
        return sig
    
    def decode_label(label):
        label = tf.one_hot(label, Config.num_classes)
        label = tf.cast(label, tf.float32)
        label = tf.reshape(label, [Config.num_classes])
        return label
    
    def decode_with_labels(path, offset=None, label=None):
        sig = decode_signal(path, offset)
        label = decode_label(label)
        return (sig, label)
    
    return decode_with_labels if with_labels else decode_signal

In [13]:
# Function to get learning rate callback
def get_lr_callback(batch_size=8, mode='cos', epochs=10, plot=False):
    lr_start, lr_max, lr_min = 5e-5, 6e-6 * batch_size, 1e-5
    lr_ramp_ep, lr_sus_ep, lr_decay = 3, 0, 0.75

    def lrfn(epoch):
        if epoch < lr_ramp_ep: lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        elif epoch < lr_ramp_ep + lr_sus_ep: lr = lr_max
        elif mode == 'exp': lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min
        elif mode == 'step': lr = lr_max * lr_decay**((epoch - lr_ramp_ep - lr_sus_ep) // 2)
        elif mode == 'cos':
            decay_total_epochs, decay_epoch_index = epochs - lr_ramp_ep - lr_sus_ep + 3, epoch - lr_ramp_ep - lr_sus_ep
            phase = math.pi * decay_epoch_index / decay_total_epochs
            lr = (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min
        return lr

    if plot:
        plt.figure(figsize=(10, 5))
        plt.plot(np.arange(epochs), [lrfn(epoch) for epoch in np.arange(epochs)], marker='o')
        plt.xlabel('epoch'); plt.ylabel('lr')
        plt.title('LR Scheduler')
        plt.show()

    return LearningRateScheduler(lrfn, verbose=False)

In [14]:
# Stratified Group K-Fold
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=Config.seed)
df["fold"] = -1
df.reset_index(drop=True, inplace=True)
for fold, (train_idx, valid_idx) in enumerate(sgkf.split(df, y=df["class_label"], groups=df["patient_id"])):
    df.loc[valid_idx, "fold"] = fold

In [15]:
# Build Train & Valid Dataset
sample_df = df.groupby("spectrogram_id").head(1).reset_index(drop=True)
train_df = sample_df[sample_df.fold != Config.fold]
valid_df = sample_df[sample_df.fold == Config.fold]

train_paths = train_df.spec2_path.values
train_offsets = train_df.spectrogram_label_offset_seconds.values.astype(int)
train_labels = train_df.class_label.values
train_ds = build_dataset(train_paths, train_offsets, train_labels, batch_size=Config.batch_size,
                         repeat=True, shuffle=True, augment=True, cache=True)

valid_paths = valid_df.spec2_path.values
valid_offsets = valid_df.spectrogram_label_offset_seconds.values.astype(int)
valid_labels = valid_df.class_label.values
valid_ds = build_dataset(valid_paths, valid_offsets, valid_labels, batch_size=Config.batch_size,
                         repeat=False, shuffle=False, augment=False, cache=True)


In [16]:
# Build Classifier
model = keras_cv.models.ImageClassifier.from_preset(
    Config.preset, num_classes=Config.num_classes
)


Attaching 'config.json' from model 'keras/efficientnetv2/keras/efficientnetv2_b2_imagenet/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/efficientnetv2/keras/efficientnetv2_b2_imagenet/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/efficientnetv2/keras/efficientnetv2_b2_imagenet/2' to your Kaggle notebook...


In [17]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss=KLDivergence())


In [18]:
# Model Checkpointing
ckpt_cb = ModelCheckpoint("best_model.keras",
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False,
                          mode='min')


In [19]:
# Training
history = model.fit(
    train_ds,
    epochs=Config.epochs,
    callbacks=[get_lr_callback(Config.batch_size, mode=Config.lr_mode), ckpt_cb],
    steps_per_epoch=len(train_df)//Config.batch_size,
    validation_data=valid_ds,
    verbose=Config.verbose
)

# Load Best Model
model.load_weights("best_model.keras")

Epoch 1/13


I0000 00:00:1712339281.900306     116 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 702ms/step - loss: 1.4362 - val_loss: 1.3628 - learning_rate: 5.0000e-05
Epoch 2/13
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 510ms/step - loss: 1.1365 - val_loss: 1.0718 - learning_rate: 1.6133e-04
Epoch 3/13
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 505ms/step - loss: 0.9840 - val_loss: 1.0047 - learning_rate: 2.7267e-04
Epoch 4/13
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 508ms/step - loss: 0.8861 - val_loss: 0.9367 - learning_rate: 3.8400e-04
Epoch 5/13
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 508ms/step - loss: 0.8321 - val_loss: 0.8893 - learning_rate: 3.7485e-04
Epoch 6/13
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 507ms/step - loss: 0.7855 - val_loss: 0.8597 - learning_rate: 3.4829e-04
Epoch 7/13
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 507ms/step - loss: 0.7261 

In [20]:
# Build Test Dataset
test_paths = test_df.spec2_path.values
test_ds = build_dataset(test_paths, batch_size=min(Config.batch_size, len(test_df)),
                        repeat=False, shuffle=False, cache=False, augment=False)


In [21]:
# Inference
preds = model.predict(test_ds)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step


In [22]:
# Submission
pred_df = test_df[["eeg_id"]].copy()
target_cols = [x.lower() + '_vote' for x in Config.class_names]
pred_df[target_cols] = preds.tolist()

sub_df = pd.read_csv(f'{BASE_PATH}/sample_submission.csv')
sub_df = sub_df[["eeg_id"]].copy()
sub_df = sub_df.merge(pred_df, on="eeg_id", how="left")
sub_df.to_csv("/kaggle/working/submission.csv", index=False)

sub_df.head()

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.016673,0.194937,0.000628,0.682673,0.010801,0.094289
