In [3]:
import os
import numpy as np
import pandas as pd
import mne
from mne.preprocessing import ICA
import pywt
from scipy import signal
from scipy.interpolate import Rbf
from PIL import Image
from tqdm import tqdm
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore")
mne.set_log_level("WARNING") 

In [4]:
label_num = {'Seizure': 0, 'GPD': 1, 'LPD': 2, 'GRDA': 3, 'LRDA': 4, 'Other': 5}
channel_names = ['Fp1','F3','C3','P3','F7','T3','T5','O1','Fz','Cz','Pz','Fp2','F4','C4','P4','F8','T4','T6','O2']
channel_types = ['eeg'] * len(channel_names)
sfreq = 200

channel_indices = {name: idx for idx, name in enumerate(channel_names)}

grid_structure = [
    [0,    'Fp1', 0,    'Fp2', 0],
    ['F7', 'F3',  'Fz', 'F4',  'F8'],
    ['T3', 'C3',  'Cz', 'C4',  'T4'],
    ['T5', 'P3',  'Pz', 'P4',  'T6'],
    [0,    'O1',  0,    'O2',  0]
]

In [5]:
def signal_processing(a,b):
    eeg = pd.read_parquet("/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/"+str(a)+".parquet")
    eeg = eeg[int(b)*200:int(50+b)*200] #[start, stop, step] #(n_samples, n_channels)
    eeg.fillna(eeg.mean(), inplace = True) 
    eeg_data = eeg.iloc[:,:-1].to_numpy().T  #(n_channels, n_samples)
    
    info = mne.create_info(ch_names = channel_names, sfreq = sfreq, ch_types = channel_types)
    raw = mne.io.RawArray(eeg_data, info)
    std_montage = mne.channels.make_standard_montage('standard_1020')
    raw.set_montage(std_montage)

    raw.notch_filter(60)

    raw.filter(l_freq = 1, h_freq = 50, method = 'iir', iir_params = None)

    raw_data = raw.get_data() 
    raw_times = raw.times
    info = raw.info 
    normalized_eeg = mne.baseline.rescale(raw_data, raw_times, baseline = (None, None), mode = 'mean')
    normalized_eeg = mne.io.RawArray(normalized_eeg, info)

    return normalized_eeg

In [6]:
def wavelet_enhanced_ica(normalized_eeg):
    ica = mne.preprocessing.ICA(n_components = 19, method = 'infomax', fit_params = dict(extended=True), max_iter = 'auto')
    ica.fit(normalized_eeg.copy())
    
    data = normalized_eeg.get_data()
    mixing_matrix = ica.mixing_matrix_
    unmixing_matrix = ica.unmixing_matrix_
    pca_components = ica.pca_components_
    pca_mean = ica.pca_mean_
    n_pca_components = ica.n_components_

    data_centered = data - pca_mean[:, np.newaxis]
    pca_data = np.dot(pca_components[:n_pca_components], data_centered)
    ica_sources = np.dot(unmixing_matrix, pca_data)

    denoised_components = np.zeros(data.shape)
    wavelet_family = 'db4'
    resolution = 5
    
    for i in range(19):
        coefficients = pywt.wavedec(ica_sources[i], wavelet = wavelet_family, level = resolution)
        sigma = (1/0.6745) * np.median(np.abs(coefficients[5] - np.median(coefficients[5]))) 
        threshold = sigma * np.sqrt(2 * np.log(len(coefficients[5])/sfreq)) 
        coefficients[1:] = [np.where(np.abs(coeff) > threshold, 0, coeff) for coeff in coefficients[1:]]
        denoised_components[i] = pywt.waverec(coefficients, wavelet = wavelet_family)
        
    ica_reconstructed = np.dot(mixing_matrix, denoised_components)
    reconstructed_data = np.dot(pca_components[:n_pca_components].T, ica_reconstructed)
    reconstructed_data += pca_mean[:, np.newaxis] # numpy.ndarray:shape(19, 10000):data.shape

    return reconstructed_data # (n_channels, n_samples)

In [7]:
def feature_maps(eeg_data):
    representation_matrix = np.zeros((10000, 5, 5))
    for t in range(10000):
        for i in range(5):
            for j in range(5):
                channel = grid_structure[i][j]
                if channel != 0: 
                    representation_matrix[t, i, j] = eeg_data[channel_indices[channel], t]
    
    rbf_interpolated_data = np.zeros_like(representation_matrix)
    x_grid, y_grid = np.meshgrid(np.arange(5), np.arange(5))
    known_positions = []
    known_values = []

    for t in range(10000):
        known_positions = []
        known_values = []
        for i in range(5):
            for j in range(5):
                if grid_structure[i][j] != 0:  
                    known_positions.append([i, j])
                    known_values.append(representation_matrix[t, i, j])
        known_positions = np.array(known_positions)
        known_values = np.array(known_values)
        if len(known_values) == 0:
            continue
        rbf = Rbf(known_positions[:, 0], known_positions[:, 1], known_values, function='gaussian')
        for i in range(5):
            for j in range(5):
                if grid_structure[i][j] == 0: 
                    rbf_interpolated_data[t, i, j] = rbf(i, j)
                else: 
                    rbf_interpolated_data[t, i, j] = representation_matrix[t, i, j]
                    
    feature_maps = np.zeros((10000, 64, 64))
    dim = (64, 64)
    for i in range(10000):
        image = Image.fromarray(rbf_interpolated_data[i])
        resized_map = image.resize(dim, Image.Resampling.BICUBIC)
        resized_map_array = np.array(resized_map)
        feature_maps[i] = resized_map_array

    extended_feature_maps = np.expand_dims(feature_maps, axis=1)
    
    window_size = 200
    T, C, H, W = extended_feature_maps.shape
    assert T % window_size == 0, "Time dimension must be divisible by window_size"
    T_new = T // window_size
    reduced_feature_map = np.median(extended_feature_maps.reshape(T_new, window_size, C, H, W), axis=1)
    
    normalized_feature_map = np.empty_like(reduced_feature_map, dtype=np.float32)
    
    for i in range(reduced_feature_map.shape[0]):
        img = reduced_feature_map[i, 0]
        min_val = np.min(img)
        max_val = np.max(img)
        if max_val > min_val:
            norm_img = (img - min_val) / (max_val - min_val)
        else:
            norm_img = img  
        normalized_feature_map[i, 0] = norm_img
        
    return normalized_feature_map

In [6]:
# import os
# import shutil

# folder_path = '/kaggle/working/Train_1'

# if os.path.exists(folder_path):
#     if os.path.isdir(folder_path):
#         shutil.rmtree(folder_path)  # Deletes the folder and all its contents
#         print(f"{folder_path} has been deleted.")
#     else:
#         print(f"{folder_path} is not a directory.")
# else:
#     print(f"{folder_path} does not exist.")

/kaggle/working/Train_1 has been deleted.


In [8]:
train_df = pd.read_csv('/kaggle/input/csv-files/train_sampled.csv', usecols=['eeg_id', 'eeg_label_offset_seconds', 'eeg_sub_id', 'expert_consensus'])

total_rows = len(train_df)

split1 = total_rows // 3
split2 = 2 * (total_rows // 3)

df_part1 = train_df.iloc[:split1]  
df_part2 = train_df.iloc[split1:split2]  
df_part3 = train_df.iloc[split2:]  

half = len(df_part1) // 2
part1_1 = df_part1[:half]
part1_2 = df_part1[half:]

In [10]:
l = len(part1_2) // 2
part1_2_1 = part1_2[:l]
part1_2_2 = part1_2[l:]

In [11]:
# test_df = pd.read_csv('/kaggle/input/csv-files/test_sampled.csv', usecols=['eeg_id', 'eeg_label_offset_seconds', 'eeg_sub_id', 'expert_consensus'])

output_dir = '/kaggle/working/Train_1_2_2' 
os.makedirs(output_dir, exist_ok = True)


def process_eeg(eeg_id, df, output_dir, label_num):
    eeg_data = df[df["eeg_id"] == eeg_id]
    for _, row in eeg_data.iterrows():
        a = row["eeg_id"]
        b = int(row["eeg_label_offset_seconds"])
        c = row["eeg_sub_id"]
        label = label_num[row['expert_consensus']]
        
        normalized_eeg = signal_processing(a, b)
        artifact_corrected_data = wavelet_enhanced_ica(normalized_eeg)  # (19, 10000)
        reduced_feature_maps = feature_maps(artifact_corrected_data)            # (50, 1, 64, 64)

        np.savez(f"{output_dir}/{a}_{c}_feature_maps.npz", features = reduced_feature_maps, label = np.array([label]))

Parallel(n_jobs=4)(delayed(process_eeg)(eeg_id, part1_2_2, output_dir, label_num) for eeg_id in tqdm(part1_2_2["eeg_id"].unique()))

100%|██████████| 834/834 [3:06:22<00:00, 13.41s/it]  


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [12]:
!zip -r Train_1_2_2.zip Train_1_2_2

  adding: Train_1_2_2/ (stored 0%)
  adding: Train_1_2_2/3127951892_45_feature_maps.npz (deflated 11%)
  adding: Train_1_2_2/2860052642_492_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/3572672408_44_feature_maps.npz (deflated 11%)
  adding: Train_1_2_2/4072150916_3_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/4233815620_105_feature_maps.npz (deflated 11%)
  adding: Train_1_2_2/2259539799_589_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/429744439_22_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/2259539799_421_feature_maps.npz (deflated 11%)
  adding: Train_1_2_2/2922201798_2_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/1596590162_65_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/2654556255_3_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/2436005794_0_feature_maps.npz (deflated 12%)
  adding: Train_1_2_2/1309740482_7_feature_maps.npz (deflated 11%)
  adding: Train_1_2_2/4233815620_57_feature_maps.npz (deflated 12%)
  adding: Train

In [13]:
from IPython.display import FileLink 
FileLink(r'Train_1_2_2.zip')

In [5]:
import os
import shutil

folder_path = '/kaggle/working/Train_1_2'

if os.path.exists(folder_path):
    if os.path.isdir(folder_path):
        shutil.rmtree(folder_path)  # Deletes the folder and all its contents
        print(f"{folder_path} has been deleted.")
    else:
        print(f"{folder_path} is not a directory.")
else:
    print(f"{folder_path} does not exist.")

/kaggle/working/Train_1_2 has been deleted.


In [2]:
import os

file_path = '/kaggle/working/Train_1_2_1.zip'

if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_path} has been deleted.")
else:
    print(f"{file_path} does not exist.") 

/kaggle/working/Train_1_2_1.zip has been deleted.


In [1]:
import os
print(os.path.exists('/kaggle/working/Train_1.zip'))

True
