In [2]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from hjorth import Hjorth
from paths import paths

In [2]:
def smoother(fn, skip=5):
    """Block-average one raw EEG parquet file and store the result as ``.npy``.

    The file ``paths.TRAIN_EEGS + fn`` is read, every column is cut into
    consecutive blocks of ``skip`` rows (the last block may be shorter), and
    each block is replaced by its NaN-ignoring mean. Whatever NaN remains
    (blocks that contain only NaN) is turned into 0 by ``np.nan_to_num``.

    Args:
        fn: File name of the parquet file; the text before the first '.' is
            used as the output file stem.
        skip: Number of consecutive rows averaged into one output row.

    Side effects:
        Writes ``cleaned_train_eegs_{skip}/<stem>.npy``.
    """
    stem = fn.split('.')[0]
    raw = pd.read_parquet(paths.TRAIN_EEGS + fn)
    n_rows = raw.shape[0]

    averaged = pd.DataFrame()
    for channel in raw.columns:
        values = np.array(raw[channel])
        block_means = []
        for start in range(0, n_rows, skip):
            # nanmean so that isolated missing samples do not poison a block
            block_means.append(np.nanmean(values[start: start + skip]))
        averaged[channel] = np.nan_to_num(block_means)

    np.save(f'cleaned_train_eegs_{skip}/{stem}.npy', averaged)

In [3]:
def parallel_smoother(skip):
    """Apply ``smoother`` to every file name found under ``./train_eegs``.

    ``os.walk`` yields one file list per directory it visits; each list is
    dispatched to a joblib pool created with ``n_jobs=-1`` while ``tqdm``
    reports progress over the submitted file names.

    Args:
        skip: Block size forwarded to ``smoother``.
    """
    # NOTE(review): only bare file names are forwarded, so files living in
    # sub-directories would be looked up in the wrong place - confirm the
    # directory is flat.
    for _dirpath, _dirnames, filenames in os.walk("./train_eegs"):
        pool = Parallel(n_jobs=-1)
        pool(delayed(smoother)(filename, skip) for filename in tqdm(filenames))

In [22]:
def custom_smoother(fn, skip=20):
    """Coarsen an already block-averaged EEG array by a further integer factor.

    Loads ``paths.TRAIN_CLEAN_10 + <stem> + '.npy'``, averages consecutive
    groups of ``skip // 10`` rows in every column (the last group may be
    shorter) and saves the result to ``cleaned_train_eegs_{skip}/<stem>.npy``.

    Previously the group size was hard-coded to 2 while ``skip`` only selected
    the output directory, so any ``skip`` other than 20 produced data that did
    not match its directory name. The default (``skip=20``) behaves exactly as
    before.

    Args:
        fn: File name whose text before the first '.' identifies the EEG.
        skip: Target block size of the output, relative to the raw signal.
            Must be a positive multiple of 10.

    Raises:
        ValueError: If ``skip`` is not a positive multiple of 10.
    """
    # Assumes the arrays under paths.TRAIN_CLEAN_10 were averaged with a block
    # size of 10 (as the constant's name suggests) - TODO confirm.
    source_skip = 10
    if skip <= 0 or skip % source_skip != 0:
        raise ValueError(
            f'skip must be a positive multiple of {source_skip}, got {skip!r}'
        )
    window = skip // source_skip

    stem = fn.split('.')[0]
    eeg = pd.DataFrame(np.load(paths.TRAIN_CLEAN_10 + stem + '.npy'))
    numrows = eeg.shape[0]

    smoothed_eeg = pd.DataFrame()
    for col in eeg.columns:
        given = np.array(eeg[col])
        smoothed_eeg[col] = [
            np.mean(given[i: i + window]) for i in range(0, numrows, window)
        ]
    np.save(f'cleaned_train_eegs_{skip}/{stem}.npy', smoothed_eeg)

In [23]:
def custom_parallel_smoother(skip):
    """Apply ``custom_smoother`` to every file name found under ``./train_eegs``.

    The file names are only used for their stem; ``custom_smoother`` itself
    loads its input from ``paths.TRAIN_CLEAN_10``.

    Args:
        skip: Value forwarded to ``custom_smoother``.
    """
    for _dirpath, _dirnames, filenames in os.walk('./train_eegs'):
        pool = Parallel(n_jobs=-1)
        pool(delayed(custom_smoother)(filename, skip) for filename in tqdm(filenames))

In [32]:
from scipy.fft import fft

def generalized(eeg):
    """Return a log-scaled left/right spectral asymmetry score for one EEG window.

    For every channel named in ``paths.LABEL_SIDES`` the FFT magnitude of
    ``eeg[channel]`` is computed. Magnitudes of channels tagged ``'left'`` are
    summed, likewise those tagged ``'right'``; channels with any other tag are
    transformed but not accumulated. Both sums are cropped to ``[30:-15]`` and
    combined as::

        score = mean(left - right)**2 / mean(left + right)**2

    Args:
        eeg: Mapping-like object (e.g. a DataFrame) indexable by channel name.

    Returns:
        ``-log(score)``.
    """
    def _accumulate(total, magnitude):
        # The first spectrum seeds the sum; later ones are added on top.
        return magnitude if total is None else total + magnitude

    left_sum = None
    right_sum = None
    for channel, side in paths.LABEL_SIDES.items():
        samples = np.array(eeg[channel]).reshape(-1,)
        magnitude = np.abs(fft(samples))
        if side == 'left':
            left_sum = _accumulate(left_sum, magnitude)
        elif side == 'right':
            right_sum = _accumulate(right_sum, magnitude)

    left_band = np.array(left_sum[30:-15])
    right_band = np.array(right_sum[30:-15])

    difference_power = np.mean(left_band - right_band) ** 2
    total_power = np.mean(left_band + right_band) ** 2
    return -np.log(difference_power / total_power)

In [36]:
def hjorth_data(id, sub_id, offset):
    """Build and save the feature vector for one labelled EEG window.

    Loads ``paths.TRAIN_CLEAN_10 + '<id>.npy'``, selects the rows labelled
    ``(offset + 20) * 20`` through ``(offset + 30) * 20`` and names the columns
    with ``paths.EEG_LABELS``. For each column the three values returned by
    ``Hjorth(...).amc()`` are stored consecutively; the final slot holds
    ``generalized(window)``.

    Args:
        id: EEG identifier (converted with ``int``) used for the input and
            output file names.
        sub_id: Sub-window identifier (converted with ``int``) used for the
            output file name.
        offset: Offset added to 20 and 30 before scaling by 20 to obtain the
            row labels of the window.

    Side effects:
        Writes ``hjorth_10/<id>_<sub_id>.npy`` containing an array of length
        ``3 * n_columns + 1``.
    """
    eeg_id = int(id)
    full_eeg = pd.DataFrame(np.load(paths.TRAIN_CLEAN_10 + f'{eeg_id}.npy'))

    # NOTE(review): .loc slicing is label-based and includes BOTH endpoints,
    # so this window holds one row more than (30 - 20) * 20 - confirm intended.
    first_row = (offset + 20) * 20
    last_row = (offset + 30) * 20
    window = full_eeg.loc[first_row:last_row]
    window.columns = paths.EEG_LABELS

    features = np.zeros(3 * len(window.columns) + 1)
    for position, channel in enumerate(window.columns):
        base = 3 * position
        # presumably activity, mobility, complexity - verify against Hjorth
        features[base], features[base + 1], features[base + 2] = Hjorth(
            list(window[channel]), 20
        ).amc()
    features[-1] = generalized(window)

    np.save(f'hjorth_10/{eeg_id}_{int(sub_id)}.npy', features)

In [None]:
# Read the training metadata table and keep only the three columns that
# hjorth_data consumes.
df = pd.read_csv(paths.TRAIN_CSV)
file_data = df[['eeg_id', 'eeg_sub_id', 'eeg_label_offset_seconds']]
# Rename the columns so that each row can be unpacked straight into
# hjorth_data(id=..., sub_id=..., offset=...).
file_data.columns = ['id', 'sub_id', 'offset']

# One hjorth_data job per metadata row; `**file_data.loc[idx]` passes the row's
# values as keyword arguments. The jobs are run on a joblib pool (n_jobs=-1).
Parallel(n_jobs=-1)(delayed(hjorth_data)(**file_data.loc[idx]) for idx in tqdm(file_data.index))

In [2]:
from sklearn.impute import KNNImputer
from scipy.signal import butter, lfilter

def butter_lowpass_filter(data, cutoff_freq: int = 20, sampling_rate: int = 200, order: int = 4):
    """Low-pass filter ``data`` along axis 0 with a digital Butterworth filter.

    Args:
        data: Array-like signal; filtering runs along its first axis.
        cutoff_freq: Cut-off frequency, in the same unit as ``sampling_rate``.
        sampling_rate: Sampling frequency of ``data``.
        order: Order of the Butterworth filter.

    Returns:
        The filtered signal as produced by ``scipy.signal.lfilter`` (a single
        forward pass).
    """
    # butter() expects the cut-off as a fraction of the Nyquist frequency.
    normal_cutoff = cutoff_freq / (0.5 * sampling_rate)
    numerator, denominator = butter(order, normal_cutoff, btype='low', analog=False)
    return lfilter(numerator, denominator, data, axis=0)

# Channel (column) names that traindata requests from each EEG parquet file.
FEATURES = ['Fp1', 'T3', 'C3', 'O1', 'Fp2', 'C4', 'T4', 'O2']

def traindata(fn):
    """Clean one raw EEG parquet file and save it as a down-sampled ``.npy``.

    Steps: read the ``FEATURES`` columns of ``paths.TRAIN_EEGS + fn``, keep the
    first 10000 rows, impute missing values column by column with a
    ``KNNImputer``, low-pass filter with ``butter_lowpass_filter`` and keep
    every 5th row.

    The row selection uses positional ``.iloc[:10000]``. The previous
    label-based ``.loc[:10000]`` includes its end label and therefore kept
    10001 rows (2001 after down-sampling), one more than ``denoised_data``
    keeps for the same files.

    Args:
        fn: File name of the parquet file; the text before the first '.' is
            used as the output file stem.

    Side effects:
        Writes ``clean_train/<stem>.npy``.
    """
    stem = fn.split('.')[0]
    eeg = pd.read_parquet(paths.TRAIN_EEGS + fn, columns=FEATURES)
    # .copy() so the imputed values below are written into an independent
    # frame rather than into a slice of the frame that was read.
    eeg = eeg.iloc[:10000].copy()
    if eeg.isna().sum().sum() != 0:
        for col in eeg.columns:
            imputer = KNNImputer(n_neighbors=4)
            eeg[col] = imputer.fit_transform(eeg.loc[:, [col]])
    np_eeg = eeg.to_numpy()
    np_eeg = butter_lowpass_filter(np_eeg)
    np_eeg = np_eeg[::5, :]
    np.save(f'clean_train/{stem}.npy', np_eeg)
    

In [3]:
def parallel_cleaner():
    """Apply ``traindata`` to every file name found under ``./train_eegs``.

    Each file list yielded by ``os.walk`` is dispatched to a joblib pool
    created with ``n_jobs=-1``; ``tqdm`` reports progress over the file names.
    """
    for _dirpath, _dirnames, filenames in os.walk("./train_eegs"):
        pool = Parallel(n_jobs=-1)
        pool(delayed(traindata)(filename) for filename in tqdm(filenames))

In [7]:
from sklearn.impute import KNNImputer
import pywt

def maddest(d, axis=None):
    """Return the mean absolute deviation of ``d`` around its mean.

    Note that, despite the name, this is the *mean* (not median) absolute
    deviation.

    The centre is computed with ``keepdims=True`` so that it broadcasts
    against ``d`` for any ``axis``. Without it, reducing over an axis other
    than the last one either failed to broadcast or silently subtracted the
    wrong values. Results for ``axis=None`` are unchanged.

    Args:
        d: Array-like of numbers.
        axis: Axis (or axes) to reduce over; ``None`` reduces over everything.

    Returns:
        The mean absolute deviation, reduced over ``axis``.
    """
    values = np.asarray(d)
    centre = np.mean(values, axis, keepdims=True)
    return np.mean(np.absolute(values - centre), axis)

def denoise(x, wavelet='db8', level=1):
    """Wavelet-denoise every column of ``x`` with a hard universal threshold.

    Each column is decomposed with ``pywt.wavedec`` (periodization mode), the
    noise scale is estimated from ``coeff[-level]`` via ``maddest``, all
    detail coefficients are hard-thresholded at
    ``sigma * sqrt(2 * log(len(x)))`` and the signal is reconstructed.

    In periodization mode the reconstruction of an odd-length signal comes
    back one sample longer than the input, which made the column assignment
    fail. The reconstruction is therefore truncated to ``len(x)``;
    even-length inputs are unaffected.

    Args:
        x: DataFrame whose columns are the signals to denoise.
        wavelet: Wavelet name passed to PyWavelets.
        level: Which detail level (counted from the finest, ``coeff[-level]``)
            is used to estimate the noise scale.

    Returns:
        Array with the same shape as ``x`` holding the denoised columns.
    """
    n_samples = len(x)
    ret = np.zeros_like(x)
    # Loop-invariant factor of the universal threshold.
    universal_factor = np.sqrt(2 * np.log(n_samples))

    for col_idx, pos in enumerate(x.columns):
        coeff = pywt.wavedec(x[pos], wavelet, mode="per")
        sigma = (1 / 0.6745) * maddest(coeff[-level])

        uthresh = sigma * universal_factor
        # coeff[0] holds the approximation and is left untouched.
        coeff[1:] = [
            pywt.threshold(detail, value=uthresh, mode='hard')
            for detail in coeff[1:]
        ]

        ret[:, col_idx] = pywt.waverec(coeff, wavelet, mode='per')[:n_samples]

    return ret

# Channel (column) names that denoised_data requests from each EEG parquet
# file. Identical to the FEATURES list assigned in an earlier cell.
FEATURES = ['Fp1', 'T3', 'C3', 'O1', 'Fp2', 'C4', 'T4', 'O2']

def denoised_data(fn):
    """Wavelet-denoise one raw EEG parquet file and save it as ``.npy``.

    Reads the ``FEATURES`` columns of ``paths.TRAIN_EEGS + fn``, keeps the rows
    labelled 0 through 9999, imputes missing values column by column with a
    ``KNNImputer`` when any are present, runs ``denoise`` and keeps every 5th
    row of the result.

    Args:
        fn: File name of the parquet file; the text before the first '.' is
            used as the output file stem.

    Side effects:
        Writes ``denoised_train/<stem>.npy``.
    """
    stem = fn.split('.')[0]
    frame = pd.read_parquet(paths.TRAIN_EEGS + fn, columns=FEATURES)
    frame = frame.loc[:9999, :]

    has_missing = bool(frame.isna().to_numpy().any())
    if has_missing:
        for channel in frame.columns:
            # A fresh single-column imputer per channel.
            frame[channel] = KNNImputer(n_neighbors=4).fit_transform(
                frame.loc[:, [channel]]
            )

    downsampled = denoise(frame)[::5, :]
    np.save(f'denoised_train/{stem}.npy', downsampled)

In [9]:
def parallel_denoiser():
    """Apply ``denoised_data`` to every file name found under ``./train_eegs``.

    Each file list yielded by ``os.walk`` is dispatched to a joblib pool
    created with ``n_jobs=-1``; ``tqdm`` reports progress over the file names.
    """
    for _dirpath, _dirnames, filenames in os.walk("./train_eegs"):
        pool = Parallel(n_jobs=-1)
        pool(delayed(denoised_data)(filename) for filename in tqdm(filenames))

In [10]:
# Denoise every file under ./train_eegs; denoised_data writes one .npy per
# file into denoised_train/.
parallel_denoiser()

  0%|          | 0/17300 [00:00<?, ?it/s]

  4%|▎         | 612/17300 [00:09<02:52, 96.65it/s] 
100%|██████████| 17300/17300 [08:30<00:00, 33.86it/s] 


Select the 8 main channels out of the 20 in each EEG and keep only the first 2,000 rows (samples) of each recording.

In [4]:
# The 8 channels that extract_main copies, in output-column order.
FEATURES = ['Fp1', 'T3', 'C3', 'O1', 'Fp2', 'C4', 'T4', 'O2']
# Channel name -> position among the first 20 entries of paths.EEG_LABELS.
# NOTE(review): the hard-coded 20 assumes EEG_LABELS has at least 20 entries.
FEATS2OLDIDX = {paths.EEG_LABELS[i] : i for i in range(20)}
print(FEATS2OLDIDX)

def extract_main(fn):
    """Copy the ``FEATURES`` channels of one cleaned EEG into a (2000, 8) array.

    Loads ``paths.TRAIN_CLEAN_5 + fn``, keeps its first 2000 rows and, for each
    name in ``FEATURES``, copies the column found at ``FEATS2OLDIDX[name]``
    into the matching column of a zero-initialised float array.

    Args:
        fn: File name of the ``.npy`` file; the text before the first '.' is
            used as the output file stem.

    Side effects:
        Writes ``train_sk5_8/<stem>.npy``.
    """
    stem = fn.split('.')[0]
    source = np.load(paths.TRAIN_CLEAN_5 + fn)[:2000]

    source_columns = [FEATS2OLDIDX[channel] for channel in FEATURES]
    selected = np.zeros((2000, 8))
    for target_col, source_col in enumerate(source_columns):
        selected[:, target_col] = source[:, source_col]

    np.save(f'train_sk5_8/{stem}.npy', selected)

{'Fp1': 0, 'F3': 1, 'C3': 2, 'P3': 3, 'F7': 4, 'T3': 5, 'T5': 6, 'O1': 7, 'Fz': 8, 'Cz': 9, 'Pz': 10, 'Fp2': 11, 'F4': 12, 'C4': 13, 'P4': 14, 'F8': 15, 'T4': 16, 'T6': 17, 'O2': 18, 'EKG': 19}


In [5]:
def parallel_extract_main():
    """Apply ``extract_main`` to every file name found under ``./cleaned_train_eegs_5``.

    Each file list yielded by ``os.walk`` is dispatched to a joblib pool
    created with ``n_jobs=-1``; ``tqdm`` reports progress over the file names.
    """
    for _dirpath, _dirnames, filenames in os.walk("./cleaned_train_eegs_5"):
        pool = Parallel(n_jobs=-1)
        pool(delayed(extract_main)(filename) for filename in tqdm(filenames))

In [6]:
# Extract the 8 FEATURES channels from every file under ./cleaned_train_eegs_5;
# extract_main writes one .npy per file into train_sk5_8/.
parallel_extract_main()

100%|██████████| 17300/17300 [00:28<00:00, 615.12it/s]
