In [None]:
import pandas as pd
import glob
from IPython.display import display

# Read the train and test CSV tables one after the other and render each.
# `df` is rebound on every pass, so it holds the test table when the cell ends.
for csv_path in ('Data/train.csv', 'Data/test.csv'):
    df = pd.read_csv(csv_path)
    display(df)

In [None]:
# Load a single sample spectrogram parquet file and show it as the cell output.
path = 'Data/train_spectrograms/353733.parquet'
data = pd.read_parquet(path)
data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import cv2
from scipy.stats import boxcox

def norm_to_255(img):
    """Min-max scale `img` to the 0..255 range and return it as uint8.

    NaN entries are ignored when computing the minimum and maximum and are
    written as 0 in the output. If the value range is zero (constant image)
    or not finite, an all-zero uint8 array of the same shape is returned
    instead of dividing by zero.

    Args:
        img: numpy array of numeric values.

    Returns:
        uint8 numpy array with the same shape as `img`.
    """
    shifted = img - np.nanmin(img)
    peak = np.nanmax(shifted)
    # A constant image gives peak == 0; inf/NaN input gives a non-finite peak.
    # Either way the division below would only produce NaN, so bail out early.
    if not np.isfinite(peak) or peak <= 0:
        return np.zeros(np.shape(img), dtype=np.uint8)
    scaled = shifted / peak * 255
    # NaN has no defined uint8 value; map it to 0 before casting.
    return np.nan_to_num(scaled, nan=0.0).astype(np.uint8)

def norm_to_standard(img):
    """Standardize `img` using its NaN-ignoring global mean and std.

    The result is (img - mean) / (std + 1e-6); any NaN left in the result is
    replaced by 0.0.

    Args:
        img: numpy array of numeric values (may contain NaN).

    Returns:
        numpy array with the same shape as `img`.
    """
    ep = 1e-6  # keeps the division finite when the std is zero
    values = img.flatten()
    centered = img - np.nanmean(values)
    scaled = centered / (np.nanstd(values) + ep)
    return np.nan_to_num(scaled, nan=0.0)

from scipy.special import erfinv as sp_erfinv


def _show_hist_and_image(values, image):
    """Plot one histogram over every finite entry of `values`, then draw `image`."""
    flat = np.asarray(values).ravel()
    # plt.hist reads a 2D array as one dataset per column, so the array is
    # flattened to get a single value distribution. Non-finite entries
    # (e.g. log(0) = -inf) are dropped because they make the bin range undefined.
    plt.hist(flat[np.isfinite(flat)])
    plt.show()
    plt.imshow(image)
    plt.show()


# Compare several intensity transforms on the 'LL' columns of one spectrogram.
path= 'Data/train_spectrograms/353733.parquet'

data= pd.read_parquet(path)
col= list(data.filter(like='LL', axis=1))

# 1) Raw values, min-max scaled to uint8.
img= data[col].T.values
print(img.min(), img.max(), img.mean())
img= norm_to_255(img)
_show_hist_and_image(img, img)

# 2) Log transform followed by standardization.
img= np.log(data[col].T.values)
print(img.min(), img.max(), img.mean())
img= norm_to_standard(img)
_show_hist_and_image(img, img)

# 3) Log transform; histogram of the log values, image scaled to uint8.
img_log= np.log(data[col].T.values)
print(img_log.min(), img_log.max(), img_log.mean())
img= norm_to_255(img_log)
_show_hist_and_image(img_log, img)

# 4) CLAHE (OpenCV default settings) applied to the uint8-scaled raw values.
img= norm_to_255(data[col].T.values)
clahe= cv2.createCLAHE()
img= clahe.apply(img)
print(img.min(), img.max(), img.mean())
_show_hist_and_image(img, img)

# 5) Square-root transform.
img_sqrt= np.sqrt(data[col].T.values)
print(img_sqrt.min(), img_sqrt.max(), img_sqrt.mean())
img= norm_to_255(img_sqrt)
_show_hist_and_image(img_sqrt, img)

# 6) Box-Cox transform fitted on the flattened values.
# NOTE(review): scipy's boxcox requires strictly positive input - confirm the
# spectrogram contains no zeros/NaNs, otherwise this step raises.
img= data[col].T.values
shape= img.shape
img_boxcox, lambda_value = boxcox(img.reshape(-1))
img_boxcox= img_boxcox.reshape(shape)
print(img_boxcox.min(), img_boxcox.max(), img_boxcox.mean())
img= norm_to_255(img_boxcox)
_show_hist_and_image(img_boxcox, img)

# 7) Rank-based Gaussianization via the inverse error function.
# NOTE(review): argsort() works along the last axis, so ranks are computed
# independently inside each row, not over the whole image - confirm intended.
epsilon = 1e-6
img_rank= np.log(data[col].T.values)
img_rank= img_rank.argsort().argsort()
img_rank = (img_rank/img_rank.max()-0.5)*2 # scale to (-1,1)
img_rank = np.clip(img_rank,-1+epsilon,1-epsilon)  # erfinv is infinite at exactly +/-1
img_rank = sp_erfinv(img_rank) # map to gaussian
print(img_rank.min(), img_rank.max(), img_rank.mean())
img= norm_to_255(img_rank)
_show_hist_and_image(img_rank, img)

# Make Train_Spectrogram csv

In [None]:
df= pd.read_csv('Data/train.csv')

cls_name= ['Seizure','LPD','GPD','LRDA','GRDA','Other']

# One output row per spectrogram: keep the first train.csv row of every
# spectrogram_id, in order of first appearance.
first_rows= df.drop_duplicates(subset='spectrogram_id', keep='first').reset_index(drop=True)

# NOTE(review): the vote columns are selected by position (columns[-8:-2]);
# confirm this slice matches the six per-class vote columns of the train.csv in use.
votes= first_rows[df.columns[-8:-2]].to_numpy(dtype=float)
# Normalize each row of votes so it sums to 1.
soft_labels= votes / votes.sum(axis=1, keepdims=True)

# Build the whole table at once instead of growing it one cell at a time.
new_df= pd.DataFrame({
    'spectrogram_id': first_rows['spectrogram_id'].astype(str),
    'image_path': [f'../Data/train_spectrograms/{_id}.parquet' for _id in first_rows['spectrogram_id']],
    'expert_consensus': first_rows['expert_consensus'],
    'patient_id': first_rows['patient_id'].astype(str),
    # Class index into cls_name; an unknown label raises ValueError.
    # Stored as float so the CSV text matches a column that was filled row by row.
    'label': first_rows['expert_consensus'].map(cls_name.index).astype(float),
    # Plain Python floats, so the string form does not depend on numpy's scalar repr.
    'soft_label': [str(row.tolist()) for row in soft_labels],
})

new_df.to_csv('Data/train_spectrogram.csv',index=False)
new_df

In [None]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold

df = pd.read_csv('Data/train_spectrogram.csv')

# Split into 5 folds grouped by patient_id and record, for every row, the index
# of the fold in which it is part of the held-out set.
splitter = GroupKFold(n_splits=5)
patient_ids = df['patient_id']
for fold_no, (_, holdout_idx) in enumerate(splitter.split(patient_ids, patient_ids, patient_ids)):
    df.loc[holdout_idx, 'fold'] = fold_no

# Overwrite the CSV with the added 'fold' column and show the result.
df.to_csv('Data/train_spectrogram.csv', index=False)
df

# Make Train_eeg csv

In [None]:
from tqdm.auto import tqdm
import pandas as pd

df= pd.read_csv('Data/train.csv')
spec_df= pd.read_csv('Data/train_spectrogram.csv')

cls_name= ['Seizure','LPD','GPD','LRDA','GRDA','Other']

# One output row per EEG: keep the first train.csv row of every eeg_id,
# in order of first appearance.
first_rows= df.drop_duplicates(subset='eeg_id', keep='first').reset_index(drop=True)

# NOTE(review): the vote columns are selected by position (columns[-8:-2]);
# confirm this slice matches the six per-class vote columns of the train.csv in use.
votes= first_rows[df.columns[-8:-2]].to_numpy(dtype=float)
voter= votes.sum(axis=1)                      # total number of votes per row
soft_labels= votes / voter[:, None]           # votes normalized to sum to 1

# Each EEG takes the fold of its spectrogram (first matching row of spec_df).
fold_by_spec= (spec_df.drop_duplicates(subset='spectrogram_id', keep='first')
                      .set_index('spectrogram_id')['fold'])
unknown= first_rows.loc[~first_rows['spectrogram_id'].isin(fold_by_spec.index), 'spectrogram_id']
if len(unknown):
    raise KeyError(f'spectrogram_id not found in Data/train_spectrogram.csv: {unknown.tolist()[:5]}')

# Build the whole table at once instead of growing it one cell at a time.
new_df= pd.DataFrame({
    'eeg_id': first_rows['eeg_id'].astype(str),
    'spectrogram_id': first_rows['spectrogram_id'].astype(str),
    'image_path': [f'../Data/train_eegs/{_id}.parquet' for _id in first_rows['eeg_id']],
    'expert_consensus': first_rows['expert_consensus'],
    'patient_id': first_rows['patient_id'].astype(str),
    # Class index into cls_name; an unknown label raises ValueError.
    # Stored as float so the CSV text matches a column that was filled row by row.
    'label': first_rows['expert_consensus'].map(cls_name.index).astype(float),
    # Plain Python floats, so the string form does not depend on numpy's scalar repr.
    'soft_label': [str(row.tolist()) for row in soft_labels],
    'voter': voter,
    'fold': first_rows['spectrogram_id'].map(fold_by_spec).astype(str),
})

new_df.to_csv('Data/train_eeg.csv',index=False)
new_df

# Parquet2NPY

In [None]:
import numpy as np
import pandas as pd
from preprocessing import spectrogram_from_eeg
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

def read_data(data):
    """Build one fused image from a row of the EEG table.

    The row's precomputed spectrogram parquet is read (NaN -> 0), the 'LL',
    'LP', 'RP' and 'RL' column groups are log1p-transformed, stacked
    vertically and standardized. Two spectrograms computed from the raw EEG
    file by `spectrogram_from_eeg` (duration 10 and 30) are stacked the same
    way and placed side by side. The EEG block and the precomputed block are
    then joined along the width, and the single channel is repeated 3 times.

    Args:
        data: mapping/Series providing 'image_path', 'eeg_id' and
            'spectrogram_id'.

    Returns:
        numpy array whose last axis has length 3 (three identical channels).
    """
    def norm_to_standard(img):
        # NaN-ignoring standardization; leftover NaNs become 0.
        ep = 1e-6
        m = np.nanmean(img.flatten())
        s = np.nanstd(img.flatten())
        img = (img-m)/(s+ep)
        img = np.nan_to_num(img, nan=0.0)
        return img

    ## train_spectrograms
    # Derive the spectrogram file path from the EEG path: swap the directory,
    # turn the leading '..' into '.', and swap the eeg id for the spectrogram id.
    path= data['image_path'].replace('train_eegs','train_spectrograms').replace('..','.')
    path= path.replace(str(data['eeg_id']), str(data['spectrogram_id']))
    raw= pd.read_parquet(path).fillna(0)

    # Stack the four column groups vertically in the order LL, LP, RP, RL.
    regions= [np.log1p(raw.filter(like=tag, axis=1).T.values)
              for tag in ('LL', 'LP', 'RP', 'RL')]
    # Mean/std are taken on the single-channel image; repeating channels
    # would not change either statistic.
    img_spectrograms= norm_to_standard(np.concatenate(regions, axis=0))[..., np.newaxis]

    ## train_eegs
    eeg_path= data['image_path'].replace('..','.')
    panels= []
    for duration in (10, 30):
        # NOTE(review): presumably returns an array with at least 4 planes on
        # the last axis; only planes 0..3 are used - confirm in preprocessing.
        spec= spectrogram_from_eeg(eeg_path, duration=duration, height=100)
        panels.append(np.concatenate([spec[..., k] for k in range(4)], axis=0))
    img_eeg= np.concatenate(panels, axis=1)[..., np.newaxis]

    ## fuse img
    # Side-by-side join; both blocks must therefore have the same height.
    img= np.concatenate([img_eeg, img_spectrograms], axis=1)
    img= np.concatenate([img, img, img], axis=2)

    return img

    
import os

# True : render the fused image of the first row and stop (nothing is written).
# False: convert every row to Data/train_npy/<i>.npy and write Data/train_npy.csv.
PREVIEW_ONLY = True

df= pd.read_csv('Data/train_eeg.csv')
if not PREVIEW_ONLY:
    # np.save does not create missing directories.
    os.makedirs('Data/train_npy', exist_ok=True)

for i in tqdm(range(len(df))):
    data= df.loc[i]
    img= read_data(data)
    if i==0: print(img.shape)

    if PREVIEW_ONLY:
        plt.imshow(img)
        plt.show()
        break

    np.save(f'Data/train_npy/{i}.npy', img)
    df.loc[i,'npy_path']= f'../Data/train_npy/{i}.npy'

if not PREVIEW_ONLY:
    df.to_csv('Data/train_npy.csv',index=False)
df