In [1]:
import pyarrow.parquet as pq
import os
from tqdm import tqdm

import pandas as pd
tqdm.pandas()

### Loading all data

#### Load Y

In [2]:
# Read the label metadata for the train and test splits
train_df, test_df = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [3]:
# Preview the first rows of the training metadata
train_df.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [4]:
# Show the column summary, then count missing cells across the whole frame
train_df.info()
print(f"NaNs in train metadata: {train_df.isna().to_numpy().sum()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106800 entries, 0 to 106799
Data columns (total 15 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   eeg_id                            106800 non-null  int64  
 1   eeg_sub_id                        106800 non-null  int64  
 2   eeg_label_offset_seconds          106800 non-null  float64
 3   spectrogram_id                    106800 non-null  int64  
 4   spectrogram_sub_id                106800 non-null  int64  
 5   spectrogram_label_offset_seconds  106800 non-null  float64
 6   label_id                          106800 non-null  int64  
 7   patient_id                        106800 non-null  int64  
 8   expert_consensus                  106800 non-null  object 
 9   seizure_vote                      106800 non-null  int64  
 10  lpd_vote                          106800 non-null  int64  
 11  gpd_vote                          106800 non-null  i

In [5]:
# Preview the first rows of the test metadata
test_df.head()

Unnamed: 0,spectrogram_id,eeg_id,patient_id
0,853520,3911565283,6885


In [6]:
# Column dtypes and non-null counts of the test metadata
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   spectrogram_id  1 non-null      int64
 1   eeg_id          1 non-null      int64
 2   patient_id      1 non-null      int64
dtypes: int64(3)
memory usage: 152.0 bytes


OK, so there are no missing values in the train and test dataframes. The test dataframe contains only a single placeholder row, since it is dummy data. Now let's load all the EEG data and check its shape.

#### Load X (EEGs)

In [7]:
# Load all the train_eegs
def load_eegs(eeg_dir='../data/train_eegs'):
    """Read every EEG parquet file in a directory into its own DataFrame.

    Args:
        eeg_dir: Directory holding the per-recording ``.parquet`` files.
            Defaults to the training EEG folder used by this notebook.

    Returns:
        A tuple ``(eegs, eeg_files)``: the list of loaded DataFrames and the
        list of file names they were read from, in matching order.
    """
    # os.listdir returns entries in arbitrary order and also lists anything
    # else living in the folder (hidden files, checkpoint directories), so
    # keep only parquet files and sort them to make the load order reproducible.
    eeg_files = sorted(
        name for name in os.listdir(eeg_dir) if name.endswith('.parquet')
    )

    # Load each file into a separate DataFrame, with a progress bar
    eegs = [
        pq.read_table(os.path.join(eeg_dir, eeg_file)).to_pandas()
        for eeg_file in tqdm(eeg_files)
    ]
    return eegs, eeg_files

In [8]:
def save_eegs(eegs, eeg_files, out_path='../data/train_eegs.parquet'):
    """Tag each EEG frame with its source file, combine them and cache to parquet.

    Args:
        eegs: List of DataFrames, one per EEG file. Each frame is modified in
            place: a ``file`` column is added to it.
        eeg_files: File names matching ``eegs`` one-to-one; the text before
            the first ``.`` becomes the value of the ``file`` column.
        out_path: Destination of the combined parquet file.

    Returns:
        The concatenated DataFrame. The row index of each input frame is kept
        as-is, so index values repeat across files.

    Raises:
        ValueError: If ``eegs`` and ``eeg_files`` differ in length.
    """
    # zip() would silently stop at the shorter list, leaving some frames
    # without a 'file' value in the cached output, so fail loudly instead.
    if len(eegs) != len(eeg_files):
        raise ValueError(
            f"Got {len(eegs)} EEG frames but {len(eeg_files)} file names"
        )

    files = [x.split('.')[0] for x in eeg_files]

    # For each eeg, add its file stem as a column
    for eeg, curr_file in zip(eegs, files):
        eeg['file'] = curr_file

    # Concatenate all eegs into one dataframe
    eegs = pd.concat(eegs)

    # Save the eegs to a single parquet file
    eegs.to_parquet(out_path)
    return eegs

In [9]:
# Build the combined EEG parquet file on first run; afterwards just read it back
if not os.path.exists('../data/train_eegs.parquet'):
    eegs, eeg_files = load_eegs()
    eegs = save_eegs(eegs, eeg_files)
else:
    eegs = pd.read_parquet('../data/train_eegs.parquet')

In [10]:
# Preview the first rows of the combined EEG frame
eegs.head()

Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,...,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG,file
0,-105.849998,-89.230003,-79.459999,-49.23,-99.730003,-87.769997,-53.330002,-50.740002,-32.25,-42.099998,...,-88.730003,-74.410004,-92.459999,-58.93,-75.739998,-59.470001,8.21,66.489998,1404.930054,1000913311
1,-85.470001,-75.07,-60.259998,-38.919998,-73.080002,-87.510002,-39.68,-35.630001,-76.839996,-62.740002,...,-68.629997,-61.689999,-69.32,-35.790001,-58.900002,-41.66,196.190002,230.669998,3402.669922,1000913311
2,8.84,34.849998,56.43,67.970001,48.099998,25.35,80.25,48.060001,6.72,37.880001,...,16.58,55.060001,45.02,70.529999,47.82,72.029999,-67.18,-171.309998,-3565.800049,1000913311
3,-56.32,-37.279999,-28.1,-2.82,-43.43,-35.049999,3.91,-12.66,8.65,3.83,...,-51.900002,-21.889999,-41.330002,-11.58,-27.040001,-11.73,-91.0,-81.190002,-1280.930054,1000913311
4,-110.139999,-104.519997,-96.879997,-70.25,-111.660004,-114.43,-71.830002,-61.919998,-76.150002,-79.779999,...,-99.029999,-93.610001,-104.410004,-70.07,-89.25,-77.260002,155.729996,264.850006,4325.370117,1000913311


In [11]:
# Column dtypes and memory footprint of the combined EEG frame
eegs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 280905200 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column  Dtype  
---  ------  -----  
 0   Fp1     float32
 1   F3      float32
 2   C3      float32
 3   P3      float32
 4   F7      float32
 5   T3      float32
 6   T5      float32
 7   O1      float32
 8   Fz      float32
 9   Cz      float32
 10  Pz      float32
 11  Fp2     float32
 12  F4      float32
 13  C4      float32
 14  P4      float32
 15  F8      float32
 16  T4      float32
 17  T6      float32
 18  O2      float32
 19  EKG     float32
 20  file    object 
dtypes: float32(20), object(1)
memory usage: 25.1+ GB


#### Load X (Spectrograms)

In [12]:
# Load all the train_spectrograms
def load_spectrograms(spectrogram_dir='../data/train_spectrograms'):
    """Read every spectrogram parquet file in a directory into its own DataFrame.

    Args:
        spectrogram_dir: Directory holding the per-spectrogram ``.parquet``
            files. Defaults to the training spectrogram folder used by this
            notebook.

    Returns:
        A tuple ``(spectrograms, spectrogram_files)``: the list of loaded
        DataFrames and the list of file names they were read from, in
        matching order.
    """
    # os.listdir returns entries in arbitrary order and also lists anything
    # else living in the folder (hidden files, checkpoint directories), so
    # keep only parquet files and sort them to make the load order reproducible.
    spectrogram_files = sorted(
        name for name in os.listdir(spectrogram_dir) if name.endswith('.parquet')
    )

    # Load each file into a separate DataFrame, with a progress bar
    spectrograms = [
        pq.read_table(os.path.join(spectrogram_dir, spectrogram_file)).to_pandas()
        for spectrogram_file in tqdm(spectrogram_files)
    ]
    return spectrograms, spectrogram_files

In [13]:
def save_spectrograms(spectrograms, spectrogram_files,
                      out_path='../data/train_spectrograms.parquet'):
    """Tag each spectrogram frame with its source file, combine and cache to parquet.

    Args:
        spectrograms: List of DataFrames, one per spectrogram file. Each frame
            is modified in place: a ``file`` column is added to it.
        spectrogram_files: File names matching ``spectrograms`` one-to-one;
            the text before the first ``.`` becomes the value of the ``file``
            column.
        out_path: Destination of the combined parquet file.

    Returns:
        The concatenated DataFrame. The row index of each input frame is kept
        as-is, so index values repeat across files.

    Raises:
        ValueError: If ``spectrograms`` and ``spectrogram_files`` differ in
            length.
    """
    # zip() would silently stop at the shorter list, leaving some frames
    # without a 'file' value in the cached output, so fail loudly instead.
    if len(spectrograms) != len(spectrogram_files):
        raise ValueError(
            f"Got {len(spectrograms)} spectrogram frames but "
            f"{len(spectrogram_files)} file names"
        )

    files = [x.split('.')[0] for x in spectrogram_files]

    # For each spectrogram, add its file stem as a column
    for spectrogram, curr_file in zip(spectrograms, files):
        spectrogram['file'] = curr_file

    # Concatenate all spectrograms into one dataframe
    spectrograms = pd.concat(spectrograms)

    # Save the spectrograms to a single parquet file
    spectrograms.to_parquet(out_path)
    return spectrograms

In [14]:
# Build the combined spectrogram parquet file on first run; afterwards just read it back
if not os.path.exists('../data/train_spectrograms.parquet'):
    spectrograms, spectrogram_files = load_spectrograms()
    spectrograms = save_spectrograms(spectrograms, spectrogram_files)
else:
    spectrograms = pd.read_parquet('../data/train_spectrograms.parquet')

In [15]:
# Column dtypes and memory footprint of the combined spectrogram frame
spectrograms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4279506 entries, 0 to 315
Columns: 402 entries, time to file
dtypes: float32(400), int64(1), object(1)
memory usage: 6.5+ GB
