In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import GroupKFold
from IPython.display import FileLink

# --- CONFIGURATION ---
# Competition data lives under the read-only Kaggle input mount.
INPUT_DIR = '/kaggle/input/hms-harmful-brain-activity-classification'
SPECTROGRAMS_DIR = os.path.join(INPUT_DIR, 'train_spectrograms')
# Scratch area inside the writable working directory; one sub-folder per split.
OUTPUT_DIR = '/kaggle/working/split_data'

# Make sure every per-split staging folder exists before any file is copied.
for split in ('train', 'val', 'test'):
    split_dir = os.path.join(OUTPUT_DIR, split)
    os.makedirs(split_dir, exist_ok=True)

print("Step 1: Reading Data and Calculating Splits...")
# Read the CSV that contains the expert votes
df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))

# --- THE SAFETY CHECK (GroupKFold) ---
# Grouping on patient_id places all rows of one patient in exactly one fold, so a
# patient can never end up in more than one of the splits derived from the folds.
gkf = GroupKFold(n_splits=10)
df['fold'] = -1  # sentinel: row not yet assigned to any fold
fold_col = df.columns.get_loc('fold')
for fold_id, (train_idx, val_idx) in enumerate(
    gkf.split(df, y=df['expert_consensus'], groups=df['patient_id'])
):
    # split() yields *positional* indices, so write with .iloc rather than the
    # label-based .loc; this stays correct even if df's index is not 0..n-1.
    df.iloc[val_idx, fold_col] = fold_id

# Verify the guarantee instead of only trusting it.
assert (df['fold'] >= 0).all(), "some rows were not assigned to a fold"
assert df.groupby('patient_id')['fold'].nunique().eq(1).all(), \
    "a patient_id was spread across more than one fold"

# Assign splits: 80% Train, 10% Val, 10% Test
def assign_split(fold):
    """Translate a fold number into the name of the split it belongs to.

    Fold 0 is held out as 'test', fold 1 as 'val'; every other value
    falls through to 'train'.
    """
    if fold == 0:
        split_name = 'test'
    elif fold == 1:
        split_name = 'val'
    else:
        split_name = 'train'
    return split_name

# Label every row with the split that its fold maps to.
df['split'] = df['fold'].apply(assign_split)

# Save the CSVs (You need these for the labels!)
# One file per split, written to the current working directory.
for split_name in ('train', 'val', 'test'):
    rows_in_split = df[df['split'] == split_name]
    rows_in_split.to_csv(f'{split_name}.csv', index=False)

print("Splits created. Distribution:")
split_counts = df['split'].value_counts()
print(split_counts)

# --- COPY AND ZIP ---
print("\nStep 2: Organizing 11,000+ files (This takes about 2-3 mins)...")

# Get unique spectrogram IDs to avoid copying duplicates
unique_specs = df[['spectrogram_id', 'split']].drop_duplicates()

# Spectrogram IDs referenced by the CSV that have no parquet file on disk.
missing_specs = []

# itertuples avoids building a Series per row, which iterrows does.
for spec_id, split in unique_specs.itertuples(index=False, name=None):
    # We copy from Input (ReadOnly) to Output (Writeable)
    src = os.path.join(SPECTROGRAMS_DIR, f"{spec_id}.parquet")
    dst = os.path.join(OUTPUT_DIR, split, f"{spec_id}.parquet")

    if os.path.exists(src):
        shutil.copy(src, dst)
    else:
        # Keep going (best effort), but remember the gap so it can be reported.
        missing_specs.append(spec_id)

# Report skipped files so an incomplete archive does not go unnoticed.
if missing_specs:
    print(
        f"WARNING: {len(missing_specs)} spectrogram file(s) were not found in "
        f"{SPECTROGRAMS_DIR} and were skipped (first few: {missing_specs[:5]})"
    )

print("Files organized. Zipping them up (This takes about 2-3 mins)...")

split_names = ('train', 'val', 'test')

# Create the Zips: one '<split>_spectrograms.zip' per staging folder.
for split_name in split_names:
    staged_dir = os.path.join(OUTPUT_DIR, split_name)
    shutil.make_archive(f'{split_name}_spectrograms', 'zip', staged_dir)

# Cleanup temp folder to save space (the archives live outside of it)
shutil.rmtree(OUTPUT_DIR)

print("\nDONE! Click the links below to download:")
# Archives first, then the label CSVs, each in train/val/test order.
download_names = [f'{name}_spectrograms.zip' for name in split_names]
download_names += [f'{name}.csv' for name in split_names]
for download_name in download_names:
    display(FileLink(download_name))

Step 1: Reading Data and Calculating Splits...
Splits created. Distribution:
split
train    85440
test     10680
val      10680
Name: count, dtype: int64

Step 2: Organizing 11,000+ files (This takes about 2-3 mins)...
Files organized. Zipping them up (This takes about 2-3 mins)...

DONE! Click the links below to download:
