In [1]:
# Notebook 3: Split & Save JSON
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from datetime import datetime
import os

In [2]:
# Load the processed frames from pickle.
# NOTE(review): unpickling can execute arbitrary code - only load files this
# pipeline wrote itself.
train_df = pd.read_pickle('train_processed.pkl')
test_df = pd.read_pickle('test_processed.pkl')

# Create train/val split, stratified on the (domain, sql_complexity) pair.
# The key is kept as a standalone Series so that no helper column has to be
# added to train_df and later removed from the split outputs.
strat_key = train_df['domain'].astype(str) + '_' + train_df['sql_complexity'].astype(str)

# A stratified split needs at least two rows per stratum; rows that belong to
# a singleton stratum are excluded. Report the count so the loss is visible
# instead of silent.
key_counts = strat_key.value_counts()
splittable = strat_key.isin(key_counts[key_counts >= 2].index)
n_dropped = int((~splittable).sum())
if n_dropped:
    print(f"Dropped {n_dropped} train rows whose domain/sql_complexity combination occurs only once")

train_df = train_df[splittable]
strat_key = strat_key[splittable]

# 90/10 split; random_state is fixed so the split is reproducible.
train_final, val_final = train_test_split(
    train_df,
    test_size=0.1,
    stratify=strat_key,
    random_state=42
)

print(f"Splits - Train: {len(train_final)}, Val: {len(val_final)}, Test: {len(test_df)}")

Splits - Train: 89991, Val: 10000, Test: 5851


In [3]:
# Write every split to its own JSON file; all files of one run share a
# single timestamp suffix.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
os.makedirs('data/processed', exist_ok=True)

# Convert each frame to a list of row dicts (one dict per sample).
datasets = {
    split_name: frame.to_dict('records')
    for split_name, frame in (
        ('train', train_final),
        ('val', val_final),
        ('test', test_df),
    )
}

for split_name, records in datasets.items():
    out_path = f'data/processed/{split_name}_{timestamp}.json'
    with open(out_path, 'w') as handle:
        json.dump(records, handle, indent=2)
        print(f"Saved {split_name}: {len(records)} samples")

# Final summary of the run
banner = '=' * 50
print(f"\n{banner}")
print("DATA PIPELINE COMPLETE")
print(banner)
print(f"Timestamp: {timestamp}")
for split_name, records in datasets.items():
    # Keys are 'train' / 'val' / 'test'; capitalize() yields the report labels.
    print(f"{split_name.capitalize()}: {len(records):,} samples")
print(f"JSON files saved with timestamp: {timestamp}")

Saved train: 89991 samples
Saved val: 10000 samples
Saved test: 5851 samples

DATA PIPELINE COMPLETE
Timestamp: 20251024_171130
Train: 89,991 samples
Val: 10,000 samples
Test: 5,851 samples
JSON files saved with timestamp: 20251024_171130
