### Imports

In [21]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Constants

In [22]:
# Output root: one sub-directory per (feature set, split) is created under here.
base_dir = Path("/Users/tusharsingh/Work/Project/tcga-mldl/results/data") / "data_splits"

# Inputs sit next to the output root, i.e. directly inside results/data.
target = base_dir.parent / "target.csv"
X_5pct_file = base_dir.parent / "X_5pct.csv"
X_15pct_file = base_dir.parent / "X_15pct.csv"
X_25pct_file = base_dir.parent / "X_25pct.csv"

### Load Data

In [23]:
# Read the three feature matrices the same way: first CSV column becomes the row index.
X_5pct, X_15pct, X_25pct = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (X_5pct_file, X_15pct_file, X_25pct_file)
)

# The target table is read the same way; only its 'severity' column is used as the label.
y = pd.read_csv(target, index_col=0)
y_labels = y["severity"]

In [24]:
# Feature matrices to split, keyed by the tag used in the output directory name.
anova_sets = dict(
    zip(
        ("5pct", "15pct", "25pct"),
        (X_5pct, X_15pct, X_25pct),
    )
)

# Split configurations: the key is the tag used in the output directory name,
# the value is the fraction handed to train_test_split as test_size.
splits = dict([("70", 0.3), ("80", 0.2)])

### Split into train & test, standardize, and save

In [25]:
# For every (feature set, split ratio) pair: stratified train/test split,
# standardization fitted on the training rows, and CSV export into
# base_dir / "<set_name>_<split_name>".
for set_name, X in anova_sets.items():
    # train_test_split pairs X and y by row position, not by index label.
    # Bring the labels into the sample order of this feature matrix so a
    # differently ordered target file cannot silently mislabel samples.
    if X.index.equals(y_labels.index):
        y_aligned = y_labels
    else:
        unlabeled = X.index.difference(y_labels.index)
        if len(unlabeled) > 0:
            raise ValueError(
                f"{set_name}: {len(unlabeled)} sample(s) in the feature matrix "
                f"have no entry in the target, e.g. {list(unlabeled[:5])}"
            )
        y_aligned = y_labels.reindex(X.index)

    for split_name, test_size in splits.items():
        # Train-test split (stratified on the label, fixed seed for reproducibility)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_aligned, test_size=test_size, stratify=y_aligned, random_state=42
        )

        # Standardize: the scaler is fitted on the training rows only and then
        # applied to both parts, so test rows do not influence the scaling.
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

        # Output directory, e.g. base_dir / "5pct_70"
        out_dir = base_dir / f"{set_name}_{split_name}"
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save to CSV (row index is written so samples stay identifiable)
        X_train_scaled.to_csv(out_dir / "X_train.csv")
        X_test_scaled.to_csv(out_dir / "X_test.csv")
        y_train.to_csv(out_dir / "y_train.csv", index=True)
        y_test.to_csv(out_dir / "y_test.csv", index=True)