✅ Dataset Splits & JSONL Export

(Reproducible, category-stratified, experiment-ready)

We will:

- keep category proportions stable across all splits
- use a standard 70 / 15 / 15 (train / validation / test) split
- fix a random seed for reproducibility
- generate CSV and JSONL outputs

In [1]:
# 📘 Step 0 — Mount Google Drive & Import Libraries

# Colab-only: attach Google Drive under /content/drive. This import fails
# outside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Libraries used by the rest of the notebook (paths, data frames, JSON export,
# checksums, stratified splitting).
from pathlib import Path
import pandas as pd
import numpy as np
import json
import hashlib
from sklearn.model_selection import train_test_split

# Reached only after drive.mount() has returned without raising.
print("Drive mounted successfully.")


Mounted at /content/drive
Drive mounted successfully.


In [2]:
# 📘 Step 1 — Set Paths & Load CLEAN Benchmark Dataset

# Project layout on Drive: the benchmark CSV lives in data/, and every split
# artifact produced by this notebook goes to data/splits/.
BASE_DIR = Path("/content/drive/MyDrive/FinGuardSDG")
DATA_DIR = BASE_DIR.joinpath("data")
SPLIT_DIR = DATA_DIR.joinpath("splits")
DATA_PATH = DATA_DIR.joinpath("FinGuard_SDG_Benchmark_v1.0_clean.csv")

# Ensure required folders exist (a no-op for folders that are already there).
for folder in (DATA_DIR, SPLIT_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print("Loading dataset from:", DATA_PATH)
df = pd.read_csv(DATA_PATH)

# Quick sanity report: row count and column names of the loaded frame.
print("Loaded rows:", df.shape[0])
print("Columns:", list(df.columns))


Loading dataset from: /content/drive/MyDrive/FinGuardSDG/data/FinGuard_SDG_Benchmark_v1.0_clean.csv
Loaded rows: 1160
Columns: ['id', 'category', 'subcategory', 'question_text', 'answer_text', 'difficulty', 'source']


In [3]:
# 📘 Step 2 — Reproducibility Seed

# Single seed for the whole notebook. It is also passed explicitly as
# random_state to both train_test_split calls in Step 3.
RANDOM_SEED = 42
# Seeds NumPy's global (legacy) random state as well.
np.random.seed(RANDOM_SEED)

print("Reproducibility seed set to:", RANDOM_SEED)


Reproducibility seed set to: 42


In [4]:
# 📘 Step 3A — First split: Train (70%) + Temp (30%)
# Stratifying on "category" keeps the category mix of the full dataset in
# both resulting parts.
train_df, temp_df = train_test_split(
    df,
    stratify=df["category"],
    test_size=0.30,
    random_state=RANDOM_SEED,
)

# 📘 Step 3B — Second split: Temp → Val (15%) + Test (15%)
# Halving the 30% hold-out gives 0.50 × 30% = 15% of the full dataset each.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["category"],
    test_size=0.50,
    random_state=RANDOM_SEED,
)

# Report how many rows landed in each split.
for size_label, split_frame in (
    ("Train size:", train_df),
    ("Val size:  ", val_df),
    ("Test size: ", test_df),
):
    print(size_label, len(split_frame))


Train size: 812
Val size:   174
Test size:  174


In [5]:
# 📘 Step 4 — Validate Stratification

# Print the normalized category shares of every split; the three tables can
# then be compared by eye.
for heading, split_frame in (
    ("\nTrain distribution:", train_df),
    ("\nVal distribution:", val_df),
    ("\nTest distribution:", test_df),
):
    print(heading)
    print(split_frame["category"].value_counts(normalize=True))



Train distribution:
category
quantitative    0.379310
advisory        0.224138
conceptual      0.206897
esg             0.189655
Name: proportion, dtype: float64

Val distribution:
category
quantitative    0.379310
advisory        0.224138
conceptual      0.206897
esg             0.189655
Name: proportion, dtype: float64

Test distribution:
category
quantitative    0.379310
advisory        0.224138
conceptual      0.206897
esg             0.189655
Name: proportion, dtype: float64


In [6]:
# 📘 Step 5 — Save Split CSV Files

train_path = SPLIT_DIR / "FinGuard_SDG_train.csv"
val_path   = SPLIT_DIR / "FinGuard_SDG_val.csv"
test_path  = SPLIT_DIR / "FinGuard_SDG_test.csv"

# Pair each split with its destination so writing and reporting share one list.
csv_targets = (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
)

# index=False: the DataFrame index is not written as an extra column.
for split_frame, csv_path in csv_targets:
    split_frame.to_csv(csv_path, index=False)

print("Saved split files to:")
for _, csv_path in csv_targets:
    print("-", csv_path.resolve())


Saved split files to:
- /content/drive/MyDrive/FinGuardSDG/data/splits/FinGuard_SDG_train.csv
- /content/drive/MyDrive/FinGuardSDG/data/splits/FinGuard_SDG_val.csv
- /content/drive/MyDrive/FinGuardSDG/data/splits/FinGuard_SDG_test.csv


In [7]:
# 📘 Step 6 — SHA256 Hashes for Reproducibility

def sha256(path):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode inside a ``with`` block, so the handle
    is always closed, and it is hashed in fixed-size chunks so large files are
    never loaded into memory all at once.

    Args:
        path: Location of the file to hash (``str`` or path-like).

    Returns:
        The digest as a 64-character lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024  # read 1 MiB at a time
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

print("\nSHA256 Checksums:")
# One line per split CSV: label followed by the file's SHA-256 hex digest.
for checksum_label, hashed_path in (
    ("train.csv:", train_path),
    ("val.csv:  ", val_path),
    ("test.csv: ", test_path),
):
    print(checksum_label, sha256(hashed_path))



SHA256 Checksums:
train.csv: f153c387cc5fef78822807085b5c96ed514cf89670017423fa2f2e182bfa2cf6
val.csv:   a626ec2071b1b32edfa48e66ab7b61d0c4e0c7ee9cdc35e2d830eb78e090c990
test.csv:  91b4c926ca5d68e03b62ff2e4cb3ae47a6462a460b1eea8b334efcc45f2d9b4d


In [8]:
# 📘 Step 7 — JSONL Export (Optional)

def to_jsonl(df, path):
    """Write ``df`` to ``path`` as JSON Lines (one JSON object per row).

    Each row becomes a JSON object keyed by column name, followed by a
    newline. Two details make the output portable:

    * The file is always written as UTF-8. Non-ASCII characters are emitted
      verbatim (``ensure_ascii=False``), so relying on the platform's default
      encoding could corrupt them or raise ``UnicodeEncodeError``.
    * Missing values stored as float NaN are written as ``null``. The bare
      ``NaN`` token that ``json.dumps`` emits by default is not valid JSON and
      is rejected by strict parsers.

    Args:
        df: DataFrame to export.
        path: Destination file (``str`` or path-like); overwritten if present.
    """
    with open(path, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            record = {
                # NaN is the only float that differs from itself.
                key: None if isinstance(value, float) and value != value else value
                for key, value in row.to_dict().items()
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

train_jsonl = SPLIT_DIR / "FinGuard_SDG_train.jsonl"
val_jsonl   = SPLIT_DIR / "FinGuard_SDG_val.jsonl"
test_jsonl  = SPLIT_DIR / "FinGuard_SDG_test.jsonl"

# Pair each split with its JSONL destination so export and report share one list.
jsonl_targets = (
    (train_df, train_jsonl),
    (val_df, val_jsonl),
    (test_df, test_jsonl),
)

for split_frame, jsonl_path in jsonl_targets:
    to_jsonl(split_frame, jsonl_path)

print("\nJSONL files saved:")
for _, jsonl_path in jsonl_targets:
    print("-", jsonl_path.resolve())



JSONL files saved:
- /content/drive/MyDrive/FinGuardSDG/data/splits/FinGuard_SDG_train.jsonl
- /content/drive/MyDrive/FinGuardSDG/data/splits/FinGuard_SDG_val.jsonl
- /content/drive/MyDrive/FinGuardSDG/data/splits/FinGuard_SDG_test.jsonl


In [9]:
# 📘 Step 8 — Split Summary JSON

def _category_counts(frame):
    """Return the per-category row counts of ``frame`` as a plain dict."""
    return frame["category"].value_counts().to_dict()

# Seed, split sizes and absolute per-category counts for each split.
split_summary = {
    "seed": RANDOM_SEED,
    "train_size": len(train_df),
    "val_size": len(val_df),
    "test_size": len(test_df),
    "category_distribution_train": _category_counts(train_df),
    "category_distribution_val": _category_counts(val_df),
    "category_distribution_test": _category_counts(test_df),
}

summary_path = SPLIT_DIR / "FinGuard_SDG_split_summary.json"

with summary_path.open("w") as summary_file:
    json.dump(split_summary, summary_file, indent=2)

print("Saved split summary to:", summary_path.resolve())


Saved split summary to: /content/drive/MyDrive/FinGuardSDG/data/splits/FinGuard_SDG_split_summary.json
