In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook, so the secret never ends up in saved or
# shared copies of this file. When the variable is unset, `token=None` makes
# `login` fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import os

# ----------------------
# CONFIG
# Constants used by the rest of this cell: which Hub dataset to load, where
# the parquet samples are written, how many rows each sample holds, and the
# seed for the shuffle.
# ----------------------
# Dataset repository id passed to `load_dataset`.
HF_REPO = "sreesharvesh/transactiq-enriched"
# Output directory for the parquet samples written below.
SAVE_DIR = "/content/artifacts"   # For Colab
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs(SAVE_DIR, exist_ok=True)

# Rows taken from the head of the shuffled frame.
VAL_ROWS = 3000      # number of rows for validation bias report
# Rows taken from the tail of the shuffled frame.
TEST_ROWS = 1500     # number of rows for streaming
# Seed passed to `DataFrame.sample` so the shuffle is repeatable.
RANDOM_SEED = 42

# ----------------------
# 1. Load dataset
# Downloads every split of HF_REPO from the Hub and keeps the "train" split.
# ----------------------
print("Loading dataset...")
ds = load_dataset(HF_REPO)

# Fail with an actionable message when the repository has no "train" split,
# instead of a bare KeyError from the lookup below.
if "train" not in ds:
    raise KeyError(
        f"Dataset {HF_REPO!r} has no 'train' split; "
        f"available splits: {list(ds)}"
    )
train = ds["train"]
print(f"Dataset size: {len(train):,}")

# NOTE: to_pandas() materialises the whole split in memory; it is not a
# streaming conversion, so the runtime needs enough RAM for the full table.
print("Converting to pandas DataFrame...")
df = train.to_pandas()

# ----------------------
# 2. Shuffle once
# A seeded full-frame permutation, so the head/tail slices taken later are
# random but repeatable samples.
# ----------------------
print("Shuffling...")
# frac=1 draws every row exactly once, i.e. a permutation of the frame.
df = df.sample(frac=1, random_state=RANDOM_SEED)
# Discard the permuted original labels in favour of a fresh 0..n-1 index.
df = df.reset_index(drop=True)

# ----------------------
# 3. Clean column names
# Required for backend code:
#   transaction_description
#   amount
#   date
#   merchant
#   country
#   category
# Missing amount / date / merchant / country columns are filled with
# synthetic values below. No fallback exists for `category`; it has to come
# from the dataset itself.
# ----------------------

print("Normalizing columns...")

# Source column name -> name expected downstream.
mapping = {
    "description": "transaction_description",
    "merchant_name": "merchant",
}

# Rename only when the source exists and the target is not already present:
# renaming "merchant_name" while "merchant" also exists would produce two
# columns with the same label, which breaks column access and to_parquet.
renames = {
    old: new
    for old, new in mapping.items()
    if old in df.columns and new not in df.columns
}
df = df.rename(columns=renames)

# The description is the one field that cannot be synthesised.
if "transaction_description" not in df.columns:
    raise KeyError(
        "Dataset has neither a 'transaction_description' nor a 'description' "
        f"column; found: {df.columns.tolist()}"
    )

# Final mandatory field: force text, but keep missing values as empty strings
# rather than the literal text "nan" that a plain astype(str) would produce.
# Done before the merchant fallback so the .str accessor always sees strings.
missing_description = df["transaction_description"].isna()
df["transaction_description"] = (
    df["transaction_description"].astype(str).mask(missing_description, "")
)

# Seeded generator so synthetic fallback values are reproducible, matching
# the seeded shuffle.
rng = np.random.default_rng(RANDOM_SEED)

# If some columns missing, create them
if "amount" not in df.columns:
    df["amount"] = rng.uniform(50, 5000, size=len(df)).round(2)

if "date" not in df.columns:
    # One consecutive day per row overflows the nanosecond Timestamp range
    # (max year 2262) after roughly 87k rows, so cycle through the 366 days
    # of 2024 instead. Frames of up to 366 rows get the same dates as before.
    day_offsets = pd.to_timedelta(np.arange(len(df)) % 366, unit="D")
    df["date"] = (pd.Timestamp("2024-01-01") + day_offsets).strftime("%Y-%m-%d")

if "merchant" not in df.columns:
    # Use the first whitespace-separated token of the description as a
    # stand-in merchant (NaN when the description is empty).
    df["merchant"] = df["transaction_description"].str.split().str[0]

if "country" not in df.columns:
    possible = ["IN", "US", "UK", "SG", "AE"]
    df["country"] = rng.choice(possible, size=len(df))

print("Final columns:", df.columns.tolist())

# ----------------------
# 4. Extract validation slice
# First VAL_ROWS rows of the shuffled frame.
# ----------------------
val_df = df.head(VAL_ROWS).copy()
val_path = os.path.join(SAVE_DIR, "val_sample.parquet")
val_df.to_parquet(val_path, index=False)
print(f"Saved validation sample → {val_path}")

# ----------------------
# 5. Extract test slice
# Last TEST_ROWS rows of the shuffled frame, never overlapping validation.
# ----------------------
# A plain tail(TEST_ROWS) reaches back into the validation rows when the
# frame has fewer than VAL_ROWS + TEST_ROWS rows. Clamping the start to the
# end of the validation slice keeps the two samples disjoint; for larger
# frames this selects exactly the same rows as tail(TEST_ROWS).
test_start = max(len(val_df), len(df) - TEST_ROWS)
test_df = df.iloc[test_start:].copy()
if len(test_df) < TEST_ROWS:
    print(
        f"WARNING: only {len(test_df):,} rows left for the test sample "
        f"(requested {TEST_ROWS:,}); dataset has {len(df):,} rows."
    )
test_path = os.path.join(SAVE_DIR, "test_sample.parquet")
test_df.to_parquet(test_path, index=False)
print(f"Saved test sample → {test_path}")

print("DONE ✔")


Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/756 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/93.7M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/93.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4501043 [00:00<?, ? examples/s]

Dataset size: 4,501,043
Converting to pandas DataFrame...
Shuffling...
Normalizing columns...
Final columns: ['transaction_description', 'category', 'country', 'currency', 'amount', 'date', 'time', 'log_amount', 'year', 'month', 'day_of_week', 'is_weekend', 'merchant']
Saved validation sample → /content/artifacts/val_sample.parquet
Saved test sample → /content/artifacts/test_sample.parquet
DONE ✔
