In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without embedding a token in the
# notebook: a saved notebook (and its diffs) would otherwise leak the secret.
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise login() is called without arguments so it can prompt for one.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
else:
    login()

In [None]:
# ============================================================
# TransactIQ Hybrid Classifier - Memory Efficient GPU Version
# ============================================================
!pip install -q datasets sentence-transformers xgboost==2.0.3 scikit-learn huggingface_hub

import os
import json
import numpy as np
from tqdm.auto import tqdm

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from google.colab import drive
drive.mount('/content/drive')

from huggingface_hub import HfApi

# ---------------- CONFIG ----------------
# Hugging Face dataset repo id and the two columns read as model input/target.
HF_DATASET_ID = "sreesharvesh/transactiq-enriched"
TEXT_COL      = "transaction_description"
CAT_COL       = "category"

# Sentence-transformers checkpoint used to embed TEXT_COL.
ENCODER_NAME  = "sentence-transformers/all-MiniLM-L6-v2"

# Upper bound on the number of training rows. The train split is subsampled
# only when it has more rows than this; None disables the cap entirely.
# Lower the value if the embedding matrix does not fit in RAM.
TRAIN_MAX_SAMPLES = 4_000_000   # NOTE(review): an earlier hint here suggested 1.5M for high-RAM machines, which contradicts this larger value

# Batch sizes (rows per encoder call)
ENC_BATCH_SIZE_TRAIN = 4096     # used when building the train / early-stopping matrices
ENC_BATCH_SIZE_EVAL  = 4096     # chunk size of the chunked evaluation loop

# All inputs and outputs live under one Google Drive folder.
DRIVE_ROOT       = "/content/drive/MyDrive/transactiq"
CONFIG_PATH      = os.path.join(DRIVE_ROOT, "config", "categories.json")
ARTIFACTS_ROOT   = os.path.join(DRIVE_ROOT, "artifacts", "model_v1_mem_efficient")
# Side effect at config time: the artifacts directory is created if missing.
os.makedirs(ARTIFACTS_ROOT, exist_ok=True)

# Optional: push artifacts to HF Hub as a model repo.
# NOTE: this is enabled by default, so running the notebook end to end uploads
# the artifacts folder to HF_MODEL_REPO.
PUSH_TO_HF    = True
HF_MODEL_REPO = "sreesharvesh/transactiq-hybrid"
# ---------------------------------------


# ============================================================
# 1. Load dataset
# ============================================================
# Only the "train" split of the Hub dataset is pulled; the train/val/test
# partition is created locally further down.
print("Loading dataset:", HF_DATASET_ID)
ds = load_dataset(path=HF_DATASET_ID, split="train")
num_rows = len(ds)

print(ds)
print("Columns:", ds.column_names)
print("Total rows:", num_rows)

# ============================================================
# 2. Stratified train/val/test split using sklearn
# ============================================================
# The split is done on row indices (not on the rows themselves) so that only
# the label column has to be materialised in memory.
all_labels_str = np.asarray(ds[CAT_COL])
all_indices    = np.arange(num_rows)

# Stage 1: hold out 20% of the rows, keeping class proportions.
train_idx, temp_idx, y_train_str, y_temp_str = train_test_split(
    all_indices, all_labels_str,
    stratify=all_labels_str,
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out 20% in half -> 10% validation, 10% test.
val_idx, test_idx, y_val_str_all, y_test_str_all = train_test_split(
    temp_idx, y_temp_str,
    stratify=y_temp_str,
    test_size=0.5,
    random_state=42,
)

ds_train_full, ds_val_full, ds_test_full = (
    ds.select(idx.tolist()) for idx in (train_idx, val_idx, test_idx)
)

for _caption, _split in (
    ("Train size (full):", ds_train_full),
    ("Val size:", ds_val_full),
    ("Test size:", ds_test_full),
):
    print(_caption, len(_split))

# Cap the training set only when a cap is configured and actually exceeded.
if TRAIN_MAX_SAMPLES is None or len(ds_train_full) <= TRAIN_MAX_SAMPLES:
    ds_train = ds_train_full
else:
    ds_train = ds_train_full.shuffle(seed=42).select(range(TRAIN_MAX_SAMPLES))
    print(f"Subsampled train to {len(ds_train)} rows")

# Validation and test are kept whole; they are consumed chunk by chunk later.
ds_val, ds_test = ds_val_full, ds_test_full

# ============================================================
# 3. Label encoding
# ============================================================
# Class ids follow the sorted order of the category names found in the full
# dataset, so the mapping does not depend on how the rows were split.
categories = sorted(set(ds[CAT_COL]))
id2label = dict(enumerate(categories))
label2id = {label: idx for idx, label in id2label.items()}
num_classes = len(categories)
print("Label mapping:", label2id)

def encode_labels(ds_split):
    """Return the int32 class ids of ``ds_split[CAT_COL]``, in row order.

    Raises ``KeyError`` if the split contains a category missing from
    ``label2id``.
    """
    return np.fromiter(
        (label2id[category] for category in ds_split[CAT_COL]),
        dtype=np.int32,
    )

y_train, y_val, y_test = (
    encode_labels(split) for split in (ds_train, ds_val, ds_test)
)

# ============================================================
# 4. Encoder on GPU + embedding helper (batched)
# ============================================================
# The sentence encoder is created directly on the CUDA device; every later
# encode() call in this notebook also requests device="cuda".
print("Loading encoder on GPU:", ENCODER_NAME)
encoder = SentenceTransformer(ENCODER_NAME, device="cuda")

def compute_embeddings_for_split(ds_split, text_col, batch_size):
    """
    Encode one text column of a split into a single float32 embedding matrix.

    The split is walked in consecutive windows of ``batch_size`` rows and each
    window is embedded with the module-level ``encoder``. Despite the
    "(TRAIN)" tag in the progress-bar label, this notebook also uses the helper
    for the early-stopping validation subset.

    Args:
        ds_split: Split to encode. It must support ``len()`` and row slicing
            that returns a column mapping (``ds_split[a:b][text_col]``), as a
            Hugging Face ``Dataset`` does.
        text_col: Name of the column that holds the texts.
        batch_size: Rows per window; also forwarded to ``encoder.encode``.

    Returns:
        ``np.ndarray`` of dtype float32 with one row per input row, in the
        same order as ``ds_split``.

    Raises:
        ValueError: if ``ds_split`` is empty (the embedding width cannot be
            determined without encoding at least one row).
    """
    n = len(ds_split)
    if n == 0:
        raise ValueError("Cannot compute embeddings for an empty split")

    out = None
    for start in tqdm(range(0, n, batch_size), desc=f"Encoding {text_col} (TRAIN)"):
        end = min(start + batch_size, n)
        # Slice the rows first, then pick the column: indexing the column first
        # (ds_split[text_col][start:end]) goes through the whole column on
        # every iteration.
        texts = ds_split[start:end][text_col]
        embs = encoder.encode(
            texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=False,
            device="cuda"
        )
        if out is None:
            # Allocate the result once the embedding width is known. Writing
            # into it avoids holding a list of chunks plus a vstack copy.
            out = np.empty((n, embs.shape[1]), dtype=np.float32)
        out[start:end] = embs  # assignment casts to float32
    return out

# ============================================================
# 5. Numeric/context features
# ============================================================
# Columns appended to the text embedding, in this exact order.
NUMERIC_COLS = ["amount", "log_amount", "year", "month", "day_of_week", "is_weekend"]

def extract_numeric(ds_split):
    """Return the ``NUMERIC_COLS`` of ``ds_split`` as a float32 matrix.

    Each column becomes one output column (same order as ``NUMERIC_COLS``);
    rows keep the order of ``ds_split``.
    """
    column_vectors = [
        np.asarray(ds_split[col_name], dtype=np.float32).reshape(-1, 1)
        for col_name in NUMERIC_COLS
    ]
    return np.concatenate(column_vectors, axis=1)

# ============================================================
# 6. Build TRAIN features and train XGBoost on GPU
# ============================================================
# Feature layout of every row: [text embedding | NUMERIC_COLS].
print("\nBuilding TRAIN features (embeddings + numeric)...")
X_train_text = compute_embeddings_for_split(ds_train, TEXT_COL, batch_size=ENC_BATCH_SIZE_TRAIN)
X_train_num  = extract_numeric(ds_train)
X_train      = np.hstack([X_train_text, X_train_num])

# np.hstack returned a copy, so the two inputs can be released right away.
del X_train_text, X_train_num

# For early stopping, use a SMALL validation set (e.g. 100k rows max).
# It is the first VAL_ES_MAX rows of ds_val, so these rows are also part of the
# full validation metrics reported later; the test split stays untouched.
VAL_ES_MAX = min(100_000, len(ds_val))
ds_val_es  = ds_val.select(range(VAL_ES_MAX))
y_val_es   = y_val[:VAL_ES_MAX]

print("\nBuilding VAL(ES) features...")
X_val_es_text = compute_embeddings_for_split(ds_val_es, TEXT_COL, batch_size=ENC_BATCH_SIZE_TRAIN)
X_val_es_num  = extract_numeric(ds_val_es)
X_val_es      = np.hstack([X_val_es_text, X_val_es_num])

del X_val_es_text, X_val_es_num

print("Shapes -> X_train:", X_train.shape, "y_train:", y_train.shape)
print("           X_val_es:", X_val_es.shape, "y_val_es:", y_val_es.shape)

# XGBoost GPU training
dtrain = xgb.DMatrix(X_train, label=y_train)
dval_es = xgb.DMatrix(X_val_es, label=y_val_es)

params = {
    "objective": "multi:softprob",
    "num_class": num_classes,
    "eval_metric": "mlogloss",
    # XGBoost 2.x selects the GPU through `device`; the former
    # tree_method="gpu_hist" is deprecated and the "predictor" parameter is
    # reported as unused (both warnings show up with the pinned 2.0.3).
    "tree_method": "hist",
    "device": "cuda",
    "max_depth": 8,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "max_bin": 256,
}

# val_es is listed last on purpose.
# NOTE(review): per the XGBoost docs, early stopping monitors the last entry
# of `evals`, and xgb.train returns the booster of the last round rather than
# best_iteration - confirm this is the intended model to evaluate and save.
evals = [(dtrain, "train"), (dval_es, "val_es")]
print("\nTraining XGBoost on GPU...")
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=600,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50
)
print("Best iteration:", bst.best_iteration)

# Free big training arrays to reclaim RAM
del X_train, dtrain, X_val_es, dval_es
import gc
gc.collect()

# ============================================================
# 7. Evaluation helpers (no big matrices stored)
# ============================================================
def evaluate_predictions(y_true, y_pred_ids, name="model"):
    """
    Print a per-class report for one set of predictions and return summary metrics.

    The report and the confusion matrix are computed over the full, fixed label
    set ``0 .. num_classes - 1``. Without an explicit ``labels`` argument
    scikit-learn infers the label set from the data, which breaks the
    ``target_names`` pairing and changes the confusion-matrix shape whenever a
    class is absent from both ``y_true`` and ``y_pred_ids``.

    Args:
        y_true: Ground-truth class ids.
        y_pred_ids: Predicted class ids, aligned with ``y_true``.
        name: Label used in the printed headers.

    Returns:
        dict with ``"macro_f1"`` (float) and ``"confusion_matrix"`` (nested
        list, ``num_classes`` x ``num_classes``).
    """
    label_ids = list(range(num_classes))

    print(f"\n=== Evaluation: {name} ===")
    print(classification_report(
        y_true,
        y_pred_ids,
        labels=label_ids,
        target_names=[id2label[i] for i in label_ids],
        digits=4
    ))
    # Macro F1 is intentionally left on the labels present in the data, so the
    # returned value is unchanged from the previous behaviour.
    macro_f1 = f1_score(y_true, y_pred_ids, average="macro")
    print(f"{name} macro F1:", macro_f1)
    cm = confusion_matrix(y_true, y_pred_ids, labels=label_ids)
    print(f"{name} Confusion matrix shape:", cm.shape)
    return {"macro_f1": float(macro_f1), "confusion_matrix": cm.tolist()}

# ============================================================
# 8. Prototype head from categories.json + train data
# ============================================================
# categories.json supplies, per category, a name and optional keywords; the
# names are matched against the dataset labels below.
print("\nLoading categories.json from:", CONFIG_PATH)
with open(CONFIG_PATH) as cfg_file:
    cats_cfg = json.load(cfg_file)

base_categories_cfg = cats_cfg["categories"]

# Config name -> dataset class id, or None when the dataset has no such label.
cfg_name_to_id = {}
for _entry in base_categories_cfg:
    cfg_name_to_id[_entry["name"]] = label2id.get(_entry["name"])
print("Config categories mapped to labels:", cfg_name_to_id)

# Cap on the number of real training texts mixed into each prototype.
MAX_REAL_EXAMPLES_PER_CAT = 500

def build_category_prototype(cat_cfg, ds_train, text_col):
    """
    Build the prototype (mean embedding) of one configured category.

    The texts averaged are the category's configured keywords plus up to
    ``MAX_REAL_EXAMPLES_PER_CAT`` training texts of that category (the first
    ones in row order), de-duplicated while keeping first-seen order.

    Args:
        cat_cfg: One entry of the categories config; ``"name"`` is required,
            ``"keywords"`` is optional (missing or null means no keywords).
        ds_train: Training split to draw real examples from. It must support
            indexing by a list of row numbers (``ds_train[[i, j]][text_col]``),
            as a Hugging Face ``Dataset`` does.
            NOTE: rows are selected with the module-level ``y_train``, so this
            argument has to be the split ``y_train`` was encoded from.
        text_col: Name of the text column in ``ds_train``.

    Returns:
        1-D float32 ``np.ndarray`` (mean of the text embeddings), or ``None``
        when the category has neither keywords nor training examples.
    """
    name = cat_cfg["name"]
    texts = list(cat_cfg.get("keywords") or [])

    if name in label2id:
        label_id = label2id[name]
        # Vectorised scan for this category's rows (first N in row order).
        row_ids = np.flatnonzero(np.asarray(y_train) == label_id)[:MAX_REAL_EXAMPLES_PER_CAT]
        if row_ids.size:
            # Fetch the selected rows in one call. Indexing the column first
            # (ds_train[text_col][i]) would go through the entire column once
            # per example.
            texts.extend(ds_train[row_ids.tolist()][text_col])

    texts = list(dict.fromkeys(texts))
    if not texts:
        return None

    embs = encoder.encode(
        texts,
        batch_size=min(len(texts), 1024),
        convert_to_numpy=True,
        show_progress_bar=False,
        device="cuda"
    )
    return embs.astype(np.float32).mean(axis=0)

# One prototype vector per configured category; categories that yield no text
# at all (build_category_prototype returns None) are left out.
prototypes = {}
for cat_cfg_entry in tqdm(base_categories_cfg, desc="Building prototypes"):
    name = cat_cfg_entry["name"]
    proto = build_category_prototype(cat_cfg_entry, ds_train, TEXT_COL)
    if proto is None:
        continue
    prototypes[name] = proto

proto_labels = list(prototypes)
print("Built prototypes for:", proto_labels)

# Rows follow proto_labels; normalize() is called with its defaults on the
# stacked prototype rows.
proto_matrix = normalize(np.stack(list(prototypes.values()), axis=0))
proto_name_to_idx = dict(zip(proto_labels, range(len(proto_labels))))

def predict_prototype(emb, top_k=3):
    """
    Rank the category prototypes by similarity to one embedding.

    The embedding is scaled to unit L2 norm and multiplied with the
    module-level ``proto_matrix``; the resulting scores are sorted descending.

    Args:
        emb: 1-D embedding vector with the same width as ``proto_matrix``.
        top_k: Maximum number of candidates to return; values <= 0 return an
            empty list.

    Returns:
        List of ``(category_name, similarity)`` tuples, best first.
    """
    norm = np.linalg.norm(emb, ord=2)
    # A zero vector cannot be normalised; dividing would turn every similarity
    # into NaN and make the ranking arbitrary. Use it unscaled (all scores 0).
    e = emb / norm if norm > 0 else emb
    sims = proto_matrix @ e
    # Stable sort so that tied scores keep prototype order deterministically.
    idx = np.argsort(-sims, kind="stable")[:max(int(top_k), 0)]
    return [(proto_labels[i], float(sims[i])) for i in idx]

# ============================================================
# 9. Chunked evaluation (XGBoost, prototype, hybrid) for any split
# ============================================================
def extract_numeric_chunk(ds_split, start, end):
    """
    Return the ``NUMERIC_COLS`` of rows ``start:end`` as a float32 matrix.

    Args:
        ds_split: Split to read from. It must support row slicing that returns
            a column mapping (``ds_split[a:b][col]``), as a Hugging Face
            ``Dataset`` does.
        start: First row of the chunk (inclusive).
        end: Last row of the chunk (exclusive).

    Returns:
        ``np.ndarray`` of shape ``(rows_in_chunk, len(NUMERIC_COLS))``, columns
        in ``NUMERIC_COLS`` order.
    """
    # Read the row window once. Indexing each column first
    # (ds_split[col][start:end]) would go through every full column per chunk.
    rows = ds_split[start:end]
    mats = [
        np.asarray(rows[col], dtype=np.float32).reshape(-1, 1)
        for col in NUMERIC_COLS
    ]
    return np.hstack(mats)

def eval_split_chunked(ds_split, y_true, name="split"):
    """
    Evaluate three predictors on a split, one chunk of rows at a time:
      - XGBoost-only   (argmax of the booster's class probabilities)
      - Prototype-only (most similar category prototype)
      - Hybrid         (routing between the two, see below)

    Only ``ENC_BATCH_SIZE_EVAL`` rows of features exist at any time; across
    chunks just the predicted ids are kept.

    Hybrid routing per row:
      - prototype label unknown to ``label2id``  -> prototype label
      - XGBoost confidence > 0.8 and both agree  -> XGBoost label
      - prototype similarity > 0.9               -> prototype label
      - otherwise                                -> XGBoost label

    A prototype label that is not in ``label2id`` cannot be scored against the
    dataset classes; as before it is mapped to class id 0, and the number of
    such rows is now reported in a warning so the metrics are not silently
    skewed.

    Args:
        ds_split: Split to evaluate. It must support ``len()`` and row slicing
            that returns a column mapping, as a Hugging Face ``Dataset`` does.
        y_true: Ground-truth class ids aligned with ``ds_split``.
        name: Split name used in progress and report headers.

    Returns:
        dict with keys ``"xgb"``, ``"proto"`` and ``"hybrid"``, each holding
        the result of ``evaluate_predictions``.
    """
    n = len(ds_split)

    # Resolve prototype -> class id once instead of once per row.
    proto_ids   = np.array([label2id.get(p, 0) for p in proto_labels], dtype=np.int32)
    proto_known = np.array([p in label2id for p in proto_labels], dtype=bool)

    xgb_chunks, proto_chunks, hybrid_chunks = [], [], []
    n_unmapped = 0

    for start in tqdm(range(0, n, ENC_BATCH_SIZE_EVAL), desc=f"Evaluating {name}"):
        end = min(start + ENC_BATCH_SIZE_EVAL, n)
        # Slice rows first: column-first indexing (ds_split[col][start:end])
        # would go through whole columns on every chunk.
        rows = ds_split[start:end]

        # 1) Encode texts -> embeddings (GPU)
        embs = encoder.encode(
            rows[TEXT_COL],
            batch_size=ENC_BATCH_SIZE_EVAL,
            convert_to_numpy=True,
            show_progress_bar=False,
            device="cuda"
        ).astype(np.float32)

        # 2) Numeric features, taken from the same row window
        num = np.hstack([
            np.asarray(rows[col], dtype=np.float32).reshape(-1, 1)
            for col in NUMERIC_COLS
        ])

        # 3) XGBoost predictions
        proba = bst.predict(xgb.DMatrix(np.hstack([embs, num])))
        xgb_ids = np.argmax(proba, axis=1)
        row_sel = np.arange(xgb_ids.shape[0])
        # float64 so the threshold comparisons match plain Python floats
        xgb_conf = proba[row_sel, xgb_ids].astype(np.float64)

        # 4) Prototype-only: similarity of every row to every prototype
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # leave zero vectors unscaled instead of producing NaN
        sims = (embs / norms) @ proto_matrix.T
        top = np.argmax(sims, axis=1)
        proto_conf = sims[row_sel, top].astype(np.float64)
        top_ids = proto_ids[top]
        top_known = proto_known[top]
        n_unmapped += int((~top_known).sum())

        # 5) Hybrid routing (see docstring)
        agree_confident = (xgb_conf > 0.8) & (xgb_ids == top_ids)
        take_proto = ~top_known | (~agree_confident & (proto_conf > 0.9))
        hybrid_ids = np.where(take_proto, top_ids, xgb_ids)

        xgb_chunks.append(xgb_ids.astype(np.int32))
        proto_chunks.append(top_ids.astype(np.int32))
        hybrid_chunks.append(hybrid_ids.astype(np.int32))

    def _join(chunks):
        # np.concatenate rejects an empty list; keep an empty int32 vector then.
        return np.concatenate(chunks) if chunks else np.empty(0, dtype=np.int32)

    xgb_preds    = _join(xgb_chunks)
    proto_preds  = _join(proto_chunks)
    hybrid_preds = _join(hybrid_chunks)

    if n_unmapped:
        print(
            f"WARNING ({name}): {n_unmapped} rows matched a prototype whose label "
            "is not a dataset class; they are scored as class id 0."
        )

    metrics = {}
    metrics["xgb"]    = evaluate_predictions(y_true, xgb_preds,    name=f"XGBoost ({name})")
    metrics["proto"]  = evaluate_predictions(y_true, proto_preds,  name=f"Prototype-only ({name})")
    metrics["hybrid"] = evaluate_predictions(y_true, hybrid_preds, name=f"Hybrid ({name})")
    return metrics

print("\nEvaluating on VAL with chunked pipeline...")
metrics_val = eval_split_chunked(ds_val, y_val, name="val")

print("\nEvaluating on TEST with chunked pipeline...")
metrics_test = eval_split_chunked(ds_test, y_test, name="test")

# ============================================================
# 10. Save artifacts
# ============================================================
import shutil  # only needed for the config copy below

print("\nSaving artifacts to:", ARTIFACTS_ROOT)

# XGBoost model
xgb_model_path = os.path.join(ARTIFACTS_ROOT, "xgboost_head.json")
bst.save_model(xgb_model_path)

# Label mappings
# NOTE: JSON object keys are strings, so id2label is stored with "0", "1", ...
# keys; convert them back to int when loading.
with open(os.path.join(ARTIFACTS_ROOT, "label_mappings.json"), "w") as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f, indent=2)

# Prototypes
# Labels are stored as a plain string array rather than dtype=object: object
# arrays are pickled inside the .npz and force consumers to call
# np.load(..., allow_pickle=True).
np.savez(
    os.path.join(ARTIFACTS_ROOT, "prototypes.npz"),
    proto_matrix=proto_matrix,
    proto_labels=np.array(proto_labels, dtype=str)
)

# Metrics
all_metrics = {
    "val": metrics_val,
    "test": metrics_test,
}
with open(os.path.join(ARTIFACTS_ROOT, "metrics.json"), "w") as f:
    json.dump(all_metrics, f, indent=2)

# Copy categories.json for reference.
# shutil.copy handles paths with spaces or shell metacharacters and raises on
# failure, whereas os.system("cp ...") would ignore a failed copy.
shutil.copy(CONFIG_PATH, os.path.join(ARTIFACTS_ROOT, "categories.json"))

print("Artifacts saved to:", ARTIFACTS_ROOT)

# Optional: push artifacts to HF Hub
if PUSH_TO_HF:
    api = HfApi()
    api.create_repo(repo_id=HF_MODEL_REPO, repo_type="model", exist_ok=True)
    print("Uploading artifacts folder to HF Hub:", HF_MODEL_REPO)
    api.upload_folder(
        repo_id=HF_MODEL_REPO,
        folder_path=ARTIFACTS_ROOT,
        repo_type="model"
    )
    print("Upload complete.")
else:
    print("Skipping HF Hub upload (set PUSH_TO_HF=True to enable).")

print("\nDone ✅")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Loading dataset: sreesharvesh/transactiq-enriched


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/756 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/93.7M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/93.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4501043 [00:00<?, ? examples/s]

Dataset({
    features: ['transaction_description', 'category', 'country', 'currency', 'amount', 'date', 'time', 'log_amount', 'year', 'month', 'day_of_week', 'is_weekend'],
    num_rows: 4501043
})
Columns: ['transaction_description', 'category', 'country', 'currency', 'amount', 'date', 'time', 'log_amount', 'year', 'month', 'day_of_week', 'is_weekend']
Total rows: 4501043
Train size (full): 3600834
Val size: 450104
Test size: 450105
Label mapping: {'Charity & Donations': 0, 'Entertainment & Recreation': 1, 'Financial Services': 2, 'Food & Dining': 3, 'Government & Legal': 4, 'Healthcare & Medical': 5, 'Income': 6, 'Shopping & Retail': 7, 'Transportation': 8, 'Utilities & Services': 9}
Loading encoder on GPU: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Building TRAIN features (embeddings + numeric)...


Encoding transaction_description (TRAIN):   0%|          | 0/880 [00:00<?, ?it/s]


Building VAL(ES) features...


Encoding transaction_description (TRAIN):   0%|          | 0/25 [00:00<?, ?it/s]

Shapes -> X_train: (3600834, 390) y_train: (3600834,)
           X_val_es: (100000, 390) y_val_es: (100000,)

Training XGBoost on GPU...



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



[0]	train-mlogloss:1.99084	val_es-mlogloss:1.99151
[50]	train-mlogloss:0.12217	val_es-mlogloss:0.12795
[100]	train-mlogloss:0.03251	val_es-mlogloss:0.03693
[150]	train-mlogloss:0.01644	val_es-mlogloss:0.02009
[200]	train-mlogloss:0.01186	val_es-mlogloss:0.01545
[250]	train-mlogloss:0.00985	val_es-mlogloss:0.01380
[300]	train-mlogloss:0.00863	val_es-mlogloss:0.01309
[350]	train-mlogloss:0.00775	val_es-mlogloss:0.01281
[400]	train-mlogloss:0.00707	val_es-mlogloss:0.01274
[427]	train-mlogloss:0.00675	val_es-mlogloss:0.01279
Best iteration: 377

Loading categories.json from: /content/drive/MyDrive/transactiq/config/categories.json
Config categories mapped to labels: {'Food & Dining': 3, 'Transportation': 8, 'Shopping & Retail': 7, 'Entertainment & Recreation': 1, 'Healthcare & Medical': 5, 'Utilities & Services': 9, 'Financial Services': 2, 'Income': 6, 'Government & Legal': 4, 'Charity & Donations': 0}


Building prototypes:   0%|          | 0/10 [00:00<?, ?it/s]

Built prototypes for: ['Food & Dining', 'Transportation', 'Shopping & Retail', 'Entertainment & Recreation', 'Healthcare & Medical', 'Utilities & Services', 'Financial Services', 'Income', 'Government & Legal', 'Charity & Donations']

Evaluating on VAL with chunked pipeline...


Evaluating val:   0%|          | 0/110 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"




=== Evaluation: XGBoost (val) ===
                            precision    recall  f1-score   support

       Charity & Donations     1.0000    1.0000    1.0000     45013
Entertainment & Recreation     1.0000    1.0000    1.0000     44950
        Financial Services     0.9998    1.0000    0.9999     45096
             Food & Dining     0.9956    0.9986    0.9971     44793
        Government & Legal     0.9955    0.9935    0.9945     45111
      Healthcare & Medical     0.9860    0.9746    0.9802     44985
                    Income     0.9997    0.9983    0.9990     45054
         Shopping & Retail     0.9675    0.9790    0.9732     44994
            Transportation     0.9984    0.9997    0.9991     44924
      Utilities & Services     0.9996    0.9984    0.9990     45184

                  accuracy                         0.9942    450104
                 macro avg     0.9942    0.9942    0.9942    450104
              weighted avg     0.9942    0.9942    0.9942    450104

XGBoost (v

Evaluating test:   0%|          | 0/110 [00:00<?, ?it/s]


=== Evaluation: XGBoost (test) ===
                            precision    recall  f1-score   support

       Charity & Donations     1.0000    1.0000    1.0000     45014
Entertainment & Recreation     1.0000    1.0000    1.0000     44949
        Financial Services     0.9998    0.9999    0.9999     45096
             Food & Dining     0.9959    0.9982    0.9971     44793
        Government & Legal     0.9955    0.9930    0.9942     45111
      Healthcare & Medical     0.9856    0.9753    0.9804     44986
                    Income     0.9998    0.9983    0.9990     45055
         Shopping & Retail     0.9672    0.9787    0.9729     44994
            Transportation     0.9984    0.9996    0.9990     44923
      Utilities & Services     0.9996    0.9985    0.9990     45184

                  accuracy                         0.9942    450105
                 macro avg     0.9942    0.9941    0.9942    450105
              weighted avg     0.9942    0.9942    0.9942    450105

XGBoost (

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ficient/xgboost_head.json:   1%|1         |  936kB / 84.5MB            

  ..._efficient/prototypes.npz:   5%|5         |   867B / 16.2kB            

Upload complete.

Done ✅
