In [None]:
!pip install -q datasets sentence-transformers scikit-learn xgboost matplotlib seaborn

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project folder used below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Root folder of the project on the mounted drive; the other paths in this notebook are built from it.
PROJECT_ROOT = "/content/drive/MyDrive/TransactIQ"

In [None]:
# Sub-directories (relative to PROJECT_ROOT) that make up the project layout.
DIRS = [
    "data/raw",
    "data/processed",
    "configs",
    "models",
    "notebooks",
    "src",
]


# Create every directory; exist_ok=True makes re-running this cell harmless.
for d in DIRS:
    os.makedirs(os.path.join(PROJECT_ROOT, d), exist_ok=True)

# Show the root path as the cell output.
PROJECT_ROOT

'/content/drive/MyDrive/TransactIQ'

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code the token in the notebook: read it from the HF_TOKEN environment
# variable, or prompt for it (input is hidden) when the variable is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
from datasets import load_dataset

# Load the "mitulshah/transaction-categorization" dataset; the result is indexed by split name below.
dataset = load_dataset("mitulshah/transaction-categorization")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Take the "train" split and show its size together with its first record.
train_data = dataset["train"]
len(train_data), train_data[0]

(4501043,
 {'transaction_description': 'Wage',
  'category': 'Income',
  'country': 'USA',
  'currency': 'USD'})

In [None]:
# Convert the split to a pandas DataFrame and preview the first rows.
df = train_data.to_pandas()
df.head()

Unnamed: 0,transaction_description,category,country,currency
0,Wage,Income,USA,USD
1,Arby's (Contactless),Food & Dining,AUSTRALIA,AUD
2,Occupational Therapy,Healthcare & Medical,USA,USD
3,Potbelly Store Branch,Food & Dining,UK,GBP
4,Amazon - AUSTRALIA,Shopping & Retail,AUSTRALIA,AUD


In [None]:
# Number of rows per category (class-balance check).
df["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Utilities & Services,451842
Government & Legal,451108
Financial Services,450959
Income,450545
Charity & Donations,450133
Shopping & Retail,449941
Healthcare & Medical,449857
Entertainment & Recreation,449495
Transportation,449235
Food & Dining,447928


In [None]:
# Row counts per (country, category) pair, pivoted so categories become columns; absent pairs are shown as 0.
df.groupby(["country", "category"]).size().unstack(fill_value=0)

category,Charity & Donations,Entertainment & Recreation,Financial Services,Food & Dining,Government & Legal,Healthcare & Medical,Income,Shopping & Retail,Transportation,Utilities & Services
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AUSTRALIA,89820,90237,90377,90003,89885,89374,91072,90110,89996,90891
CANADA,89663,89771,89855,89535,89703,89730,90567,89762,89802,90268
INDIA,90203,90204,90649,89547,90656,90261,89579,90599,89764,90082
UK,90513,89779,90298,89267,90779,90428,90228,88887,89558,90178
USA,89934,89504,89780,89576,90085,90064,89099,90583,90115,90423


In [None]:
import json

category_cfg_path = f"{PROJECT_ROOT}/configs/categories.json"

# Open the config inside a context manager so the file handle is closed
# deterministically (json.load(open(...)) leaves it to the garbage collector).
with open(category_cfg_path, encoding="utf-8") as cfg_file:
    category_cfg = json.load(cfg_file)

# Lookup tables derived from the config's "categories" list.
id_to_name = {c["id"]: c["name"] for c in category_cfg["categories"]}
name_to_id = {c["name"]: c["id"] for c in category_cfg["categories"]}
# A category without a "keywords" entry maps to an empty list.
name_to_keywords = {c["name"]: c.get("keywords", []) for c in category_cfg["categories"]}


In [None]:
import re
import numpy as np

def clean(text):
    """
    Normalise a transaction description for matching and embedding.

    Non-string input yields "". Otherwise the text is lower-cased, every
    character other than a-z, 0-9, whitespace, '.', ',', '-' and '_' is
    replaced by a space, whitespace runs are collapsed to a single space and
    the result is stripped at both ends.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Anything outside the allowed character set becomes a separator.
        filtered = re.sub(r"[^a-z0-9\s\.\,\-\_]", " ", lowered)
        collapsed = re.sub(r"\s+", " ", filtered)
        return collapsed.strip()
    return ""

def keyword_features(text, categories=None):
    """
    Count keyword hits per category in `text` and summarise them.

    text: description to scan. It is lower-cased before matching; keywords are
        compared as-is by substring match, so they are expected to be lower-case.
    categories: optional list of category dicts, each with an "id" and an
        optional "keywords" list. Defaults to category_cfg["categories"].

    Returns a tuple (best_cat, best_hits, total_hits):
        best_cat   - id of the category with the most keyword hits, or -1 when
                     nothing matched (the earliest category wins ties)
        best_hits  - number of hits for that category
        total_hits - number of hits summed over all categories
    """
    if categories is None:
        categories = category_cfg["categories"]

    text = text.lower()
    best_cat, best_hits, total_hits = -1, 0, 0

    for cat in categories:
        # A category without keywords contributes no hits instead of raising
        # KeyError (same tolerance as the name_to_keywords lookup).
        keywords = cat.get("keywords") or []
        # Skip empty keywords: "" is a substring of every string and would
        # otherwise be counted as a hit for every description.
        hits = sum(1 for kw in keywords if kw and kw in text)
        total_hits += hits
        if hits > best_hits:  # strict '>' keeps the earliest category on ties
            best_hits = hits
            best_cat = cat["id"]

    return best_cat, best_hits, total_hits

In [None]:
# Normalise the raw descriptions, then compute one (best_id, best_hits, total_hits) tuple per row.
df["clean_desc"] = df["transaction_description"].apply(clean)
feat = df["clean_desc"].apply(keyword_features)

# Unpack the tuple positions into separate feature columns.
df["kw_best_id"] = feat.map(lambda triple: triple[0])
df["kw_hits"] = feat.map(lambda triple: triple[1])
df["kw_total"] = feat.map(lambda triple: triple[2])

df.head()

Unnamed: 0,transaction_description,category,country,currency,clean_desc,kw_best_id,kw_hits,kw_total
0,Wage,Income,USA,USD,wage,7,1,1
1,Arby's (Contactless),Food & Dining,AUSTRALIA,AUD,arby s contactless,-1,0,0
2,Occupational Therapy,Healthcare & Medical,USA,USD,occupational therapy,-1,0,0
3,Potbelly Store Branch,Food & Dining,UK,GBP,potbelly store branch,2,1,1
4,Amazon - AUSTRALIA,Shopping & Retail,AUSTRALIA,AUD,amazon - australia,2,1,1


In [None]:
from sklearn.model_selection import train_test_split

df["category_id"] = df["category"].map(name_to_id)

# Series.map yields NaN for any category name that is absent from the config.
# Fail fast with a clear message instead of stratifying on / training with NaN labels.
unmapped = df.loc[df["category_id"].isna(), "category"].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Categories missing from categories.json: {sorted(map(str, unmapped))}"
    )

# 70% train / 30% held out, stratified so every split keeps the class balance.
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["category_id"], random_state=42
)

# Split the held-out 30% in half: 15% validation, 15% test.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["category_id"],
    random_state=42
)

y_train = train_df["category_id"].to_numpy()
y_val   = val_df["category_id"].to_numpy()

len(train_df), len(val_df), len(test_df)

(3150730, 675156, 675157)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm
import torch
import os

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Sentence-embedding model; embed_to_npy reads this notebook-level `model`.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

def embed_to_npy(texts, out_path, batch_size=512):
    """
    Encode `texts` with the notebook-level `model` and save them as a .npy file.

    Embeddings are streamed batch by batch into a disk-backed array, so the
    full (N, dim) float32 matrix is never held in memory. The data is written
    to a temporary file next to the target and renamed into place only after
    every batch succeeded, so a failed or interrupted run never leaves a
    partial result under the final name.

    texts: pandas Series of strings
    out_path: path (str) to .npy file to save final embeddings; as with
        np.save, ".npy" is appended when the path does not already end with it
    batch_size: batch size for encoding (positive integer)

    Raises ValueError if `texts` is empty or `batch_size` is not positive.
    """
    N = len(texts)
    if N == 0:
        raise ValueError("texts is empty – nothing to embed")
    if batch_size <= 0:
        # A negative step makes range() empty, which would silently save an
        # all-zero matrix; zero would fail only after files were created.
        raise ValueError("batch_size must be a positive integer")

    out_dir = os.path.dirname(out_path)
    if out_dir:  # dirname is "" for a bare filename and os.makedirs("") raises
        os.makedirs(out_dir, exist_ok=True)

    # Keep the final filename identical to what np.save(out_path, ...) produces.
    final_path = out_path if out_path.endswith(".npy") else out_path + ".npy"
    tmp_path = final_path + ".tmp"

    # --- First pass: get embedding dimension ---
    sample_emb = np.asarray(model.encode([texts.iloc[0]], show_progress_bar=False))
    dim = sample_emb.shape[-1]  # last axis for both 1-D and 2-D results

    print(f"Embedding {N} texts into dim={dim}")

    store = None
    try:
        # Memory-mapped array that already carries the .npy header, so no
        # second full copy of the data has to be written afterwards.
        store = np.lib.format.open_memmap(
            tmp_path, mode="w+", dtype="float32", shape=(N, dim)
        )

        # --- Streaming computation ---
        for start in tqdm(range(0, N, batch_size)):
            batch = texts.iloc[start:start + batch_size].tolist()

            emb = model.encode(
                batch,
                batch_size=batch_size,
                show_progress_bar=False
            )
            emb = np.asarray(emb, dtype="float32")

            # Row i of the output corresponds to text i; a row-count mismatch
            # makes this assignment raise instead of misaligning rows.
            store[start:start + len(batch)] = emb

        store.flush()
        store = None  # drop the mapping before renaming the file

        os.replace(tmp_path, final_path)
    except BaseException:
        # Remove the partial temp file on any failure (including interrupts).
        store = None
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print(f"Saved embeddings to: {out_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Embed the cleaned descriptions of every split and store one .npy file per split.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    embed_to_npy(
        texts=split_df["clean_desc"],
        out_path=f"{PROJECT_ROOT}/data/processed/{split_name}_embeddings.npy"
    )

Embedding 3150730 texts into dim=384


  0%|          | 0/6154 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/TransactIQ/data/processed/train_embeddings.npy
Embedding 675156 texts into dim=384


  0%|          | 0/1319 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/TransactIQ/data/processed/val_embeddings.npy
Embedding 675157 texts into dim=384


  0%|          | 0/1319 [00:00<?, ?it/s]

Saved embeddings to: /content/drive/MyDrive/TransactIQ/data/processed/test_embeddings.npy


In [None]:
# Folder holding the processed artefacts, and the embedding file of each split inside it.
proc_dir = os.path.join(PROJECT_ROOT, "data/processed")

train_emb_path, val_emb_path, test_emb_path = (
    os.path.join(proc_dir, f"{split}_embeddings.npy")
    for split in ("train", "val", "test")
)