In [1]:
#Cell 1: install requirements
!pip install scikit-learn
!pip install --upgrade transformers
!pip install joblib

[0m

In [2]:
# Cell 2: Imports
import pandas as pd
import numpy as np
import torch
import joblib
from torch.utils.data import TensorDataset

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

2025-05-05 00:21:56.044770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746404516.068844    2494 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746404516.076072    2494 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746404516.101834    2494 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746404516.101855    2494 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746404516.101857    2494 computation_placer.cc:177] computation placer alr

In [3]:
# Cell 3: data preprocessing. This was done locally to protect sensitive information, more info in report
"""
import pandas as pd
import hashlib


def anonymize_donor_names(input_csv: str, output_csv: str) -> pd.DataFrame:
    Load the transaction CSV, hash the Donor Name field using SHA-256,
    and write out a new CSV with names anonymized.

    Parameters:
    -----------
    input_csv : str
        Path to your original CSV file.
    output_csv : str
        Path where the anonymized CSV will be saved.

    Returns:
    --------
    pd.DataFrame
        The DataFrame with the 'Donor Name' column replaced by its hash.
    # 1. Load
    df = pd.read_csv(input_csv)

    # 2. Check for the column
    if "Donor Name" not in df.columns:
        raise KeyError("Could not find column 'Donor Name' in the input CSV.")

    # 3. Hash each name (empty→hash of empty string)
    salt = "hidden"
    df["Donor Name"] = (
        df["Donor Name"]
        .fillna("")
        .apply(lambda x: hashlib.sha256((salt + x).encode("utf-8")).hexdigest())
    )
    # Replace memo field for ACCT_XFER rows based on description
    df.loc[
        (df["Type"] == "ACCT_XFER")
        & (df["Description"].str.startswith("Online Transfer from CHK ...hidden")),
        "Memo",
    ] = "Sadaqah - Masjid Operations"
    df.loc[
        (df["Type"] == "ACCT_XFER")
        & (df["Description"].str.startswith("Online Transfer to CHK ...hidden")),
        "Memo",
    ] = "Ramadan"
    df.loc[
        (df["Type"] == "ACCT_XFER")
        & (df["Description"].str.startswith("Online Transfer to CHK ...hidden")),
        "Memo",
    ] = "Reconstruction"
    df.loc[
        (df["Type"] == "ACCT_XFER")
        & (df["Description"].str.startswith("Online Transfer to CHK ...hidden")),
        "Memo",
    ] = "Sadaqah - For Needy"
    df.loc[
        (df["Type"] == "ACCT_XFER")
        & (df["Description"].str.startswith("Online Transfer to CHK ...hidden")),
        "Memo",
    ] = "Seminary"
    df.loc[
        (df["Type"] == "ACCT_XFER")
        & (df["Description"].str.startswith("Online Transfer to CHK ...hidden")),
        "Memo",
    ] = "Zakat"
    df.drop(
        columns=[
            "Posting Date",
            "Description",
            "Balance",
            "ReferenceID",
            "Donation Date",
            "Email",
            "Source",
            "Unnamed: 13",
            "Sadaqah - Masjid Operations",
            "Sadaqah - For Needy",
            "Zakat",
            "Ramadan",
            "Reconstruction",
            "Seminary",
            "Unnamed: 20",
            "Diff",
            "Unnamed: 22",
            "Notes",
        ],
        inplace=True,
    )

    # Remove rows where 'Details' column equals 'TOTAL'
    df = df[df["Details"] != "TOTAL"]

    # Drop rows where Fund is missing or lists multiple accounts
    df = df[df["Fund"].notna()]
    df = df[~df["Fund"].str.contains(",")]
    # 4. Save anonymized CSV
    df.to_csv(output_csv, index=False)
    return df


df_anon = anonymize_donor_names(
    "ai/Miscellaneous Account Cleanup - Sheet1.csv",
    "ai/Miscellaneous Account Cleanup - Sheet1_anon.csv",
)
"""

'\nimport pandas as pd\nimport hashlib\n\n\ndef anonymize_donor_names(input_csv: str, output_csv: str) -> pd.DataFrame:\n    Load the transaction CSV, hash the Donor Name field using SHA-256,\n    and write out a new CSV with names anonymized.\n\n    Parameters:\n    -----------\n    input_csv : str\n        Path to your original CSV file.\n    output_csv : str\n        Path where the anonymized CSV will be saved.\n\n    Returns:\n    --------\n    pd.DataFrame\n        The DataFrame with the \'Donor Name\' column replaced by its hash.\n    # 1. Load\n    df = pd.read_csv(input_csv)\n\n    # 2. Check for the column\n    if "Donor Name" not in df.columns:\n        raise KeyError("Could not find column \'Donor Name\' in the input CSV.")\n\n    # 3. Hash each name (empty→hash of empty string)\n    salt = "hidden"\n    df["Donor Name"] = (\n        df["Donor Name"]\n        .fillna("")\n        .apply(lambda x: hashlib.sha256((salt + x).encode("utf-8")).hexdigest())\n    )\n    # Replace m

In [4]:
#Cell 3.5: read csv from disk 
df = pd.read_csv("Miscellaneous Account Cleanup - Sheet1_anon.csv")

In [5]:
df.head()

Unnamed: 0,Details,Amount,Type,Donor Name,Fund,Memo
0,CREDIT,1.0,ACCT_XFER,797e1bca657d149611fc11d26e6a60fd62798b6095897a...,Sadaqah - Masjid Operations,Sadaqah - Masjid Operations
1,CREDIT,11.11,PARTNERFI_TO_CHASE,7a5d7ecbb7a12468b1cbdeeeaf5127f03d2377578cd67f...,Sadaqah - For Needy,Sadaqa
2,CREDIT,2.0,PARTNERFI_TO_CHASE,d4cc3604dd422d3f342b7c2474a2d00aebff74051d3ff6...,Sadaqah - Masjid Operations,
3,CREDIT,5.0,QUICKPAY_CREDIT,61ac16b23d67abb8f0a6804ccc03da15b83e069f17b629...,Sadaqah - Masjid Operations,pakistan food relief
4,CREDIT,10.0,QUICKPAY_CREDIT,e617589ec529bc7f6b494818887779160493eb08b7102e...,Sadaqah - Masjid Operations,flood relief


In [6]:
# Cell 4: Build the text input from the five fields + amount token
df[["Details","Type","Donor Name","Memo"]] = (
    df[["Details","Type","Donor Name","Memo"]]
      .fillna("")
      .astype(str)
      .map(str.lower)
)
df["Amount"] = df["Amount"].astype(float)

df["text"] = (
    df["Details"] + " | "
  + df["Type"]    + " | "
  + df["Donor Name"] + " | "
  + df["Memo"]    + " | "
  + df["Amount"].apply(lambda x: f"<amt={x:.2f}>")
)

In [7]:
# Cell 5: Label‐encode the Fund target and stratify‐split
le = LabelEncoder().fit(df["Fund"])
df["label"] = le.transform(df["Fund"])

train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label"], random_state=42
)

print("Sizes:", len(train_df), "train,", len(val_df), "val,", len(test_df), "test")


Sizes: 2230 train, 478 val, 478 test


In [8]:
# Cell 6: Manual tokenization → TensorDataset (avoids Arrow copy-error)


tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# 1. Tokenize in one go
train_enc = tokenizer(
    train_df["text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)
val_enc = tokenizer(
    val_df["text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)
test_enc = tokenizer(
    test_df["text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)

# 2. Build TensorDatasets
train_dataset = TensorDataset(
    train_enc.input_ids,
    train_enc.attention_mask,
    torch.tensor(train_df["label"].values, dtype=torch.long),
)
val_dataset = TensorDataset(
    val_enc.input_ids,
    val_enc.attention_mask,
    torch.tensor(val_df["label"].values, dtype=torch.long),
)
test_dataset = TensorDataset(
    test_enc.input_ids,
    test_enc.attention_mask,
    torch.tensor(test_df["label"].values, dtype=torch.long),
)

print(f"→ train/examples: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")


→ train/examples: 2230, val: 478, test: 478


In [9]:
#Cell 7: fine tune
num_labels = len(le.classes_)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="single_label_classification"
)

def tuple_collator(batch):
    input_ids      = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels         = torch.tensor([item[2] for item in batch], dtype=torch.long)
    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "labels":         labels,
    }
training_args = TrainingArguments(
    num_train_epochs=150,
    per_device_train_batch_size=16,
    learning_rate=2e-5,

    save_strategy="epoch",            
    save_total_limit=1,               
    load_best_model_at_end=True,      
    metric_for_best_model="eval_loss",
    greater_is_better=False,         

    eval_strategy="epoch",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # this is your TensorDataset
    eval_dataset=val_dataset,      # same here
    data_collator=tuple_collator,  # <- override the default
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.193515
2,No log,0.127812
3,No log,0.121715
4,0.247200,0.131757
5,0.247200,0.124959
6,0.247200,0.158625
7,0.247200,0.104948
8,0.056700,0.120336
9,0.056700,0.205323
10,0.056700,0.105623


TrainOutput(global_step=21000, training_loss=0.012422518202236721, metrics={'train_runtime': 61810.6483, 'train_samples_per_second': 5.412, 'train_steps_per_second': 0.34, 'total_flos': 2.2156752849408e+16, 'train_loss': 0.012422518202236721, 'epoch': 150.0})

In [10]:
# Cell 8: Evaluate on test set
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
y_true = test_df["label"]

print(classification_report(y_true, y_pred, target_names=le.classes_))


                             precision    recall  f1-score   support

                    Ramadan       0.98      0.98      0.98        57
             Reconstruction       0.92      0.88      0.90        50
        Sadaqah - For Needy       0.85      0.89      0.87        19
Sadaqah - Masjid Operations       0.99      0.99      0.99       326
                   Seminary       1.00      1.00      1.00         5
                      Zakat       0.91      0.95      0.93        21

                   accuracy                           0.97       478
                  macro avg       0.94      0.95      0.95       478
               weighted avg       0.97      0.97      0.97       478



In [11]:
# Cell 9: Save model
model.save_pretrained("fund_model/best/")
tokenizer.save_pretrained("fund_model/best/")
joblib.dump(le, "fund_model/best/label_encoder.pkl")

['fund_model/best/label_encoder.pkl']