In [6]:
# Optional installs (uncomment if needed)
# !pip install -q pandas numpy scikit-learn lightgbm optuna joblib imbalanced-learn

import os, time, warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype


from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    precision_recall_curve,
    auc,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
import lightgbm as lgb
import optuna, joblib
from imblearn.over_sampling import RandomOverSampler

# ===========================
# USER CONFIG
# ===========================

DATA_PATH = r"C:/Users/mannan/Desktop/ML_final/HI-Small_Trans.csv"

ID_COL = None
TARGET_COL = "Is Laundering"

CATEGORICAL_COLS = [
    "Account",
    "Account.1",
    "Receiving Currency",
    "Payment Currency",
    "Payment Format"
]

NUMERIC_COLS = [
    "From Bank",
    "To Bank",
    "Amount Received",
    "Amount Paid"
]

OUTPUT_DIR = "./output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

CHUNK_SIZE = 200_000
KEEP_NEGATIVES = 70_000
RANDOM_STATE = 42

N_TRIALS = 8
N_FOLDS = 2
N_JOBS = 1

BASE_LGB_PARAMS = {
    'objective': 'binary',
    'metric': ['auc', 'average_precision'],
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'max_bin': 255,
    'min_data_in_leaf': 100,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbosity': -1,
    'num_threads': max(1, (os.cpu_count() or 2) - 1)
}

MAX_ONEHOT_CARDINALITY = 100


# ============================================
# 0. FEATURE ENGINEERING (UPDATED)
# ============================================
def add_feature_engineering(df):
    """
    Add AML-style behavioral and interaction features.
    """
    df = df.copy()

    # ---------- Amount interaction features ----------
    if {"Amount Received", "Amount Paid"}.issubset(df.columns):
        df["amt_diff"] = df["Amount Received"] - df["Amount Paid"]
        df["amt_sum"] = df["Amount Received"] + df["Amount Paid"]

        denom = df["Amount Paid"].replace(0, np.nan)
        df["amt_ratio"] = df["Amount Received"] / denom
        df["amt_ratio"] = df["amt_ratio"].replace([np.inf, -np.inf], np.nan)

        df["log_amt_received"] = np.log1p(df["Amount Received"].clip(lower=0))
        df["log_amt_paid"] = np.log1p(df["Amount Paid"].clip(lower=0))

    # ---------- Relationship flags ----------
    # Same currency? (cast to string to avoid categorical comparison error)
    if {"Receiving Currency", "Payment Currency"}.issubset(df.columns):
        rc = df["Receiving Currency"].astype(str)
        pc = df["Payment Currency"].astype(str)
        df["same_currency"] = (rc == pc).astype(np.int8)

    # Same bank?
    if {"From Bank", "To Bank"}.issubset(df.columns):
        df["same_bank"] = (df["From Bank"] == df["To Bank"]).astype(np.int8)

    # Same account? (cast to string to avoid categorical comparison error)
    if {"Account", "Account.1"}.issubset(df.columns):
        acc1 = df["Account"].astype(str)
        acc2 = df["Account.1"].astype(str)
        df["same_account"] = (acc1 == acc2).astype(np.int8)

    # ---------- Global amount z-scores ----------
    for col in ["Amount Received", "Amount Paid"]:
        if col in df.columns:
            mean = df[col].mean()
            std = df[col].std()
            if std > 0:
                df[f"{col}_z"] = (df[col] - mean) / std
            else:
                df[f"{col}_z"] = 0.0

    # ---------- Account-level behavioral stats ----------
    if "Account" in df.columns and "Amount Paid" in df.columns:
        grp_sender = df.groupby("Account")["Amount Paid"]
        df["acc_tx_count"] = grp_sender.transform("count").astype(np.int32)
        df["acc_amt_paid_sum"] = grp_sender.transform("sum")
        df["acc_amt_paid_mean"] = grp_sender.transform("mean")
        df["acc_amt_paid_std"] = grp_sender.transform("std").fillna(0)

    if "Account.1" in df.columns and "Amount Received" in df.columns:
        grp_rcv = df.groupby("Account.1")["Amount Received"]
        df["rcv_tx_count"] = grp_rcv.transform("count").astype(np.int32)
        df["rcv_amt_received_sum"] = grp_rcv.transform("sum")
        df["rcv_amt_received_mean"] = grp_rcv.transform("mean")
        df["rcv_amt_received_std"] = grp_rcv.transform("std").fillna(0)

    return df


# ============================================
# 1. READ + DOWNSAMPLE
# ============================================
def read_and_downsample(
    data_path,
    chunk_size=CHUNK_SIZE,
    keep_negatives=KEEP_NEGATIVES,
    target_col=TARGET_COL,
    random_state=RANDOM_STATE
):
    dfs = []
    pos_total, neg_total = 0, 0

    for chunk_idx, chunk in enumerate(pd.read_csv(data_path, chunksize=chunk_size)):
        print(f"Processing chunk {chunk_idx+1}...")

        if target_col not in chunk.columns:
            raise KeyError(
                f"Target column '{target_col}' not found in CSV. "
                f"Available columns: {list(chunk.columns)}"
            )

        chunk = chunk.dropna(subset=[target_col])

        pos = chunk[chunk[target_col] == 1]
        neg = chunk[chunk[target_col] == 0]

        pos_total += len(pos)
        neg_total += len(neg)

        if len(neg) > keep_negatives:
            neg = neg.sample(keep_negatives, random_state=random_state)

        dfs.append(pd.concat([pos, neg], axis=0))

    df = pd.concat(dfs, axis=0).reset_index(drop=True)
    print(f"Total positives: {pos_total}, total negatives before downsample: {neg_total}")
    print(f"Final combined df shape: {df.shape}")
    return df


# ============================================
# 2. PREPROCESS (USES FEATURE ENGINEERING)
# ============================================
def preprocess(
    df,
    id_col=ID_COL,
    target_col=TARGET_COL,
    cat_cols=CATEGORICAL_COLS,
    num_cols=NUMERIC_COLS
):
    print("Preprocessing data...")

    if id_col is not None and id_col in df.columns:
        df = df.drop(columns=[id_col])

    if "Timestamp" in df.columns:
        df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
        df["Hour"] = df["Timestamp"].dt.hour
        df["Day"] = df["Timestamp"].dt.day
        df["Weekday"] = df["Timestamp"].dt.weekday
        df = df.drop(columns=["Timestamp"])

    if target_col in df.columns:
        df[target_col] = df[target_col].astype(int)

    cat_cols_in_df = [c for c in cat_cols if c in df.columns]

    for col in cat_cols_in_df:
        df[col] = (
            df[col]
            .astype("category")
            .cat.add_categories("Missing")
            .fillna("Missing")
        )

    # Add engineered features
    df = add_feature_engineering(df)

    # Numeric filling (all numeric except target)
    numeric_cols_all = [
        c for c in df.columns
       if (is_numeric_dtype(df[c]) and c != target_col)
    ]

    print("Numeric columns (after feature engineering):", len(numeric_cols_all))
    for col in numeric_cols_all:
        df[col] = df[col].astype(float)
        df[col] = df[col].fillna(df[col].median())

    cat_cols_in_df = [c for c in cat_cols_in_df if c in df.columns]

    low_card_cols, high_card_cols = [], []

    print("Categorical column cardinalities:")
    for col in cat_cols_in_df:
        nunique = df[col].nunique()
        print(f"  {col}: {nunique} unique values")
        if nunique <= MAX_ONEHOT_CARDINALITY:
            low_card_cols.append(col)
        else:
            high_card_cols.append(col)

    # Frequency-encode high-cardinality categoricals
    for col in high_card_cols:
        freq = df[col].value_counts()
        new_col = f"{col}_freq"
        df[new_col] = df[col].map(freq)
        df[new_col] = df[new_col].fillna(0)
        df.drop(columns=[col], inplace=True)

    print(f"Low-cardinality one-hot columns: {low_card_cols}")
    print(f"High-cardinality frequency-encoded columns: {high_card_cols}")

    df_encoded = pd.get_dummies(df, columns=low_card_cols, drop_first=True)

    cols = [c for c in df_encoded.columns if c != target_col] + [target_col]
    df_encoded = df_encoded[cols]

    print("Preprocessing done. Shape:", df_encoded.shape)
    return df_encoded


# ============================================
# 3. TRAIN / VAL SPLIT
# ============================================
def split_train_val(
    df,
    target_col=TARGET_COL,
    test_size=0.2,
    random_state=RANDOM_STATE
):
    X = df.drop(columns=[target_col])
    y = df[target_col].astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    print("Train shape:", X_train.shape, "Val shape:", X_val.shape)
    return X_train, X_val, y_train, y_val


# ============================================
# 4. BALANCING & DATASET WRAPPER
# ============================================
def balance_training_data(X, y):
    print("Balancing training data with RandomOverSampler...")
    ros = RandomOverSampler(random_state=RANDOM_STATE)
    X_res, y_res = ros.fit_resample(X, y)
    print(
        "After resampling:",
        X_res.shape,
        "Positives:", int(y_res.sum()),
        "Negatives:", int(len(y_res) - y_res.sum())
    )
    return X_res, y_res


def make_lgb_dataset(X, y):
    return lgb.Dataset(X, label=y, free_raw_data=False)


# ============================================
# 5. OPTUNA OBJECTIVE
# ============================================
def objective(trial, X, y, base_params):
    print(f"Starting trial: {trial.number}")

    params = base_params.copy()
    params.update({
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 500),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
    })

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    pr_aucs, roc_aucs = [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"  Fold {fold+1}/{N_FOLDS}")
        X_tr, X_v = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_v = y.iloc[train_idx], y.iloc[val_idx]

        X_tr_bal, y_tr_bal = balance_training_data(X_tr, y_tr)
        dtrain = make_lgb_dataset(X_tr_bal, y_tr_bal)
        dval = lgb.Dataset(X_v, label=y_v, free_raw_data=False)

        gbm = lgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            valid_sets=[dval],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False)
            ]
        )

        y_pred = gbm.predict(X_v, num_iteration=gbm.best_iteration)
        roc = roc_auc_score(y_v, y_pred)
        p, r, _ = precision_recall_curve(y_v, y_pred)
        pr = auc(r, p)

        roc_aucs.append(roc)
        pr_aucs.append(pr)

    mean_pr = float(np.mean(pr_aucs))
    print(f"Trial {trial.number} finished. Mean PR-AUC: {mean_pr:.4f}")
    return mean_pr


# ============================================
# 6. LOAD + PREPROCESS + SPLIT
# ============================================
df_raw = read_and_downsample(DATA_PATH)
df_processed = preprocess(df_raw)
X_train_full, X_val, y_train_full, y_val = split_train_val(df_processed)


# ============================================
# 7. HYPERPARAMETER TUNING
# ============================================
study = optuna.create_study(direction='maximize')
study.optimize(
    lambda trial: objective(trial, X_train_full, y_train_full, BASE_LGB_PARAMS),
    n_trials=N_TRIALS,
    n_jobs=N_JOBS
)

print("Best trial:")
print("  Value:", study.best_trial.value)
print("  Params:")
for k, v in study.best_trial.params.items():
    print(f"    {k}: {v}")


# ============================================
# 8. SAVE BEST PARAMS & STUDY
# ============================================
best_params = BASE_LGB_PARAMS.copy()
best_params.update(study.best_trial.params)

joblib.dump(best_params, os.path.join(OUTPUT_DIR, "best_lgb_params.pkl"))
joblib.dump(study, os.path.join(OUTPUT_DIR, "optuna_study.pkl"))

print("Best params and study saved.")


# ============================================
# 9. FINAL TRAINING
# ============================================
final_params = best_params.copy()
final_params['num_threads'] = max(1, (os.cpu_count() or 2) - 1)

X_train_bal, y_train_bal = balance_training_data(X_train_full, y_train_full)
dtrain = make_lgb_dataset(X_train_bal, y_train_bal)

X_val = X_val[[c for c in X_train_full.columns if c != TARGET_COL]]
y_val_arr = y_val.values
dval = lgb.Dataset(X_val, label=y_val_arr)

print("Starting final LightGBM training...")
t0 = time.time()
gbm_full = lgb.train(
    final_params,
    dtrain,
    num_boost_round=2000,
    valid_sets=[dval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)
t1 = time.time()
print(f"Training done in {(t1 - t0):.1f} seconds.")

model_path = os.path.join(OUTPUT_DIR, "final_lgb_model.txt")
gbm_full.save_model(model_path)
print(f"Model saved to {model_path}")

joblib.dump(final_params, os.path.join(OUTPUT_DIR, "final_lgb_params.pkl"))
print("Final params saved.")

y_pred_val = gbm_full.predict(X_val, num_iteration=gbm_full.best_iteration)
pd.DataFrame({'y_true': y_val_arr, 'y_prob': y_pred_val}).to_csv(
    os.path.join(OUTPUT_DIR, "val_predictions.csv"),
    index=False
)
print("Validation predictions saved.")

roc = roc_auc_score(y_val_arr, y_pred_val)
p, r, _ = precision_recall_curve(y_val_arr, y_pred_val)
prauc = auc(r, p)
print(f"Validation ROC-AUC: {roc:.4f}, PR-AUC: {prauc:.4f}")
print(classification_report(y_val_arr, (y_pred_val > 0.5).astype(int)))
print("Confusion matrix:\n", confusion_matrix(y_val_arr, (y_pred_val > 0.5).astype(int)))


Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Total positives: 5177, total negatives before downsample: 5073168
Final combined df shape: (1825177, 11)
Preprocessing data...
Numeric columns (after feature engineering): 25
Categorical column cardinalities:
  Account: 316281 unique values
  Account.1: 311342 unique values
  Receiving Currency: 15 unique values
  Payment Currency: 15 unique values
  Payment Format: 7 unique values
Low-cardinality one-hot co

[I 2025-11-16 15:14:04,768] A new study created in memory with name: no-name-1b0c849a-7bb1-4123-b14d-7ce89cf853f3


Train shape: (1460141, 64) Val shape: (365036, 64)
Starting trial: 0
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 15:15:48,244] Trial 0 finished with value: 0.44198855037817597 and parameters: {'learning_rate': 0.105112855976856, 'num_leaves': 87, 'min_data_in_leaf': 323, 'feature_fraction': 0.839798003361497, 'bagging_fraction': 0.683641576013517, 'bagging_freq': 6, 'lambda_l1': 3.02012210636246, 'lambda_l2': 3.327660787168076}. Best is trial 0 with value: 0.44198855037817597.


Trial 0 finished. Mean PR-AUC: 0.4420
Starting trial: 1
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 15:17:40,254] Trial 1 finished with value: 0.43978373632746903 and parameters: {'learning_rate': 0.10257090153840911, 'num_leaves': 108, 'min_data_in_leaf': 218, 'feature_fraction': 0.868612046567186, 'bagging_fraction': 0.650230882068405, 'bagging_freq': 10, 'lambda_l1': 0.2979241522486714, 'lambda_l2': 3.3263826010925532}. Best is trial 0 with value: 0.44198855037817597.


Trial 1 finished. Mean PR-AUC: 0.4398
Starting trial: 2
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 15:54:32,651] Trial 2 finished with value: 0.460415580658329 and parameters: {'learning_rate': 0.06719218114359057, 'num_leaves': 189, 'min_data_in_leaf': 499, 'feature_fraction': 0.7275012352524779, 'bagging_fraction': 0.7589182209524341, 'bagging_freq': 1, 'lambda_l1': 2.053564656276536, 'lambda_l2': 1.8220184672216204}. Best is trial 2 with value: 0.460415580658329.


Trial 2 finished. Mean PR-AUC: 0.4604
Starting trial: 3
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 15:55:47,159] Trial 3 finished with value: 0.4441917074385171 and parameters: {'learning_rate': 0.11751449658408694, 'num_leaves': 134, 'min_data_in_leaf': 298, 'feature_fraction': 0.6241473515730315, 'bagging_fraction': 0.9471979773725262, 'bagging_freq': 5, 'lambda_l1': 3.2864826398242593, 'lambda_l2': 4.111883517814671}. Best is trial 2 with value: 0.460415580658329.


Trial 3 finished. Mean PR-AUC: 0.4442
Starting trial: 4
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 15:59:00,575] Trial 4 finished with value: 0.4501726436966337 and parameters: {'learning_rate': 0.022838254135343253, 'num_leaves': 89, 'min_data_in_leaf': 158, 'feature_fraction': 0.6144141443860571, 'bagging_fraction': 0.9451867062601269, 'bagging_freq': 0, 'lambda_l1': 3.10984487525229, 'lambda_l2': 3.1395127225223445}. Best is trial 2 with value: 0.460415580658329.


Trial 4 finished. Mean PR-AUC: 0.4502
Starting trial: 5
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 18:00:43,410] Trial 5 finished with value: 0.41772182261576196 and parameters: {'learning_rate': 0.07010912075451153, 'num_leaves': 55, 'min_data_in_leaf': 185, 'feature_fraction': 0.8950248804351115, 'bagging_fraction': 0.7811810741523095, 'bagging_freq': 0, 'lambda_l1': 0.5241957588226287, 'lambda_l2': 0.7506942832909097}. Best is trial 2 with value: 0.460415580658329.


Trial 5 finished. Mean PR-AUC: 0.4177
Starting trial: 6
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 18:03:53,058] Trial 6 finished with value: 0.4705191517438958 and parameters: {'learning_rate': 0.02831801431478083, 'num_leaves': 150, 'min_data_in_leaf': 155, 'feature_fraction': 0.6478254128653478, 'bagging_fraction': 0.8488020322617558, 'bagging_freq': 7, 'lambda_l1': 4.7118528368351, 'lambda_l2': 4.0054122968063615}. Best is trial 6 with value: 0.4705191517438958.


Trial 6 finished. Mean PR-AUC: 0.4705
Starting trial: 7
  Fold 1/2
Balancing training data with RandomOverSampler...
After resampling: (1455998, 64) Positives: 727999 Negatives: 727999
  Fold 2/2
Balancing training data with RandomOverSampler...
After resampling: (1456000, 64) Positives: 728000 Negatives: 728000


[I 2025-11-16 18:06:14,967] Trial 7 finished with value: 0.4332010817693698 and parameters: {'learning_rate': 0.03401780934686374, 'num_leaves': 114, 'min_data_in_leaf': 333, 'feature_fraction': 0.9665469599063847, 'bagging_fraction': 0.9646103064363295, 'bagging_freq': 10, 'lambda_l1': 0.3786059818679294, 'lambda_l2': 2.619132804105346}. Best is trial 6 with value: 0.4705191517438958.


Trial 7 finished. Mean PR-AUC: 0.4332
Best trial:
  Value: 0.4705191517438958
  Params:
    learning_rate: 0.02831801431478083
    num_leaves: 150
    min_data_in_leaf: 155
    feature_fraction: 0.6478254128653478
    bagging_fraction: 0.8488020322617558
    bagging_freq: 7
    lambda_l1: 4.7118528368351
    lambda_l2: 4.0054122968063615
Best params and study saved.
Balancing training data with RandomOverSampler...
After resampling: (2911998, 64) Positives: 1455999 Negatives: 1455999
Starting final LightGBM training...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.978139	valid_0's average_precision: 0.352922
[200]	valid_0's auc: 0.979849	valid_0's average_precision: 0.428773
[300]	valid_0's auc: 0.980032	valid_0's average_precision: 0.456152
[400]	valid_0's auc: 0.980391	valid_0's average_precision: 0.476059
[500]	valid_0's auc: 0.980524	valid_0's average_precision: 0.48865
Early stopping, best iteration is:
[505]	valid_0's auc: 0.980555	valid_0's 