In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
import xgboost as xgb
from sklearn.metrics import classification_report, average_precision_score, precision_recall_curve, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %% Load datasets
file_paths = [
    '/Users/parthgajera/Documents/Thesis_Data/LI-Medium_Trans.csv',
    '/Users/parthgajera/Documents/Thesis_Data/LI-Small_Trans.csv'
]

datasets = [pd.read_csv(fp) for fp in file_paths]

# Rename column 'Is Laundering' to 'is_laundering'
datasets[0].rename(columns={'Is Laundering': 'is_laundering'}, inplace=True)
datasets[1].rename(columns={'Is Laundering': 'is_laundering'}, inplace=True)


# %% Feature Engineering Function
def feature_engineering(df):
    df = df.copy()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['Hour'] = df['Timestamp'].dt.hour
    df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
    df['SameBank'] = (df['From Bank'] == df['To Bank']).astype(int)
    df['SameAccount'] = (df['Account'] == df['To Bank']).astype(int)
    df['CurrencyMismatch'] = (df['Receiving Currency'] != df['Payment Currency']).astype(int)
    df['Txn Count From Account'] = df.groupby('Account')['Timestamp'].transform('count')
    df['Rolling Avg From Account'] = df.sort_values(by='Timestamp') \
        .groupby('Account')['Amount Paid'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
    
    # Ensure 'is_laundering' is retained in the dataset
    if 'is_laundering' in df.columns:
        return df
    else:
        raise KeyError("Column 'is_laundering' is missing from the input data.")

# %% Features for Isolation Forest and Hybrid Model
iso_features = [
    'Amount Received', 'Amount Paid',
    'SameBank', 'SameAccount', 'CurrencyMismatch',
    'Txn Count From Account', 'Rolling Avg From Account',
    'Payment Format'
]

hybrid_features = [
    'Amount Received', 'Amount Paid',
    'SameBank', 'SameAccount', 'CurrencyMismatch',
    'Txn Count From Account', 'Rolling Avg From Account',
    'anomaly_score'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Payment Format']),
    ],
    remainder='passthrough'
)

In [3]:
# testing the datasets if they have a same data or not

df1_raw = datasets[0].copy()
df2_raw = datasets[1].copy()

df1_raw = df1_raw.reindex(sorted(df1_raw.columns), axis=1)
df2_raw = df2_raw.reindex(sorted(df2_raw.columns), axis=1)

common_rows = pd.merge(df1_raw.drop_duplicates(), df2_raw.drop_duplicates(), how='inner')

print(f"Number of identical rows in both datasets: {len(common_rows)}")

Number of identical rows in both datasets: 0


In [3]:
# Check the column names in the dataset
print(datasets[0].columns)


Index(['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format', 'is_laundering'],
      dtype='object')


In [4]:

# %% Training Isolation Forest
train_df = feature_engineering(datasets[0])

# Prepare features for Isolation Forest (excluding 'is_laundering')
X_train_iso = train_df[iso_features]

# Isolation Forest Pipeline
iso_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('iso_forest', IsolationForest(
        n_estimators=100,
        max_samples=50000,
        contamination=0.02,
        max_features=1.0,
        random_state=42,
        n_jobs=-1
    ))
])

iso_pipeline.fit(X_train_iso)

# Add anomaly scores and labels
X_train_iso_transformed = iso_pipeline.named_steps['preprocess'].transform(X_train_iso)
train_df['anomaly_score'] = iso_pipeline.named_steps['iso_forest'].decision_function(X_train_iso_transformed)
train_df['anomaly_label'] = iso_pipeline.named_steps['iso_forest'].predict(X_train_iso_transformed)
train_df['anomaly_label'] = train_df['anomaly_label'].map({1: 'Normal', -1: 'Anomaly'})
train_df['hybrid_label'] = train_df['anomaly_label'].map({'Anomaly': 1, 'Normal': 0})







In [None]:
# Preparing Hybrid Data

X_hybrid = train_df[hybrid_features]
y_hybrid = train_df['is_laundering']

In [None]:
# Prepare data for training the hybrid modeL
X_hybrid = train_df[hybrid_features]
y_hybrid = train_df['is_laundering']

# Downsampling based on 'is_laundering'
minority_df = train_df[train_df['is_laundering'] != 0]
majority_df = train_df[train_df['is_laundering'] == 0].sample(frac=0.01, random_state=42)

hybrid_sample_df = pd.concat([minority_df, majority_df])

X_hybrid_sample = hybrid_sample_df[hybrid_features]
y_hybrid_sample = hybrid_sample_df['is_laundering']

print(f"After downsampling:\n{y_hybrid_sample.value_counts()}")

After downsampling:
is_laundering
0    312354
1     16041
Name: count, dtype: int64


In [None]:
# SMOTETomek for resampling
smote_tomek = SMOTETomek(random_state=42, sampling_strategy=0.5)
X_resampled, y_resampled = smote_tomek.fit_resample(X_hybrid_sample, y_hybrid_sample)

print(f"After SMOTETomek resampling:\n{y_resampled.value_counts()}")

After SMOTETomek resampling:
is_laundering
0    298428
1    142251
Name: count, dtype: int64


In [None]:
# Alternative Skip SMOTETomek
X_resampled = X_hybrid_sample
y_resampled = y_hybrid_sample

In [23]:
print(f"After SMOTETomek resampling:\n{y_resampled.value_counts()}")

After SMOTETomek resampling:
is_laundering
0    312354
1     16041
Name: count, dtype: int64


In [None]:
# Split for Optuna optimization
X_train, X_val, y_train, y_val = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Bayesian Optimization with Optuna
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'n_jobs': -1,
        'random_state': 42,
        'tree_method': 'hist',
        'verbosity': 0
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return f1_score(y_val, preds, average='macro')

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Train final model with best parameters
best_params = study.best_params
best_params.update({
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'n_jobs': -1,
    'random_state': 42,
    'tree_method': 'hist',
    'verbosity': 0
})

model = xgb.XGBClassifier(**best_params)
model.fit(X_resampled, y_resampled)

print("Best Parameters from Optuna:\n", best_params)

[I 2025-05-03 23:26:23,174] A new study created in memory with name: no-name-f5ba6aa8-fd29-4398-b015-85e84a08b9ef
[I 2025-05-03 23:26:29,001] Trial 0 finished with value: 0.9359357959941541 and parameters: {'n_estimators': 771, 'max_depth': 10, 'learning_rate': 0.1893095787640366, 'subsample': 0.6474281827612054, 'colsample_bytree': 0.9683321496609003, 'gamma': 0.48365372733997714, 'reg_alpha': 2.9005544746535907, 'reg_lambda': 4.409543606438956}. Best is trial 0 with value: 0.9359357959941541.
[I 2025-05-03 23:26:29,827] Trial 1 finished with value: 0.9065506404411663 and parameters: {'n_estimators': 166, 'max_depth': 7, 'learning_rate': 0.15382540159123181, 'subsample': 0.9220893994251356, 'colsample_bytree': 0.874695162476864, 'gamma': 1.785153487959053, 'reg_alpha': 3.6746727507112595, 'reg_lambda': 6.7821199323895245}. Best is trial 0 with value: 0.9359357959941541.
[I 2025-05-03 23:26:31,450] Trial 2 finished with value: 0.9002611642169752 and parameters: {'n_estimators': 789, 'm

Best Parameters from Optuna:
 {'n_estimators': 771, 'max_depth': 10, 'learning_rate': 0.1893095787640366, 'subsample': 0.6474281827612054, 'colsample_bytree': 0.9683321496609003, 'gamma': 0.48365372733997714, 'reg_alpha': 2.9005544746535907, 'reg_lambda': 4.409543606438956, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'n_jobs': -1, 'random_state': 42, 'tree_method': 'hist', 'verbosity': 0}


In [None]:
# Prepare Test Set
test_df = feature_engineering(datasets[1])
X_test_iso = test_df[iso_features]
X_test_iso_transformed = iso_pipeline.named_steps['preprocess'].transform(X_test_iso)

test_df['anomaly_score'] = iso_pipeline.named_steps['iso_forest'].decision_function(X_test_iso_transformed)
test_df['anomaly_label'] = iso_pipeline.named_steps['iso_forest'].predict(X_test_iso_transformed)
test_df['anomaly_label'] = test_df['anomaly_label'].map({1: 'Normal', -1: 'Anomaly'})
test_df['hybrid_label'] = test_df['anomaly_label'].map({'Anomaly': 1, 'Normal': 0})

X_test = test_df[hybrid_features]
y_test = test_df['is_laundering']

# Predict
y_pred = model.predict(X_test)

# Report
print("Classification Report on Dataset[1]:")
print(classification_report(y_test, y_pred, digits=4))

Classification Report on Dataset[1]:
              precision    recall  f1-score   support

           0     0.9998    0.9139    0.9549   6920484
           1     0.0040    0.6668    0.0079      3565

    accuracy                         0.9138   6924049
   macro avg     0.5019    0.7903    0.4814   6924049
weighted avg     0.9993    0.9138    0.9544   6924049

