In [2]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb


In [3]:
# -------------------------------------------------
# 1. Load all your datasets
# -------------------------------------------------
# Update the file paths accordingly
file_paths = [
    '/Users/parthgajera/Documents/Thesis_Data/LI-Medium_Trans.csv',
    '/Users/parthgajera/Documents/Thesis_Data/LI-Small_Trans.csv'
]

datasets = [pd.read_csv(fp) for fp in file_paths]

In [8]:
def load_and_prepare(file_paths):
    print(f"Loading datasets: {file_paths}")
    df = dd.read_csv(file_paths)  # loads multiple files if needed

    # Feature Engineering
    df['Timestamp'] = dd.to_datetime(df['Timestamp'], errors='coerce')
    df['Hour'] = df['Timestamp'].dt.hour
    df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
    df['SameBank'] = (df['From Bank'] == df['To Bank']).astype(int)
    df['SameAccount'] = (df['Account'] == df['To Bank']).astype(int)
    df['CurrencyMismatch'] = (df['Receiving Currency'] != df['Payment Currency']).astype(int)

    df['Txn Count From Account'] = df.groupby('Account')['Timestamp'].transform('count')

    df = df.set_index('Timestamp')  # Temporarily set Timestamp as index for rolling
    df = df.map_partitions(lambda pdf: pdf.assign(
        RollingAvgFromAccount=pdf.groupby('Account')['Amount Paid'].rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)
    ))
    df = df.reset_index()

    return df

In [9]:
train_df = load_and_prepare(train_files).compute()
test_df = load_and_prepare(test_files).compute()

Loading datasets: ['/Users/parthgajera/Documents/Thesis_Data/LI-Medium_Trans.csv']


ArrowNotImplementedError: Function 'equal' has no kernel matching input types (large_string, double)

In [4]:
# -------------------------------------------------
# 2. Define Feature Engineering
# -------------------------------------------------
def feature_engineering(df):
    df = df.copy()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['Hour'] = df['Timestamp'].dt.hour
    df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
    df['SameBank'] = (df['From Bank'] == df['To Bank']).astype(int)
    df['SameAccount'] = (df['Account'] == df['To Bank']).astype(int)
    df['CurrencyMismatch'] = (df['Receiving Currency'] != df['Payment Currency']).astype(int)
    df['Txn Count From Account'] = df.groupby('Account')['Timestamp'].transform('count')
    df['Rolling Avg From Account'] = df.sort_values(by='Timestamp') \
        .groupby('Account')['Amount Paid'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
    return df

In [5]:
# -------------------------------------------------
# 3. Feature lists
# -------------------------------------------------
features = [
    'Amount Received', 'Amount Paid',
    'SameBank', 'SameAccount', 'CurrencyMismatch',
    'Txn Count From Account', 'Rolling Avg From Account',
    'Payment Format'
]

hybrid_features = [
    'Amount Received', 'Amount Paid',
    'SameBank', 'SameAccount', 'CurrencyMismatch',
    'Txn Count From Account', 'Rolling Avg From Account',
    'anomaly_score'
]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Payment Format']),
    ],
    remainder='passthrough'
)

In [6]:
# -------------------------------------------------
# 4. SET PARAMETERS
# -------------------------------------------------
train_indices = [0]   # which datasets to train on
test_indices = [1] # which datasets to test on

In [7]:
# -------------------------------------------------
# 5. Training
# -------------------------------------------------
# Prepare training data
train_dfs = [feature_engineering(datasets[idx]) for idx in train_indices]
train_df = pd.concat(train_dfs, ignore_index=True)

X_train_iso = train_df[features]

# Isolation Forest pipeline
iso_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('iso_forest', IsolationForest(contamination='auto', random_state=42))
])

iso_pipeline.fit(X_train_iso)

# Add anomaly scores and labels
X_train_iso_transformed = iso_pipeline.named_steps['preprocess'].transform(X_train_iso)
train_df['anomaly_score'] = iso_pipeline.named_steps['iso_forest'].decision_function(X_train_iso_transformed)
train_df['anomaly_label'] = iso_pipeline.named_steps['iso_forest'].predict(X_train_iso_transformed)
train_df['anomaly_label'] = train_df['anomaly_label'].map({1: 'Normal', -1: 'Anomaly'})
train_df['hybrid_label'] = train_df['anomaly_label'].map({'Anomaly': 1, 'Normal': 0})

# Train XGBoost on the hybrid labels
X_hybrid_train = train_df[hybrid_features]
y_hybrid_train = train_df['hybrid_label']

xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_hybrid_train, y_hybrid_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
# -------------------------------------------------
# 6. Testing
# -------------------------------------------------
for idx in test_indices:
    test_df = feature_engineering(datasets[idx])
    
    X_test_iso = test_df[features]
    X_test_iso_transformed = iso_pipeline.named_steps['preprocess'].transform(X_test_iso)
    
    # Anomaly scores
    test_df['anomaly_score'] = iso_pipeline.named_steps['iso_forest'].decision_function(X_test_iso_transformed)
    
    # Prepare hybrid features
    X_hybrid_test = test_df[hybrid_features]
    
    # Predict with XGBoost
    y_pred = xgb_model.predict(X_hybrid_test)
    
    # If you have true labels, adjust here
    if 'Is Laundering' in test_df.columns:
        #y_true = test_df['Is Laundering'].map({'Yes': 1, 'No': 0})  # adapt if labels are different
        y_true = test_df['Is Laundering'].map({'Yes': 1, 'No': 0}).fillna(0)
        print(f"\n--- Results for Dataset {idx+1} ---")
        print(classification_report(y_true, y_pred, digits=4))
    else:
        print(f"\nDataset {idx+1}: Predictions only (no true labels provided)")



--- Results for Dataset 2 ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0     1.0000    0.9102    0.9530   6924049
         1.0     0.0000    0.0000    0.0000         0

    accuracy                         0.9102   6924049
   macro avg     0.5000    0.4551    0.4765   6924049
weighted avg     1.0000    0.9102    0.9530   6924049



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
print(test_df['Is Laundering'].value_counts())


Is Laundering
0    6920484
1       3565
Name: count, dtype: int64
