In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb
from sklearn.ensemble import IsolationForest

In [15]:
df= pd.read_csv('SAML-D.csv')

In [17]:
print("Dataset shape:", df.shape)
print(df.head())

Dataset shape: (3531894, 12)
       Time        Date  Sender_account  Receiver_account    Amount  \
0  10:35:19  2022-10-07      8724731955        2769355426   1459.15   
1  10:35:20  2022-10-07      1491989064        8401255335   6019.64   
2  10:35:20  2022-10-07       287305149        4404767002  14328.44   
3  10:35:21  2022-10-07      5376652437        9600420220  11895.00   
4  10:35:21  2022-10-07      9614186178        3803336972    115.25   

  Payment_currency Received_currency Sender_bank_location  \
0        UK pounds         UK pounds                   UK   
1        UK pounds            Dirham                   UK   
2        UK pounds         UK pounds                   UK   
3        UK pounds         UK pounds                   UK   
4        UK pounds         UK pounds                   UK   

  Receiver_bank_location  Payment_type  Is_laundering       Laundering_type  
0                     UK  Cash Deposit            0.0  Normal_Cash_Deposits  
1                    

In [34]:
labeled_df = df.dropna(subset=['Is_laundering']).copy()
unlabeled_df = df[df['Is_laundering'].isna()].copy()

In [35]:
cat_cols = ['Payment_currency', 'Received_currency',
            'Sender_bank_location', 'Receiver_bank_location', 'Payment_type']

In [38]:
for col in cat_cols:
  le = LabelEncoder()
  df[col] = le.fit_transform(df[col].astype(str))
  labeled_df[col] = le.transform(labeled_df[col].astype(str))
  if not unlabeled_df.empty:
    unlabeled_df[col] = le.transform(unlabeled_df[col].astype(str))

In [39]:
features = ['Amount', 'Payment_currency',
            'Received_currency', 'Sender_bank_location',
            'Receiver_bank_location', 'Payment_type']

In [40]:
if not labeled_df.empty:
  X = labeled_df[features].astype(float)
  y = labeled_df['Is_laundering'].astype(int)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42 )

In [42]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

In [43]:
params = { 'objective': 'binary',
          'metric': 'auc',
           'is_unbalance': True,
           'learning_rate': 0.05,
           'num_leaves': 31,
           'random_state': 42 }

In [45]:
model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=200)

[LightGBM] [Info] Number of positive: 2392, number of negative: 2469933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.306987 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 2472325, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000968 -> initscore=-6.939816
[LightGBM] [Info] Start training from score -6.939816


In [46]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

In [47]:
print("\n--- Supervised Results ---")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))



--- Supervised Results ---
Confusion Matrix:
 [[425622 632921]
 [   259    766]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.40      0.57   1058543
           1       0.00      0.75      0.00      1025

    accuracy                           0.40   1059568
   macro avg       0.50      0.57      0.29   1059568
weighted avg       1.00      0.40      0.57   1059568

ROC AUC: 0.5746972229177505


In [55]:
results = X_test.copy()
results['predicted_risk'] = y_pred_prob
results['actual_label'] = y_test.values
print("\nTop suspicious (labeled data):")
print(results.sort_values('predicted_risk', ascending=False).head(10))


Top suspicious (labeled data):
          Amount  Payment_currency  Received_currency  Sender_bank_location  \
2740552  2025.61               2.0                2.0                   8.0   
3373521  4986.81               2.0                2.0                   8.0   
1518078  5048.33               2.0                2.0                   8.0   
2861892  4633.19               2.0                2.0                   8.0   
1678937    39.77               2.0                2.0                   8.0   
503252   4680.56              12.0                2.0                   5.0   
262841   3650.18               2.0                2.0                   8.0   
2617306  1220.41               2.0                2.0                   8.0   
3374550  6501.12               2.0                2.0                   8.0   
2963728  9498.52               2.0                2.0                   8.0   

         Receiver_bank_location  Payment_type  predicted_risk  actual_label  
2740552             

In [59]:
if not unlabeled_df.empty:
  X_unlabeled = unlabeled_df[features].astype(float).fillna(0)
  iso = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
  iso.fit(X_unlabeled)
  unlabeled_df['anomaly_score'] = -iso.decision_function(X_unlabeled)
  unlabeled_df['risk_rank'] = unlabeled_df['anomaly_score'].rank(ascending=False)
  print("\n--- Unsupervised Results ---")
  print("Top suspicious (unlabeled data):")
  print(unlabeled_df[['Sender_account','Receiver_account',
                      'Amount', 'Payment_type','anomaly_score']].sort_values('anomaly_score', ascending=False).head(10))
else: print("No unlabeled data available for unsupervised learning.")


--- Unsupervised Results ---
Top suspicious (unlabeled data):
         Sender_account  Receiver_account  Amount  Payment_type  anomaly_score
3531893      6637479118        5978421817    52.0             7           -0.0


In [68]:
df[df['Sender_account']==6637479118]

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
16402,20:58:38,2022-10-07,6637479118,5978421817,465.13,2,2,8,8,6,0.0,Normal_Small_Fan_Out
95692,16:56:53,2022-10-10,6637479118,5103530344,3912.36,2,2,8,8,6,0.0,Normal_Small_Fan_Out
123838,16:58:39,2022-10-11,6637479118,8686682695,144.79,2,2,8,8,6,0.0,Normal_Small_Fan_Out
219853,21:07:16,2022-10-14,6637479118,7418710634,18557.62,2,2,8,8,4,0.0,Normal_Small_Fan_Out
259400,10:52:40,2022-10-16,6637479118,8611790147,73.76,2,2,8,8,2,0.0,Normal_Cash_Withdrawal
...,...,...,...,...,...,...,...,...,...,...,...,...
3383646,22:11:30,2023-01-30,6637479118,1442716327,1955.34,2,2,8,8,3,0.0,Normal_Small_Fan_Out
3386828,01:19:29,2023-01-31,6637479118,8186670102,1013.84,2,2,8,8,0,0.0,Normal_Small_Fan_Out
3475689,19:52:46,2023-02-02,6637479118,5377776363,3396.19,2,2,8,8,6,0.0,Normal_Small_Fan_Out
3501353,16:28:58,2023-02-03,6637479118,5978421817,562.00,2,2,8,8,6,0.0,Normal_Small_Fan_Out
