<a href="https://colab.research.google.com/github/Veewy/AML/blob/main/AML_2)_Dataset_DataLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🌹 **Anti Money Laundering**  🌹

# **Data** **Import**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import torch
import torch.nn as nn
import torch.optim as optim
import os

from torch.optim.lr_scheduler import ReduceLROnPlateau
from functools import partial
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from torch.nn.utils.rnn import pack_sequence

## Re-loading Intermediate Data

In [None]:
# reloading
# Mount Google Drive (Colab only) and read back the intermediate DataFrame that
# was stored as parquet; the cells below use its 'events' and 'targets' columns.
from google.colab import drive
drive.mount('/content/drive')
path_df_test2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/df_test2.parquet"
df_test2 = pd.read_parquet(path_df_test2)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Train / Test Split**

Downsample the negative class of the full dataset (with account (a.c.) interaction)
to deal with the imbalanced dataset.


### Train / validation / Test

In [None]:
# Split the entire dataset (df_test2) into training (80%) and evaluation+test (20%) sets.
# The fixed random_state keeps the split reproducible across runs.
X_events2, y_targets2 = df_test2['events'], df_test2['targets']  # features, targets

(
    X_events_train2,
    X_events_eval_test2,
    y_targets_train2,
    y_targets_eval_test2,
) = train_test_split(
    X_events2,
    y_targets2,
    test_size=0.2,
    random_state=42,
)

# The combined (events, targets) frames are assembled in the "Create datasets" cell below.

In [None]:
# Down-sample only the training set to handle the class imbalance.
# A row counts as "positive" when the sum of its targets is > 0; that per-row sum
# is computed once and reused for both the positive and the negative selection.
train_target_sums2 = y_targets_train2.apply(sum)

y_targets_train_pos2 = y_targets_train2[train_target_sums2 > 0]
y_targets_train_neg2 = y_targets_train2[train_target_sums2 == 0]

# Keep 10.5% of the all-negative rows; every positive row is kept.
y_targets_train_neg_sampled2 = y_targets_train_neg2.sample(frac=0.105, random_state=42)

# Merge both groups and shuffle them (sample(frac=1) returns all rows in random order).
y_targets_train_downsampled2 = pd.concat(
    [y_targets_train_neg_sampled2, y_targets_train_pos2]
).sample(frac=1, random_state=42)

# Select the matching events by index label; .loc makes the label-based lookup explicit
# and returns the events in the same (shuffled) order as the targets.
X_events_train_downsampled2 = X_events_train2.loc[y_targets_train_downsampled2.index]

In [None]:
# (number of down-sampled training rows, number of eval+test rows,
#  total of all targets in the down-sampled training rows, total of all targets in eval+test)
(
    len(y_targets_train_downsampled2),
    len(y_targets_eval_test2),
    y_targets_train_downsampled2.apply(sum).sum(),
    y_targets_eval_test2.apply(sum).sum(),
)


(77477, 171092, 15721, 4025)

In [None]:
# Total of positive targets divided by the number of down-sampled training rows.
# Computed from the data instead of numbers copied by hand from the previous
# cell's output, so it stays correct when the data or the sampling fraction changes.
float(y_targets_train_downsampled2.apply(sum).sum() / len(y_targets_train_downsampled2))

0.2029118318984989

In [None]:
# Five times the total of positive targets in the down-sampled training rows,
# derived from the data rather than a hand-copied number.
# NOTE(review): presumably a target sample count for a 1:5 ratio - confirm the intent.
int(y_targets_train_downsampled2.apply(sum).sum()) * 5

78605

In [None]:
# Halve the held-out 20% into an evaluation set and a test set
# (10% of the full data each); the fixed random_state keeps it reproducible.
(
    X_events_eval2,
    X_events_test2,
    y_targets_eval2,
    y_targets_test2,
) = train_test_split(
    X_events_eval_test2,
    y_targets_eval_test2,
    test_size=0.5,
    random_state=42,
)

# The combined (events, targets) frames are assembled in the "Create datasets" cell below.


In [None]:
# Check the size of eval and test:
# (number of eval rows, number of test rows, total of all eval targets, total of all test targets)
(
    len(y_targets_eval2),
    len(y_targets_test2),
    y_targets_eval2.apply(sum).sum(),
    y_targets_test2.apply(sum).sum(),
)


(85546, 85546, 2126, 1899)

In [None]:
# Create datasets: put each split's events and targets side by side (aligned on the index)
# so every row carries both columns.
train_dataset_downsampled2 = pd.concat(
    [X_events_train_downsampled2, y_targets_train_downsampled2],
    axis="columns",
)
# Full train dataset (no downsampling)
train_dataset2 = pd.concat(
    [X_events_train2, y_targets_train2],
    axis="columns",
)
eval_dataset2 = pd.concat(
    [X_events_eval2, y_targets_eval2],
    axis="columns",
)
test_dataset2 = pd.concat(
    [X_events_test2, y_targets_test2],
    axis="columns",
)

In [None]:
# Print the (rows, columns) shape of every split for verification
for split_label, split_frame in (
    ("Train Dataset (Downsampled):", train_dataset_downsampled2),
    ("Train Dataset (Full):", train_dataset2),
    ("Eval Dataset:", eval_dataset2),
    ("Test Dataset:", test_dataset2),
):
    print(split_label, split_frame.shape)

Train Dataset (Downsampled): (77477, 2)
Train Dataset (Full): (684368, 2)
Eval Dataset: (85546, 2)
Test Dataset: (85546, 2)


# **DataSet** / **DataLoader**

In [None]:
class AmlDataset(Dataset):
    """
    Map-style dataset that turns one row of `data` into a pair of float32 arrays.

    Every row must provide an "events" entry (an iterable of per-event records that
    can be indexed by feature name) and a "targets" entry (a sequence of labels).
    """

    def __init__(self, data, features):
        """
        @param data: pdf with an "events" and a "targets" column. Rows are fetched by
                     position (iloc), so the index labels do not have to be ordered
                     or start at 0.
        @param features: list of features to be used in an event
        """
        self.data = data
        self.features = features

    def __getitem__(self, index):
        """
        @param index: positional row number
        @return: tuple (x, y) where x has shape (number of events, number of features)
                 and y holds the row's targets, both as float32 NumPy arrays
        """
        psr_sample = self.data.iloc[index]  # row at the given position

        # One inner list per event, with the values in the order given by self.features.
        list_x = [
            [event[feature] for feature in self.features]
            for event in psr_sample["events"]
        ]

        x = np.array(list_x).astype(np.float32)
        if x.size == 0:
            # np.array([]) is 1-D; keep the 2-D (events, features) layout even when the
            # row has no events (or no features were requested) so callers always
            # receive the same rank.
            x = x.reshape(len(list_x), len(self.features))

        y = np.array(psr_sample["targets"]).astype(np.float32)
        return x, y

    def __len__(self):
        """@return: number of rows in the underlying data"""
        return len(self.data)

In [None]:
# Training dataset (negative samples down-sampled): only the event columns listed
# below are used, in this order.
aml_dataset_id_downs2 = AmlDataset(
    data=train_dataset_downsampled2,
    features=[
        'account_encoded',
        'day_of_week_encoded',
        'hour',
        'transaction_type_encoded',
        'account_interaction_encoded',
        'payment_currency_encoded',
        'received_currency_encoded',
        'sender_bank_location_encoded',
        'receiver_bank_location_encoded',
        'time_interval_normalized',
        'amount_normalized',
    ],
)

# define aml_dataloader --> for training dataset
aml_dataloader_id_downs2 = DataLoader(
    aml_dataset_id_downs2,
    batch_size=None,   # automatic batching is off: one (x, y) sample per iteration
    shuffle=False,     # samples are served in dataset order
    num_workers=0,     # load in the main process
    pin_memory=False,  # if we have GPU, set pin_memory=True
    drop_last=False,   # nothing is dropped; True is not allowed together with batch_size=None
)

aml_dataloader_id_downs2

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7fa90>

In [None]:
# eval_dataset.to_parquet("/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/eval_dataset.parquet")


In [None]:
# define aml_eval_dataloader --> for evaluation
# The feature list is taken from the training dataset (copied, not shared) instead of
# being typed out again, so training and evaluation can never use different columns.
aml_eval_dataset_id2 = AmlDataset(
    eval_dataset2,
    list(aml_dataset_id_downs2.features),
)

aml_eval_dataloader_id2 = DataLoader(
    aml_eval_dataset_id2,
    batch_size=None,  # automatic batching is off: one (x, y) sample per iteration
    shuffle=False,    # keep dataset order so evaluation is repeatable
    num_workers=0,    # load in the main process
    drop_last=False,  # nothing is dropped; True is not allowed together with batch_size=None
)

aml_eval_dataloader_id2

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7d690>

In [None]:
# test_dataset.to_parquet("/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/test_dataset.parquet")


In [None]:
# define aml_test_dataloader --> for testing dataset
# The feature list is taken from the training dataset (copied, not shared) instead of
# being typed out again, so training and testing can never use different columns.
aml_test_dataset_id2 = AmlDataset(
    test_dataset2,
    list(aml_dataset_id_downs2.features),
)

aml_test_dataloader_id2 = DataLoader(
    aml_test_dataset_id2,
    batch_size=None,  # automatic batching is off: one (x, y) sample per iteration
    shuffle=False,    # keep dataset order so testing is repeatable
    num_workers=0,    # load in the main process
    drop_last=False,  # nothing is dropped; True is not allowed together with batch_size=None
)

aml_test_dataloader_id2

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7e8c0>

# Save Dataset / DataLoader objects to Drive

In [None]:
# for training part
# NOTE(review): torch.save pickles the whole Python object, so loading these files later
# presumably needs the AmlDataset class to be defined in the loading session - confirm.
path_aml_dataset_id_downs2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataset_id_downs2.pth"
path_aml_dataloader_id_downs2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataloader_id_downs2.pth"

torch.save(aml_dataset_id_downs2, path_aml_dataset_id_downs2)
torch.save(aml_dataloader_id_downs2, path_aml_dataloader_id_downs2)

In [None]:
# for evaluating part
# NOTE(review): torch.save pickles the whole Python object, so loading these files later
# presumably needs the AmlDataset class to be defined in the loading session - confirm.
path_aml_eval_dataset_id2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataset_id2.pth"
path_aml_eval_dataloader_id2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataloader_id2.pth"

torch.save(aml_eval_dataset_id2, path_aml_eval_dataset_id2)
torch.save(aml_eval_dataloader_id2, path_aml_eval_dataloader_id2)

In [None]:
# for testing part
# NOTE(review): torch.save pickles the whole Python object, so loading these files later
# presumably needs the AmlDataset class to be defined in the loading session - confirm.
path_aml_test_dataset_id2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_test_dataset_id2.pth"
path_aml_test_dataloader_id2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_test_dataloader_id2.pth"

torch.save(aml_test_dataset_id2, path_aml_test_dataset_id2)
torch.save(aml_test_dataloader_id2, path_aml_test_dataloader_id2)