<a href="https://colab.research.google.com/github/Veewy/AML/blob/main/AML_2)_Dataset_DataLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🌹 **Anti Money Laundering**  🌹

# **Data** **Import**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import torch
import torch.nn as nn
import torch.optim as optim
import os

from torch.optim.lr_scheduler import ReduceLROnPlateau
from functools import partial
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from torch.nn.utils.rnn import pack_sequence

## Re-loading Intermediate Data

In [None]:
# Reload the intermediate DataFrame from Google Drive (Colab only).
# The frame is expected to expose an "events" and a "targets" column, which the cells below read.
from google.colab import drive
drive.mount('/content/drive')
path_df_test2 = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/df_test2.parquet"
df_test2 = pd.read_parquet(path_df_test2)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Train / Test Split**

## a) Sampled Dataset

Split 100,000 sampled data

In [None]:
# Draw a reproducible 100,000-row subset to use as a small working sample.
df_test2_sampled = df_test2.sample(n=100000, random_state=42)

# Display only: sort_values returns a new frame, so df_test2_sampled keeps its sampled order.
df_test2_sampled.sort_values(by='account')

Unnamed: 0_level_0,events,targets
account,Unnamed: 1_level_1,Unnamed: 2_level_1
61549,"[{'account': 61549.0, 'account_encoded': 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
155434,"[{'account': 155434.0, 'account_encoded': 0.0,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
201484,"[{'account': 201484.0, 'account_encoded': 0.0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
251764,"[{'account': 251764.0, 'account_encoded': 0.0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
253618,"[{'account': 253618.0, 'account_encoded': 0.0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
9999529026,"[{'account': 9999529026.0, 'account_encoded': ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9999665726,"[{'account': 9999665726.0, 'account_encoded': ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9999758711,"[{'account': 9999758711.0, 'account_encoded': ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9999803740,"[{'account': 9999803740.0, 'account_encoded': ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [None]:
# Peek at the raw "events" value of the first sampled row (a sequence of per-transaction dicts).
print(df_test2_sampled['events'].iloc[0])

[{'account': 2360440967.0, 'account_encoded': 0.0, 'account_interaction': 5533441702.0, 'account_interaction_encoded': 23976.0, 'amount_normalized': 0.25471404818530874, 'day_of_week_encoded': 5.0, 'hour': 14.0, 'payment_currency_encoded': 10.0, 'received_currency_encoded': 10.0, 'receiver_bank_location_encoded': 16.0, 'sender_bank_location_encoded': 16.0, 'time_interval_normalized': -0.054420741832058306, 'transaction_type_encoded': 1.0}
 {'account': 2360440967.0, 'account_encoded': 0.0, 'account_interaction': 5533441702.0, 'account_interaction_encoded': 23976.0, 'amount_normalized': -0.247170849481796, 'day_of_week_encoded': 3.0, 'hour': 1.0, 'payment_currency_encoded': 10.0, 'received_currency_encoded': 10.0, 'receiver_bank_location_encoded': 16.0, 'sender_bank_location_encoded': 16.0, 'time_interval_normalized': 9.565080879378325, 'transaction_type_encoded': 1.0}
 {'account': 2360440967.0, 'account_encoded': 0.0, 'account_interaction': 5533441702.0, 'account_interaction_encoded': 2

##  🌹 df_test2_sampled : Save to Parquet 🌹
(so the sampled data can be reloaded quickly when re-running the notebook)

In [None]:
# Save the sampled frame (df_test2_sampled) to Parquet on Google Drive so later runs can skip the sampling step.
path_df_test2_sampled = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/df_test2_sampled.parquet"
df_test2_sampled.to_parquet(path_df_test2_sampled)

In [None]:
# ---> Start here to quickly reload the sampled dataset.
# Reads back the Parquet file written by the save cell above (same path), replacing df_test2_sampled.
path_df_test2_sampled = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/df_test2_sampled.parquet"
df_test2_sampled = pd.read_parquet(path_df_test2_sampled)

Train / Test Split_Sampled Dataset

In [None]:
# Hold out 20% of the sampled rows for evaluation; the remaining 80% are used for training.
X_events = df_test2_sampled['events']    # features: per-row event sequences
y_targets = df_test2_sampled['targets']  # labels: per-row target sequences

X_events_train, X_events_test, y_targets_train, y_targets_test = train_test_split(
    X_events,
    y_targets,
    test_size=0.2,
    random_state=42,
)

# Re-join features and labels column-wise into one frame per split.
train_sampled_dataset = pd.concat([X_events_train, y_targets_train], axis="columns")
test_sampled_dataset = pd.concat([X_events_test, y_targets_test], axis="columns")

In [None]:
# (train rows, test rows, sum of all targets in train, sum of all targets in test)
len(y_targets_train), len(y_targets_test), y_targets_train.apply(sum).sum(), y_targets_test.apply(sum).sum()
# The last two values add up every per-event label, so they count positive events, not positive rows.
# NOTE(review): the earlier remark "~2.2% of all samples (users) are positive" does not follow directly
# from these numbers (1934 / 80000 is about 2.4% positive events per row) - confirm the intended statistic.

(80000, 20000, 1934, 435)

In [None]:
# Scratch check: the two target sums reported above (train + test) added together.
1934+435

2369

In [None]:
# Scratch check of the positive share.
# NOTE(review): the previous cell totals 2369 and the sample holds 100,000 rows, so this was
# probably meant to be 2369/100000 (about 0.024) rather than 2368/10000 - confirm before quoting 0.2368.
2368/10000

0.2368

## b) Downsampling Negative Sampled Dataset  (only in Training Part)

(Addressing Data Imbalance):

In [None]:
# Create a new sampled dataset with the negative class downsampled (training part only).
# A row is treated as positive when the sum of its targets is > 0 and as negative when it is 0.
# Both random draws are seeded (random_state=42, like the scenario f downsampling further down) so that
# re-running the notebook rebuilds exactly the same training set.

y_targets_train_pos = y_targets_train[y_targets_train.apply(sum) > 0]
y_targets_train_neg = y_targets_train[y_targets_train.apply(sum) == 0]
# Keep 11.2% of the negative rows.
y_targets_train_neg_sampled = y_targets_train_neg.sample(frac=0.112, random_state=42)
# Combine with all positive rows and shuffle the result.
y_targets_train_downsampled = pd.concat([y_targets_train_neg_sampled, y_targets_train_pos]).sample(frac=1, random_state=42)
# Pick the matching event sequences explicitly by index label.
X_events_train_downsampled = X_events_train.loc[y_targets_train_downsampled.index]

# create dataset
train_sampled_dataset_downsampled = pd.concat([X_events_train_downsampled, y_targets_train_downsampled], axis=1)

In [None]:
# (downsampled train rows, test rows, sum of all targets in downsampled train, sum of all targets in test)
len(y_targets_train_downsampled), len(y_targets_test), y_targets_train_downsampled.apply(sum).sum(), y_targets_test.apply(sum).sum()

(9638, 20000, 1934, 435)

In [None]:
# Scratch check: five times the train target sum reported above.
# NOTE(review): presumably a reference size for comparing with the downsampled row count - confirm.
1934*5

9670

## c) Sampled data with negative-class downsampling (with account / account-interaction features)

In [None]:
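# Scenario c reuses the splits created above:
#   - training: the downsampled training frame from scenario b
#   - testing:  the test frame from scenario a
# The difference is made in the Dataset/DataLoader step, which additionally selects the
# account (a/c) and account-interaction features.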

## d/e) Full Dataset (both scenarios of w.o & with a.c. interaction)

In [None]:
# Scenarios d/e: split the whole of df_test2 into 90% training and 10% evaluation rows.
X_events1 = df_test2['events']    # features
y_targets1 = df_test2['targets']  # labels

X_events_train1, X_events_test1, y_targets_train1, y_targets_test1 = train_test_split(
    X_events1,
    y_targets1,
    test_size=0.1,
    random_state=42,
)

# Re-join features and labels column-wise into one frame per split.
train_dataset = pd.concat([X_events_train1, y_targets_train1], axis="columns")
test_dataset = pd.concat([X_events_test1, y_targets_test1], axis="columns")

In [None]:
# (train rows, test rows, sum of all targets in train, sum of all targets in test)
len(y_targets_train1), len(y_targets_test1), y_targets_train1.apply(sum).sum(), y_targets_test1.apply(sum).sum()
# The target sums add up per-event labels; relative to the row counts they come to roughly 2.3%.
# NOTE(review): this is positive events per row, not necessarily the share of positive rows (users) - confirm.

(769914, 85546, 17845, 1901)

In [None]:
# Scratch check: train + test target sums of the full split.
17845+1901

19746

In [None]:
# Scratch check: train + test row counts of the full split.
769914+85546

855460

## f) Full Dataset with Downsampled Negative Class (with account interaction)
Downsampling the negative class to deal with the imbalanced dataset.


### Train / validation / Test

In [None]:
# Scenario f, step 1: split the whole of df_test2 into 80% training rows and a 20% pool
# that a later cell divides again into evaluation and test sets.
X_events2 = df_test2['events']    # features
y_targets2 = df_test2['targets']  # labels

X_events_train2, X_events_eval_test2, y_targets_train2, y_targets_eval_test2 = train_test_split(
    X_events2,
    y_targets2,
    test_size=0.2,
    random_state=42,
)

# The combined frames for this scenario are assembled later in the "# Create datasets" cell.

In [None]:
# Rebalance the training split only: keep every row whose target sum is > 0 plus a seeded
# 10.5% draw of the rows whose target sum is 0, then shuffle the combined labels (also seeded).
y_targets_train_pos2 = y_targets_train2.loc[y_targets_train2.apply(sum).gt(0)]
y_targets_train_neg2 = y_targets_train2.loc[y_targets_train2.apply(sum).eq(0)]
y_targets_train_neg_sampled2 = y_targets_train_neg2.sample(frac=0.105, random_state=42)
y_targets_train_downsampled2 = pd.concat(
    [y_targets_train_neg_sampled2, y_targets_train_pos2]
).sample(frac=1, random_state=42)

# Select the matching event sequences by index label.
X_events_train_downsampled2 = X_events_train2.loc[y_targets_train_downsampled2.index]

# The combined frame is assembled later in the "# Create datasets" cell.

In [None]:
# (downsampled train rows, eval+test pool rows, sum of all targets in downsampled train, sum of all targets in the pool)
len(y_targets_train_downsampled2), len(y_targets_eval_test2), y_targets_train_downsampled2.apply(sum).sum(), y_targets_eval_test2.apply(sum).sum()


(77477, 171092, 15721, 4025)

In [None]:
# Scratch check: target sum of the downsampled training set divided by its row count.
# NOTE(review): this is positive events per row, which may differ from the share of positive rows - confirm.
15721/77477

0.2029118318984989

In [None]:
# Scratch check: five times the downsampled-train target sum reported above.
15721*5

78605

In [None]:
# Scenario f, step 2: halve the 20% hold-out pool into an evaluation set and a test set
# (each about 10% of the full data).
X_events_eval2, X_events_test2, y_targets_eval2, y_targets_test2 = train_test_split(
    X_events_eval_test2,
    y_targets_eval_test2,
    test_size=0.5,
    random_state=42,
)

# The combined frames are assembled later in the "# Create datasets" cell.


In [None]:
# Check the size of the eval and test splits:
# (eval rows, test rows, sum of all targets in eval, sum of all targets in test)
len(y_targets_eval2), len(y_targets_test2), y_targets_eval2.apply(sum).sum(), y_targets_test2.apply(sum).sum()


(85546, 85546, 2126, 1899)

In [None]:
# Create datasets: pair each scenario-f feature Series with its label Series, column-wise.
train_dataset_downsampled2 = pd.concat(
    [X_events_train_downsampled2, y_targets_train_downsampled2], axis="columns"
)
# Full train dataset (no downsampling)
train_dataset2 = pd.concat([X_events_train2, y_targets_train2], axis="columns")
eval_dataset2 = pd.concat([X_events_eval2, y_targets_eval2], axis="columns")
test_dataset2 = pd.concat([X_events_test2, y_targets_test2], axis="columns")

In [None]:
# Report the (rows, columns) shape of each scenario-f frame as a sanity check.
for _label, _frame in (
    ("Train Dataset (Downsampled):", train_dataset_downsampled2),
    ("Train Dataset (Full):", train_dataset2),
    ("Eval Dataset:", eval_dataset2),
    ("Test Dataset:", test_dataset2),
):
    print(_label, _frame.shape)

Train Dataset (Downsampled): (77477, 2)
Train Dataset (Full): (684368, 2)
Eval Dataset: (85546, 2)
Test Dataset: (85546, 2)


# **DataSet** / **DataLoader**

In [None]:
class AmlDataset(Dataset):
    """Map-style dataset that yields one row's event history and labels per item.

    Every row of ``data`` must provide an ``"events"`` entry (an iterable of
    per-event mappings indexed by feature name) and a ``"targets"`` entry
    (the labels belonging to that row).
    """

    def __init__(self, data, features):
        """
        @param data: pandas DataFrame with "events" and "targets" columns. Rows are
                     read by position (``iloc``), so the index labels do not matter.
        @param features: list of features to be used in an event; their order
                         defines the column order of the returned feature array
        """
        self.data = data
        self.features = features

    def __getitem__(self, index):
        """Return ``(x, y)`` for the row at integer position ``index``.

        x is a float32 array of shape (num_events, num_features) and y is a
        float32 array built from the row's "targets". A row without events
        yields an x of shape (0, num_features) instead of a 1-D empty array,
        so the feature axis is always present.

        A feature missing from an event propagates the lookup error raised by
        that event (``KeyError`` for dict events).
        """
        psr_sample = self.data.iloc[index]  # positional row lookup

        # One inner list per event, holding the selected features in order.
        list_x = [
            [event[feature] for feature in self.features]
            for event in psr_sample["events"]
        ]

        if list_x:
            x = np.array(list_x, dtype=np.float32)
        else:
            # np.array([]) would be 1-D; keep the (events, features) layout explicit.
            x = np.empty((0, len(self.features)), dtype=np.float32)

        y = np.array(psr_sample["targets"], dtype=np.float32)
        return x, y

    def __len__(self):
        """Number of rows (one item per row of ``data``)."""
        return len(self.data)

### a) Sampled Dataset (Training Part)

In [None]:
# Scenario a training dataset: nine transaction features, no account identifiers.
aml_dataset_sampled = AmlDataset(
    data=train_sampled_dataset,
    features=[
        'day_of_week_encoded', 'hour', 'transaction_type_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Fetch the first sample through the indexing protocol and show it together with the dataset size.
X, Y = aml_dataset_sampled[0]
X, Y, len(aml_dataset_sampled)

(array([[ 5.0000000e+00,  3.0000000e+00,  1.0000000e+00,  1.0000000e+01,
          1.0000000e+01,  1.6000000e+01,  1.6000000e+01, -5.4420743e-02,
          1.8184268e+00],
        [ 5.0000000e+00,  3.0000000e+00,  1.0000000e+00,  1.0000000e+01,
          1.0000000e+01,  1.6000000e+01,  1.6000000e+01, -5.2621409e-02,
          1.8198484e+00],
        [ 5.0000000e+00,  9.0000000e+00,  1.0000000e+00,  1.0000000e+01,
          1.0000000e+01,  1.6000000e+01,  1.6000000e+01,  7.9164580e-02,
          1.8397599e+00],
        [ 5.0000000e+00,  9.0000000e+00,  1.0000000e+00,  1.0000000e+01,
          1.0000000e+01,  1.6000000e+01,  1.6000000e+01, -5.2868970e-02,
          1.8501354e+00],
        [ 5.0000000e+00,  1.0000000e+01,  1.0000000e+00,  1.0000000e+01,
          1.0000000e+01,  1.6000000e+01,  1.6000000e+01, -2.9024798e-02,
          1.8310381e+00],
        [ 5.0000000e+00,  1.1000000e+01,  1.0000000e+00,  1.0000000e+01,
          1.0000000e+01,  1.6000000e+01,  1.6000000e+01, -3.2381941

In [None]:
# Training loader for scenario a.
# batch_size=None switches off automatic batching, so each iteration yields a single dataset item.
aml_dataloader_sampled = DataLoader(
    dataset=aml_dataset_sampled,
    batch_size=None,
    shuffle=True,       # visit the items in a new random order on every pass
    num_workers=0,      # load in the main process
    pin_memory=False,   # if we have GPU, set pin_memory=True
    drop_last=False,    # keep every sample
)

aml_dataloader_sampled

<torch.utils.data.dataloader.DataLoader at 0x7c72c681f3d0>

### b) Downsampled Negative Class (Training Part)
(Addressing Data Imbalance)

In [None]:
# Scenario b training dataset: same nine features as scenario a, built on the downsampled training frame.
aml_dataset_sampled_downsampled = AmlDataset(
    data=train_sampled_dataset_downsampled,
    features=[
        'day_of_week_encoded', 'hour', 'transaction_type_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Training loader; batch_size=None switches off automatic batching (one dataset item per iteration).
aml_dataloader_sampled_downsampled = DataLoader(
    dataset=aml_dataset_sampled_downsampled,
    batch_size=None,
    shuffle=True,       # new random order on every pass
    num_workers=0,      # load in the main process
    pin_memory=False,   # if we have GPU, set pin_memory=True
    drop_last=False,    # keep every sample
)

aml_dataloader_sampled_downsampled

<torch.utils.data.dataloader.DataLoader at 0x7c72c681efe0>

### a+b) Both Sampled / Downsampling (Evaluation Part)

In [None]:
# Shared evaluation dataset for scenarios a and b: the untouched sampled test frame, nine features.
aml_eval_dataset_sampled = AmlDataset(
    data=test_sampled_dataset,
    features=[
        'day_of_week_encoded', 'hour', 'transaction_type_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Evaluation loader: fixed order (no shuffling), one dataset item per iteration.
aml_eval_dataloader_sampled = DataLoader(
    dataset=aml_eval_dataset_sampled,
    batch_size=None,
    shuffle=False,
    num_workers=0,
    drop_last=False,
    #collate_fn=lambda batch: custom_collate_fn(batch, max_history_length=50)  # Use the custom collate function
)

aml_eval_dataloader_sampled

<torch.utils.data.dataloader.DataLoader at 0x7c72c681e9b0>

### c) Sampled Dataset (with a.c.interaction) (downs.)

In [None]:
# Scenario c training dataset: the downsampled training frame from scenario b, with the
# account and account-interaction columns added to the feature list.
aml_dataset_sampled_id = AmlDataset(
    data=train_sampled_dataset_downsampled,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour',
        'transaction_type_encoded', 'account_interaction_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Training loader; batch_size=None switches off automatic batching (one dataset item per iteration).
aml_dataloader_sampled_id = DataLoader(
    dataset=aml_dataset_sampled_id,
    batch_size=None,
    shuffle=True,       # new random order on every pass
    num_workers=0,      # load in the main process
    pin_memory=False,   # if we have GPU, set pin_memory=True
    drop_last=False,    # keep every sample
)

aml_dataloader_sampled_id

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7ff10>

In [None]:
# Scenario c evaluation dataset: the sampled test frame created in scenario a (never downsampled),
# with the same eleven features as the scenario c training dataset.
aml_eval_dataset_sampled_id = AmlDataset(
    data=test_sampled_dataset,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour',
        'transaction_type_encoded', 'account_interaction_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Evaluation loader: fixed order (no shuffling), one dataset item per iteration.
aml_eval_dataloader_sampled_id = DataLoader(
    dataset=aml_eval_dataset_sampled_id,
    batch_size=None,
    shuffle=False,
    num_workers=0,
    drop_last=False,
    #collate_fn=lambda batch: custom_collate_fn(batch, max_history_length=50)  # Use the custom collate function
)

aml_eval_dataloader_sampled_id

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7d360>

### d) Full Dataset (w/o a.c.interaction)

In [None]:
# Scenario d training dataset: full 90% training frame; includes 'account_encoded' but
# leaves out 'account_interaction_encoded'.
aml_dataset_noid = AmlDataset(
    data=train_dataset,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour', 'transaction_type_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Training loader; batch_size=None switches off automatic batching (one dataset item per iteration).
aml_dataloader_noid = DataLoader(
    dataset=aml_dataset_noid,
    batch_size=None,
    shuffle=True,       # new random order on every pass
    num_workers=0,      # load in the main process
    pin_memory=False,   # if we have GPU, set pin_memory=True
    drop_last=False,    # keep every sample
)

aml_dataloader_noid

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7d300>

In [None]:
# Scenario d evaluation dataset: full 10% hold-out frame, same ten features as its training dataset.
aml_eval_dataset_noid = AmlDataset(
    data=test_dataset,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour', 'transaction_type_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Evaluation loader: fixed order (no shuffling), one dataset item per iteration.
aml_eval_dataloader_noid = DataLoader(
    dataset=aml_eval_dataset_noid,
    batch_size=None,
    shuffle=False,
    num_workers=0,
    drop_last=False,
    #collate_fn=lambda batch: custom_collate_fn(batch, max_history_length=50)  # Use the custom collate function
)

aml_eval_dataloader_noid

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7dd20>

### e) Full Dataset (with a.c.interaction)

In [None]:
# Scenario e training dataset: full 90% training frame with account and account-interaction features.
aml_dataset_id = AmlDataset(
    data=train_dataset,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour',
        'transaction_type_encoded', 'account_interaction_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Training loader; batch_size=None switches off automatic batching (one dataset item per iteration).
aml_dataloader_id = DataLoader(
    dataset=aml_dataset_id,
    batch_size=None,
    shuffle=True,       # new random order on every pass
    num_workers=0,      # load in the main process
    pin_memory=False,   # if we have GPU, set pin_memory=True
    drop_last=False,    # keep every sample
)

aml_dataloader_id

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7fc70>

In [None]:
# Scenario e evaluation dataset: full 10% hold-out frame, same eleven features as its training dataset.
aml_eval_dataset_id = AmlDataset(
    data=test_dataset,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour',
        'transaction_type_encoded', 'account_interaction_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Evaluation loader: fixed order (no shuffling), one dataset item per iteration.
aml_eval_dataloader_id = DataLoader(
    dataset=aml_eval_dataset_id,
    batch_size=None,
    shuffle=False,
    num_workers=0,
    drop_last=False,
    #collate_fn=lambda batch: custom_collate_fn(batch, max_history_length=50)  # Use the custom collate function
)

aml_eval_dataloader_id

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7d270>

## f) Full Dataset with Downsampled Negative Class (with account interaction)
Downsampling the negative class to deal with the imbalanced dataset.

In [None]:
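# NOTE: superseded. The scenario-f frames are built earlier, in the "# Create datasets" cell, under
# the names train_dataset_downsampled2, train_dataset2, eval_dataset2 and test_dataset2.
# The lines below use the older names without the "2" suffix and are kept for reference only.
# train_dataset_downsampled = pd.concat([X_events_train_downsampled, y_targets_train_downsampled], axis=1)
# train_dataset = pd.concat([X_events_train, y_targets_train], axis=1)  # Full train dataset (no downsampling)
# eval_dataset = pd.concat([X_events_eval, y_targets_eval], axis=1)
# test_dataset = pd.concat([X_events_test, y_targets_test], axis=1)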

In [None]:
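# NOTE: disabled. If re-enabled, the scenario-f frame built in this notebook is named train_dataset_downsampled2.
# train_dataset_downsampled.to_parquet("/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/train_dataset_downsampled.parquet")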


In [None]:
# Scenario f training dataset: downsampled 80% training frame with account and account-interaction features.
aml_dataset_id_downs2 = AmlDataset(
    data=train_dataset_downsampled2,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour',
        'transaction_type_encoded', 'account_interaction_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Training loader; batch_size=None switches off automatic batching (one dataset item per iteration).
# NOTE(review): unlike the other training loaders in this notebook, this one uses shuffle=False, so
# every pass visits the rows in the same (pre-shuffled) order - confirm that this is intended.
aml_dataloader_id_downs2 = DataLoader(
    dataset=aml_dataset_id_downs2,
    batch_size=None,
    shuffle=False,
    num_workers=0,      # load in the main process
    pin_memory=False,   # if we have GPU, set pin_memory=True
    drop_last=False,    # keep every sample
)

aml_dataloader_id_downs2

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7fa90>

In [None]:
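# NOTE: disabled. If re-enabled, the scenario-f evaluation frame built in this notebook is named eval_dataset2.
# eval_dataset.to_parquet("/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/eval_dataset.parquet")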


In [None]:
# Scenario f evaluation dataset: the evaluation half of the 20% hold-out pool, eleven features.
aml_eval_dataset_id2 = AmlDataset(
    data=eval_dataset2,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour',
        'transaction_type_encoded', 'account_interaction_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Evaluation loader: fixed order (no shuffling), one dataset item per iteration.
aml_eval_dataloader_id2 = DataLoader(
    dataset=aml_eval_dataset_id2,
    batch_size=None,
    shuffle=False,
    num_workers=0,
    drop_last=False,
    #collate_fn=lambda batch: custom_collate_fn(batch, max_history_length=50)  # Use the custom collate function
)

aml_eval_dataloader_id2

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7d690>

In [None]:
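# NOTE: disabled. If re-enabled, save test_dataset2 (the scenario-f test split) here;
# the name test_dataset refers to the 10% hold-out frame of scenarios d/e.
# test_dataset.to_parquet("/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/test_dataset.parquet")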


In [None]:
# Scenario f test dataset: the test half of the 20% hold-out pool, eleven features.
aml_test_dataset_id2 = AmlDataset(
    data=test_dataset2,
    features=[
        'account_encoded', 'day_of_week_encoded', 'hour',
        'transaction_type_encoded', 'account_interaction_encoded',
        'payment_currency_encoded', 'received_currency_encoded',
        'sender_bank_location_encoded', 'receiver_bank_location_encoded',
        'time_interval_normalized', 'amount_normalized',
    ],
)

# Test loader: fixed order (no shuffling), one dataset item per iteration.
aml_test_dataloader_id2 = DataLoader(
    dataset=aml_test_dataset_id2,
    batch_size=None,
    shuffle=False,
    num_workers=0,
    drop_last=False,
    #collate_fn=lambda batch: custom_collate_fn(batch, max_history_length=50)  # Use the custom collate function
)

aml_test_dataloader_id2

<torch.utils.data.dataloader.DataLoader at 0x7c72c4e7e8c0>

# Saving the Dataset / DataLoader objects to Drive (one path per scenario)

## a) Sampled Dataset

In [None]:
# Scenario a, training part: persist aml_dataset_sampled and aml_dataloader_sampled to Google Drive.
# torch.save pickles the complete Python objects, so the AmlDataset class has to be defined
# (or importable) wherever these files are loaded again.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which refuses
# arbitrary pickled objects such as these - the loading side may need weights_only=False; confirm
# against the installed version.

torch.save(aml_dataset_sampled, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataset_sampled.pth")
torch.save(aml_dataloader_sampled, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataloader_sampled.pth")

print("saved!")

saved!


## b) Downsampled Negative Class

In [None]:
# Scenario b, training part: persist aml_dataset_sampled_downsampled and
# aml_dataloader_sampled_downsampled to Google Drive (whole objects are pickled by torch.save).



torch.save(aml_dataset_sampled_downsampled, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataset_sampled_downsampled.pth")
torch.save(aml_dataloader_sampled_downsampled, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataloader_sampled_downsampled.pth")


## a+b) both scenarios use the same path for eval_part

In [None]:
# Shared evaluation objects for scenarios a and b: persist aml_eval_dataset_sampled and
# aml_eval_dataloader_sampled to Google Drive.
# torch.save returns None, so the destination paths are bound to the path_* names first;
# assigning the result of torch.save would leave those names set to None.

path_eval_dataset_sampled = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataset_sampled.pth"
path_eval_dataloader_sampled = "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataloader_sampled.pth"

torch.save(aml_eval_dataset_sampled, path_eval_dataset_sampled)
torch.save(aml_eval_dataloader_sampled, path_eval_dataloader_sampled)


## c) Sampled Dataset (with a.c.interaction)

In [None]:
# Scenario c, training part: persist aml_dataset_sampled_id and aml_dataloader_sampled_id
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_dataset_sampled_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataset_sampled_id.pth")
torch.save(aml_dataloader_sampled_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataloader_sampled_id.pth")

In [None]:
# Scenario c, evaluation part: persist aml_eval_dataset_sampled_id and aml_eval_dataloader_sampled_id
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_eval_dataset_sampled_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataset_sampled_id.pth")
torch.save(aml_eval_dataloader_sampled_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataloader_sampled_id.pth")

## d) Full Dataset (w/o a.c.interaction)

In [None]:
# Scenario d, training part: persist aml_dataset_noid and aml_dataloader_noid
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_dataset_noid, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataset_noid.pth")
torch.save(aml_dataloader_noid, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataloader_noid.pth")

In [None]:
# Scenario d, evaluation part: persist aml_eval_dataset_noid and aml_eval_dataloader_noid
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_eval_dataset_noid, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataset_noid.pth")
torch.save(aml_eval_dataloader_noid, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataloader_noid.pth")

## e) Full Dataset (with a.c.interaction)

In [None]:
# Scenario e, training part: persist aml_dataset_id and aml_dataloader_id
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_dataset_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataset_id.pth")
torch.save(aml_dataloader_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataloader_id.pth")

In [None]:
# Scenario e, evaluation part: persist aml_eval_dataset_id and aml_eval_dataloader_id
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_eval_dataset_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataset_id.pth")
torch.save(aml_eval_dataloader_id, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataloader_id.pth")

## f) Down negative class_Full Dataset (with a.c. interaction)
deal with imbalanced dataset

1)

In [None]:
# Scenario f, training part: persist aml_dataset_id_downs2 and aml_dataloader_id_downs2
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_dataset_id_downs2, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataset_id_downs2.pth")
torch.save(aml_dataloader_id_downs2, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_dataloader_id_downs2.pth")

In [None]:
# Scenario f, evaluation part: persist aml_eval_dataset_id2 and aml_eval_dataloader_id2
# to Google Drive (whole objects are pickled by torch.save).


torch.save(aml_eval_dataset_id2, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataset_id2.pth")
torch.save(aml_eval_dataloader_id2, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_eval_dataloader_id2.pth")

In [None]:
# Scenario f, testing part: persist aml_test_dataset_id2 and aml_test_dataloader_id2
# to Google Drive (whole objects are pickled by torch.save).

torch.save(aml_test_dataset_id2, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_test_dataset_id2.pth")
torch.save(aml_test_dataloader_id2, "/content/drive/MyDrive/DE/Master_Degree/3rd_Semester/Colab_Notebook/AML_file/aml_test_dataloader_id2.pth")