In [1]:
# Importing libraries

import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings

In [2]:
# Reading the file with the dataset using pd.read_csv

data = pd.read_csv(
    Path(r"Resources/Sample5.csv").absolute(),
    index_col="Timestamp", infer_datetime_format=True, parse_dates=True)

# Dropping the Unnamed, Account, Account.1 columns as they are not required for the analysis

df = data.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Account", "Account.1"])

# Reviewing the dataframe

df.head()


Unnamed: 0_level_0,From Bank,To Bank,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-09-04 18:45:00,220,45701,2316.73,Shekel,2316.73,Shekel,Cheque,0
2022-09-01 00:26:00,6129,6129,4.65,US Dollar,4.65,US Dollar,Reinvestment,0
2022-09-05 00:11:00,217959,221279,26.15,Euro,26.15,Euro,Credit Card,0
2022-09-02 07:41:00,24963,29435,1366.01,Euro,1366.01,Euro,Credit Card,0
2022-09-09 02:02:00,14011,13516,7442.38,Euro,7442.38,Euro,Cheque,0


In [3]:
# Checking the DataFrame data types

df.dtypes

From Bank               int64
To Bank                 int64
Amount Received       float64
Receiving Currency     object
Amount Paid           float64
Payment Currency       object
Payment Format         object
Is Laundering           int64
dtype: object

In [4]:
# Checking the null values

df.isnull().sum()


From Bank             0
To Bank               0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0
dtype: int64

In [5]:
# Scaling the numerical data

data_scaled = StandardScaler().fit_transform(df[["Amount Received", "Amount Paid"]])

# Creating a DataFrame with the scaled data

df_scaled = pd.DataFrame(data_scaled, columns=["Amount Received", "Amount Paid"])

# Checking the DataFrame

df_scaled.head()

Unnamed: 0,Amount Received,Amount Paid
0,-0.010645,-0.010576
1,-0.010646,-0.010577
2,-0.010646,-0.010577
3,-0.010645,-0.010577
4,-0.010644,-0.010576


In [6]:
# Transforming rest of the columns using get_dummies()

df_encoded = pd.get_dummies(df, columns=["From Bank", "To Bank", "Receiving Currency", "Payment Currency", "Payment Format"])

# Adjusting the dataframe

df_encoded.drop(columns=["Amount Received", "Amount Paid"], inplace=True)
df_encoded.reset_index(inplace=True)

# Checking the DataFrame

df_encoded.head()


Unnamed: 0,Timestamp,Is Laundering,From Bank_1,From Bank_3,From Bank_4,From Bank_5,From Bank_6,From Bank_7,From Bank_8,From Bank_9,...,Payment Currency_US Dollar,Payment Currency_Yen,Payment Currency_Yuan,Payment Format_ACH,Payment Format_Bitcoin,Payment Format_Cash,Payment Format_Cheque,Payment Format_Credit Card,Payment Format_Reinvestment,Payment Format_Wire
0,2022-09-04 18:45:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2022-09-01 00:26:00,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,2022-09-05 00:11:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2022-09-02 07:41:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2022-09-09 02:02:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# Concatenating the two DataFrames

df_transformed = pd.concat([df_scaled, df_encoded], axis=1)

# Setting index

df_transformed.set_index("Timestamp", inplace=True)

# Checking the DataFrame

df_transformed.head()


Unnamed: 0_level_0,Amount Received,Amount Paid,Is Laundering,From Bank_1,From Bank_3,From Bank_4,From Bank_5,From Bank_6,From Bank_7,From Bank_8,...,Payment Currency_US Dollar,Payment Currency_Yen,Payment Currency_Yuan,Payment Format_ACH,Payment Format_Bitcoin,Payment Format_Cash,Payment Format_Cheque,Payment Format_Credit Card,Payment Format_Reinvestment,Payment Format_Wire
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-09-04 18:45:00,-0.010645,-0.010576,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2022-09-01 00:26:00,-0.010646,-0.010577,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2022-09-05 00:11:00,-0.010646,-0.010577,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2022-09-02 07:41:00,-0.010645,-0.010577,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2022-09-09 02:02:00,-0.010644,-0.010576,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
# Using the elbow method to find the best value for K

inertia = []
k = list(range(1, 11))

# Ignoring warnings

warnings.simplefilter("ignore")

# Calculating the inertia for the range of K values

for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(df_transformed)
    inertia.append(k_model.inertia_)


In [9]:
# Creating the elbow curve

elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia",
    xlabel = "K", ylabel = "Inertia", title="Elbow Curve", xticks=k)


In [10]:
# Running K-Means with k=3

model = KMeans(n_clusters=3, random_state= 1)
model.fit(df_transformed)

k_3_clusters = model.predict(df_transformed)
predictions_df = df_transformed.copy()

predictions_df["K-Means Cluster"] = k_3_clusters

In [11]:
# Plotting the clusters

predictions_df.hvplot.scatter(
    x="Amount Received",
    y="Amount Paid",
    by="K-Means Cluster",
    hover_cols=["Receiving Currency", "Payment Currency", "Payment Format", "Is Laundering"],
    title="K-Means Clustering with K=3",
    width=800,
)


In [12]:
# Grouping by cluster and plotting the sum 

predictions_groupby = predictions_df.groupby("K-Means Cluster").sum()

predictions_groupby.hvplot.bar(
    y=["Amount Received", "Amount Paid"],
    title="Total Amount Received and Paid by Cluster",
    width=800,
)


In [13]:
# Checking for which cluster has most laundering transactions

predictions_df.groupby(by=["K-Means Cluster"])["Is Laundering"].value_counts()


K-Means Cluster  Is Laundering
0                0                6404
                 1                  11
1                0                3735
                 1                   6
2                0                   1
Name: Is Laundering, dtype: int64

In [14]:
# Checking which payment currency was used for laundering money

df.groupby(by=["Is Laundering"])["Payment Currency"].value_counts()


Is Laundering  Payment Currency 
0              US Dollar            3732
               Euro                 2375
               Swiss Franc           493
               Yuan                  454
               Shekel                402
               UK Pound              359
               Rupee                 358
               Ruble                 315
               Yen                   306
               Bitcoin               297
               Canadian Dollar       279
               Australian Dollar     264
               Mexican Peso          210
               Saudi Riyal           163
               Brazil Real           133
1              US Dollar               6
               Euro                    5
               Australian Dollar       1
               Canadian Dollar         1
               Mexican Peso            1
               Rupee                   1
               Saudi Riyal             1
               Swiss Franc             1
Name: Payment Currency, 

In [15]:
# Clustering using Agglomerative Clustering

aggo_model = AgglomerativeClustering(n_clusters=3)

aggo_predictions = aggo_model.fit_predict(df_transformed)


In [16]:
# Adding the predictions to the DataFrame

predictions_df["Agglomerative Cluster"] = aggo_predictions


In [17]:
# Clustering using Birch

birch_model = Birch(n_clusters=3)

birch_predictions = birch_model.fit_predict(df_transformed)


In [18]:
# Adding the Birch predictions to the DataFrame

predictions_df["Birch Cluster"] = birch_predictions


In [19]:
# Plotting the Agglomerative cluster

predictions_df.hvplot.scatter(
    x="Amount Received",
    y="Amount Paid",
    by="Agglomerative Cluster",
    hover_cols=["Is Laundering"],
    title="Agglomerative Clustering with K=3",
    width=800
)

In [20]:
# Plotting the Birch cluster

predictions_df.hvplot.scatter(
    x="Amount Received",
    y="Amount Paid",
    by="Birch Cluster",
    hover_cols=["Is Laundering"],
    title="Birch Clustering with K=3",
    width=800
)

In [21]:
# Correlation between the clusters

predictions_df[["K-Means Cluster", "Agglomerative Cluster", "Birch Cluster"]].corr()


Unnamed: 0,K-Means Cluster,Agglomerative Cluster,Birch Cluster
K-Means Cluster,1.0,0.985909,0.985896
Agglomerative Cluster,0.985909,1.0,0.997898
Birch Cluster,0.985896,0.997898,1.0


In [26]:
# Running PCA model

pca = PCA(n_components=3)

pca_df = pd.DataFrame(pca.fit_transform(df_transformed), columns=["PCA1", "PCA2", "PCA3"])

pca_df.head()


Unnamed: 0,PCA1,PCA2,PCA3
0,-0.008244,0.246568,-0.632452
1,-0.020564,-0.941052,0.159682
2,-0.019045,1.047067,0.66953
3,-0.019047,1.047422,0.670578
4,-0.011304,1.039289,-0.716875


In [29]:
# Running K-Means with k=3 on the PCA DataFrame

pca_model_k3 = KMeans(n_clusters=3, random_state= 1)

pca_model_k3.fit(pca_df)

pca_k_3_clusters = pca_model_k3.predict(pca_df)

pca_predictions_df = pca_df.copy()

# Adding the cluster results to the DataFrame

pca_predictions_df["K-Means Cluster"] = pca_k_3_clusters


In [30]:
# Plotting the PCA model with K-Means clusters

pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="K-Means Cluster",
    hover_cols=["Receiving Currency", "Payment Currency", "Payment Format", "Is Laundering"],
    title="K-Means Clustering with K=3 on PCA DataFrame",
    width=800,
)


In [27]:
# Running ICA model

ica = FastICA(n_components=3)

ica_df = pd.DataFrame(ica.fit_transform(df_transformed), columns=["ICA1", "ICA2", "ICA3"])

ica_df.head()


Unnamed: 0,ICA1,ICA2,ICA3
0,0.312329,-0.009139,1.129878
1,-1.223646,-0.011147,-0.294287
2,1.373281,-0.013766,-1.182804
3,1.373758,-0.013765,-1.184669
4,1.34377,-0.013954,1.28846


In [31]:
# Running K-Means with k=3 on the ICA DataFrame

ica_model = KMeans(n_clusters=3, random_state= 1)

ica_model.fit(ica_df)

ica_k_3_clusters = ica_model.predict(ica_df)

# Adding the cluster results to the DataFrame

ica_predictions_df = ica_df.copy()

ica_predictions_df["K-Means Cluster"] = ica_k_3_clusters


In [32]:
# Plotting the ICA model with K-Means clusters

ica_predictions_df.hvplot.scatter(
    x="ICA1",
    y="ICA2",
    by="K-Means Cluster",
    hover_cols=["Receiving Currency", "Payment Currency", "Payment Format", "Is Laundering"],
    title="K-Means Clustering with K=3 on ICA DataFrame",
    width=800,
)


In [33]:
# Running calinski_harabasz_score to evaluate the K-Means model

metrics.calinski_harabasz_score(df_transformed, k_3_clusters)

3372.819503394582

In [34]:
# Running calinski_harabasz_score to evaluate the Agglomerative model

metrics.calinski_harabasz_score(df_transformed, aggo_predictions)

3365.362937367505

In [35]:
# Running calinski_harabasz_score to evaluate the Birch model

metrics.calinski_harabasz_score(df_transformed, birch_predictions)

3366.2524242109084