In [1]:
import os

import pandas as pd

# Absolute location of the raw SAML-D transaction export on the author's machine.
raw_csv_path = r'C:\Users\diki.rustian\Documents\GitHub\ppatk_money_laundering\data\raw\SAML-D.csv'

# Load the full transaction table into memory.
df = pd.read_csv(raw_csv_path)

# Print a quick overview: dimensions first, then a labelled preview of the
# leading rows, the column names, and the inferred dtypes.
print("Shape of the DataFrame:", df.shape)
overview_sections = (
    ("\nFirst 5 rows:", df.head()),
    ("\nColumn names:", df.columns.tolist()),
    ("\nData types:", df.dtypes),
)
for heading, content in overview_sections:
    print(heading)
    print(content)

Shape of the DataFrame: (9504852, 12)

First 5 rows:
       Time        Date  Sender_account  Receiver_account    Amount  \
0  10:35:19  2022-10-07      8724731955        2769355426   1459.15   
1  10:35:20  2022-10-07      1491989064        8401255335   6019.64   
2  10:35:20  2022-10-07       287305149        4404767002  14328.44   
3  10:35:21  2022-10-07      5376652437        9600420220  11895.00   
4  10:35:21  2022-10-07      9614186178        3803336972    115.25   

  Payment_currency Received_currency Sender_bank_location  \
0        UK pounds         UK pounds                   UK   
1        UK pounds            Dirham                   UK   
2        UK pounds         UK pounds                   UK   
3        UK pounds         UK pounds                   UK   
4        UK pounds         UK pounds                   UK   

  Receiver_bank_location  Payment_type  Is_laundering       Laundering_type  
0                     UK  Cash Deposit              0  Normal_Cash_Deposits

In [2]:
# Keep only the rows flagged as money laundering (Is_laundering == 1).
laundering_mask = df['Is_laundering'].eq(1)
df_laundering = df.loc[laundering_mask]

# Share of flagged rows in the full dataset, expressed as a percentage.
laundering_share = len(df_laundering) / len(df) * 100

# Summarise the filtered frame: size, share, a preview, and per-type counts.
print("Shape of laundering DataFrame:", df_laundering.shape)
print(f"Percentage of laundering transactions: {laundering_share:.2f}%")
print("\nFirst 5 laundering transactions:")
print(df_laundering.head())
print("\nLaundering types distribution:")
print(df_laundering['Laundering_type'].value_counts())

Shape of laundering DataFrame: (9873, 12)
Percentage of laundering transactions: 0.10%

First 5 laundering transactions:
          Time        Date  Sender_account  Receiver_account   Amount  \
317   10:46:37  2022-10-07      7401327478        4336451277  2603.30   
1206  11:19:32  2022-10-07      6340007440        4316483340   106.04   
1649  11:35:42  2022-10-07      3758118046        6081504025  5903.33   
1954  11:45:53  2022-10-07       445154846        4497771501  3957.87   
3304  12:39:03  2022-10-07      2758469152        2987279234  7832.41   

     Payment_currency Received_currency Sender_bank_location  \
317         UK pounds         UK pounds                   UK   
1206        UK pounds      Indian rupee                   UK   
1649        UK pounds      Mexican Peso                   UK   
1954        UK pounds             Naira                   UK   
3304        UK pounds         UK pounds                   UK   

     Receiver_bank_location     Payment_type  Is_launde

In [3]:
# Draw as many normal transactions (Is_laundering == 0) as there are laundering
# rows, so both classes end up the same size. The count is derived from
# df_laundering instead of being hard-coded, so it cannot go stale if the input
# data changes; the fixed seed keeps the draw reproducible.
df_normal = df[df['Is_laundering'] == 0].sample(n=len(df_laundering), random_state=42)

# Stack the laundering and normal rows into one frame with a fresh 0..n-1 index.
df_balanced = pd.concat([df_laundering, df_normal], ignore_index=True)

# Shuffle the rows (fixed seed) so the two classes are interleaved rather than
# sitting in two contiguous halves.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the size and class balance of the balanced dataset.
print("Shape of balanced DataFrame:", df_balanced.shape)
print("\nDistribution of Is_laundering:")
print(df_balanced['Is_laundering'].value_counts())
print("\nPercentage distribution:")
print(df_balanced['Is_laundering'].value_counts(normalize=True) * 100)
print("\nFirst 5 rows of balanced dataset:")
print(df_balanced.head())

Shape of balanced DataFrame: (19746, 12)

Distribution of Is_laundering:
Is_laundering
1    9873
0    9873
Name: count, dtype: int64

Percentage distribution:
Is_laundering
1    50.0
0    50.0
Name: proportion, dtype: float64

First 5 rows of balanced dataset:
       Time        Date  Sender_account  Receiver_account   Amount  \
0  07:40:08  2023-07-14      3187084517        8846937355  3788.78   
1  22:04:14  2022-10-29      4851027632        9832274459  7377.62   
2  16:56:30  2023-02-06      5811105590        6929936460  7435.30   
3  22:40:35  2023-01-09      4349340373         386440627  7057.37   
4  19:53:44  2023-01-13      5386067899        8286125964  7443.71   

  Payment_currency Received_currency Sender_bank_location  \
0        UK pounds         UK pounds                   UK   
1        UK pounds              Euro                   UK   
2        UK pounds   Moroccan dirham                   UK   
3        UK pounds              Euro                   UK   
4        UK p

In [4]:
# Destination for the balanced dataset inside the project's results folder.
output_path = r'C:\Users\diki.rustian\Documents\GitHub\ppatk_money_laundering\data\results\SAML-D Balanced.csv'

# Make sure the destination folder exists before writing: to_csv does not
# create missing directories. dirname is empty when the path contains no
# separator recognised by the current OS, in which case there is nothing to create.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the frame to CSV without the index column.
df_balanced.to_csv(output_path, index=False)

print(f"Dataset balanced berhasil disimpan ke: {output_path}")
print(f"Jumlah baris yang disimpan: {len(df_balanced)}")
print(f"Jumlah kolom yang disimpan: {len(df_balanced.columns)}")

# Confirm the file is on disk and report its size in megabytes.
# `os` comes from the import block at the top of the notebook.
if os.path.exists(output_path):
    file_size = os.path.getsize(output_path)
    print(f"Ukuran file: {file_size / (1024*1024):.2f} MB")
else:
    print("Error: File tidak berhasil disimpan!")

Dataset balanced berhasil disimpan ke: C:\Users\diki.rustian\Documents\GitHub\ppatk_money_laundering\data\results\SAML-D Balanced.csv
Jumlah baris yang disimpan: 19746
Jumlah kolom yang disimpan: 12
Ukuran file: 1.99 MB


In [5]:
# Build a small balanced dataset of 1000 rows in total (500 laundering + 500 normal).
# The per-class size is defined once so the two draws cannot drift apart.
rows_per_class = 500

# Randomly pick the laundering rows (fixed seed for reproducibility).
df_laundering_reduced = df_laundering.sample(n=rows_per_class, random_state=42)

# Randomly pick the same number of normal rows (Is_laundering == 0) from the full frame.
df_normal_reduced = df[df['Is_laundering'] == 0].sample(n=rows_per_class, random_state=42)

# Stack both classes into one frame with a fresh 0..n-1 index.
df_reduced_balanced = pd.concat([df_laundering_reduced, df_normal_reduced], ignore_index=True)

# Shuffle the rows (fixed seed) so the classes are interleaved.
df_reduced_balanced = df_reduced_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the size and class balance of the reduced dataset.
print("Shape of reduced balanced DataFrame:", df_reduced_balanced.shape)
print("\nDistribution of Is_laundering:")
print(df_reduced_balanced['Is_laundering'].value_counts())
print("\nPercentage distribution:")
print(df_reduced_balanced['Is_laundering'].value_counts(normalize=True) * 100)

# Destination for the reduced dataset inside the project's results folder.
output_path_reduced = r'C:\Users\diki.rustian\Documents\GitHub\ppatk_money_laundering\data\results\SAML-D reduced balanced.csv'

# Make sure the destination folder exists before writing: to_csv does not
# create missing directories. dirname is empty when the path contains no
# separator recognised by the current OS, in which case there is nothing to create.
output_dir_reduced = os.path.dirname(output_path_reduced)
if output_dir_reduced:
    os.makedirs(output_dir_reduced, exist_ok=True)

# Write the frame to CSV without the index column.
df_reduced_balanced.to_csv(output_path_reduced, index=False)

print(f"\nDataset reduced balanced berhasil disimpan ke: {output_path_reduced}")
print(f"Jumlah baris yang disimpan: {len(df_reduced_balanced)}")
print(f"Jumlah kolom yang disimpan: {len(df_reduced_balanced.columns)}")

# Confirm the file is on disk and report its size in kilobytes.
# `os` comes from the import block at the top of the notebook.
if os.path.exists(output_path_reduced):
    file_size_reduced = os.path.getsize(output_path_reduced)
    print(f"Ukuran file: {file_size_reduced / 1024:.2f} KB")
else:
    print("Error: File tidak berhasil disimpan!")

Shape of reduced balanced DataFrame: (1000, 12)

Distribution of Is_laundering:
Is_laundering
0    500
1    500
Name: count, dtype: int64

Percentage distribution:
Is_laundering
0    50.0
1    50.0
Name: proportion, dtype: float64

Dataset reduced balanced berhasil disimpan ke: C:\Users\diki.rustian\Documents\GitHub\ppatk_money_laundering\data\results\SAML-D reduced balanced.csv
Jumlah baris yang disimpan: 1000
Jumlah kolom yang disimpan: 12
Ukuran file: 103.56 KB
