# DBSCAN

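Prepare data for DBSCAN: aggregate the transactions per sending account and extract each account's three largest recipient totals.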

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the raw transaction log from the CSV file into a DataFrame
fake_transaction = pd.read_csv(
    filepath_or_buffer='fake_transactional_data_24.csv',
)

In [8]:
# Total amount sent by each source account, returned directly as a flat
# two-column DataFrame (as_index=False replaces the trailing reset_index()).
aggregated_data = fake_transaction.groupby(
    'from_totally_fake_account',
    as_index=False,
)['monopoly_money_amount'].sum()
aggregated_data

Unnamed: 0,from_totally_fake_account,monopoly_money_amount
0,1000.0,28698.51
1,1002.0,15622.97
2,1006.0,18778.19
3,1018.0,15612.21
4,1053.0,19444.57
...,...,...
8137,100931.0,22332.81
8138,100934.0,18579.25
8139,100947.0,20783.20
8140,100986.0,15314.23


In [9]:
# Coerce the amount column to numbers; anything unparseable becomes NaN ...
fake_transaction['monopoly_money_amount'] = pd.to_numeric(
    fake_transaction['monopoly_money_amount'],
    errors='coerce',
)

# ... and discard the rows whose amount could not be parsed.
# NOTE: this mutates `fake_transaction` in place, so every later cell sees the cleaned frame.
fake_transaction.dropna(subset=['monopoly_money_amount'], inplace=True)

# Total spend for every (sender, recipient) pair.
# as_index=False keeps both keys as ordinary columns, so the result is
# already a flat DataFrame and needs no reset_index().
agg_result = fake_transaction.groupby(
    ['from_totally_fake_account', 'to_randomly_generated_account'],
    as_index=False,
)['monopoly_money_amount'].sum()

# helper that keeps only the three highest-amount rows of one group
def get_top_3(group):
    """Return the (up to) three rows of `group` with the largest 'monopoly_money_amount'.

    Rows come back ordered from the largest amount to the smallest and keep
    their original index labels.
    """
    top_rows = group.nlargest(n=3, columns='monopoly_money_amount')
    return top_rows

# Keep each sender's three largest recipient totals.
# Sorting once and taking groupby().head(3) selects the same rows as applying
# nlargest(3) to every group, but it is vectorised (no Python call per sender)
# and it does not rely on groupby.apply handing the grouping column to the
# callback -- behaviour that pandas deprecated in 2.2 -- so
# 'from_totally_fake_account' is always preserved as a regular column.
top_3_values = (
    agg_result
    .sort_values(
        ['from_totally_fake_account', 'monopoly_money_amount'],
        ascending=[True, False],
    )
    .groupby('from_totally_fake_account')
    .head(3)
    .reset_index(drop=True)  # plain 0..n-1 index; the sender stays a column
)

# rank of every kept row within its sender (1 = largest total)
top_3_rank = top_3_values.groupby('from_totally_fake_account').cumcount() + 1

# pivot: one row per sender, one column per rank.
# Each (sender, rank) pair is unique, so a plain pivot is enough -- no
# aggregation (pivot_table's default mean) is needed. reindex guarantees that
# all three rank columns exist even if no sender has that many recipients.
top_3_pivot = (
    top_3_values
    .assign(rank=top_3_rank)
    .pivot(
        index='from_totally_fake_account',
        columns='rank',
        values='monopoly_money_amount',
    )
    .reindex(columns=[1, 2, 3])
)

# rename the columns to top_1, top_2, top_3
top_3_pivot.columns = [f'top_{col}' for col in top_3_pivot.columns]

print(top_3_pivot)


                              top_1    top_2    top_3
from_totally_fake_account                            
1000.0                     16130.00  1316.00   850.50
1002.0                      4126.75  1489.42  1284.00
1006.0                      2603.15  1610.28  1429.71
1018.0                      3564.00  1148.00  1038.00
1053.0                      4507.00  1723.00  1295.85
...                             ...      ...      ...
100931.0                    4103.65  2016.59  1751.00
100934.0                    3414.50  1906.45  1658.14
100947.0                    2396.50  1901.31  1460.90
100986.0                    1841.74  1520.87  1236.50
100988.0                    3565.50  1262.41  1167.00

[8142 rows x 3 columns]


In [10]:
# function to get the corresponding 'to_randomly_generated_account' for a top spending amount
def get_top_account(top_value, agg_result, from_account=None):
    """Look up the recipient account whose aggregated total equals `top_value`.

    Parameters
    ----------
    top_value : float
        Aggregated amount to search for in 'monopoly_money_amount'.
    agg_result : pandas.DataFrame
        Frame with the columns 'from_totally_fake_account',
        'to_randomly_generated_account' and 'monopoly_money_amount'.
    from_account : optional
        When given, only rows of this sender are considered. Without it the
        first row of *any* sender with a matching total is returned, which can
        belong to a different sender if two pairs share the same total.

    Returns
    -------
    The 'to_randomly_generated_account' of the first matching row, or NaN when
    `top_value` is missing or no row matches.
    """
    # A sender with fewer than three recipients has NaN in some top_N column;
    # NaN never compares equal to anything, so there is nothing to look up.
    if pd.isna(top_value):
        return np.nan

    matches = agg_result['monopoly_money_amount'] == top_value
    if from_account is not None:
        matches = matches & (agg_result['from_totally_fake_account'] == from_account)

    candidates = agg_result.loc[matches, 'to_randomly_generated_account']
    if candidates.empty:
        return np.nan
    return candidates.iloc[0]

# mapping the top spending amounts to the corresponding 'to_randomly_generated_account' for each top
#
# The recipients are taken from `top_3_values`, ranked per sender exactly like
# the amounts were, instead of searching `agg_result` for an equal amount:
# a search by amount alone returns the first row of *any* sender with that
# total (so it can attach another sender's recipient), and it rescans the
# whole frame once per row.
account_rank = top_3_values.groupby('from_totally_fake_account').cumcount() + 1
top_accounts = (
    top_3_values
    .assign(rank=account_rank)
    .pivot(
        index='from_totally_fake_account',
        columns='rank',
        values='to_randomly_generated_account',
    )
    .reindex(columns=[1, 2, 3])  # senders with fewer than 3 recipients get NaN
)

# Column assignment aligns on the sender index shared with top_3_pivot.
for rank in (1, 2, 3):
    top_3_pivot[f'top_{rank}_account'] = top_accounts[rank]

print(top_3_pivot)


                              top_1    top_2    top_3 top_1_account  \
from_totally_fake_account                                             
1000.0                     16130.00  1316.00   850.50         31489   
1002.0                      4126.75  1489.42  1284.00         23010   
1006.0                      2603.15  1610.28  1429.71         80751   
1018.0                      3564.00  1148.00  1038.00         11822   
1053.0                      4507.00  1723.00  1295.85         57069   
...                             ...      ...      ...           ...   
100931.0                    4103.65  2016.59  1751.00         88568   
100934.0                    3414.50  1906.45  1658.14         82585   
100947.0                    2396.50  1901.31  1460.90         48039   
100986.0                    1841.74  1520.87  1236.50           PUB   
100988.0                    3565.50  1262.41  1167.00         51576   

                                 top_2_account    top_3_account  
from_total