In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

#### READING DATA

In [2]:
# Load the raw transaction export into the notebook-wide `data` frame.
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere until the file location is made configurable.
data = pd.read_csv(
    filepath_or_buffer="/Users/daniyahammarah/Downloads/fake_transactional_data_24.csv"
)

#### RENAMING COLUMNS


In [3]:
# Map the raw column headers of the export to the names used by the rest of
# the notebook. The rename is applied to `data` in place.
column_renames = {
    'from_totally_fake_account': 'Sender_Account_Num',
    'monopoly_money_amount': 'Amount',
    'to_randomly_generated_account': 'Receiver',
    'not_happened_yet_date': 'Transaction_date',
}
data.rename(columns=column_renames, inplace=True)


#### DATE COLUMN TO DATE TYPE


In [4]:
# Parse the day-first date strings (dd/mm/YYYY) into real timestamps.
parsed_dates = pd.to_datetime(data["Transaction_date"], format="%d/%m/%Y")
data["Transaction_date"] = parsed_dates

# Keep a month-first *string* copy as well. Note the two columns differ only
# by the capital "D": `Transaction_date` is datetime, `Transaction_Date` is text.
data["Transaction_Date"] = parsed_dates.dt.strftime('%m/%d/%Y')

print(data['Transaction_date'])


0          2025-01-01
1          2025-01-01
2          2025-01-01
3          2025-01-01
4          2025-01-01
              ...    
10148275   2025-12-31
10148276   2025-12-31
10148277   2025-12-31
10148278   2025-12-31
10148279   2025-12-31
Name: Transaction_date, Length: 10148280, dtype: datetime64[ns]


#### EXTRACTING CALENDAR DETAILS


In [5]:
# Calendar features derived from the parsed `Transaction_date` timestamps.

# Integer weekday as returned by pandas: Monday = 0 ... Sunday = 6.
data['Day_of_Week'] = data['Transaction_date'].dt.dayofweek

# Full English month name, e.g. "January".
data['Month'] = data['Transaction_date'].dt.strftime('%B')

# Week of month in 7-day buckets: days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3,
# 22-28 -> 4, 29-31 -> 5. The day is shifted to zero-based before the integer
# division; without the "- 1", days 7, 14, 21 and 28 spill into the next week.
data['Week_of_Month'] = (data['Transaction_date'].dt.day - 1) // 7 + 1

data

Unnamed: 0,Sender_Account_Num,Amount,Receiver,Transaction_date,Transaction_Date,Day_of_Week,Month,Week_of_Month
0,10371.0,4.00,CINEMA,2025-01-01,01/01/2025,2,January,1
1,88339.0,2.40,40544,2025-01-01,01/01/2025,2,January,1
2,18555.0,2.40,85149,2025-01-01,01/01/2025,2,January,1
3,18555.0,4.10,HIPSTER_COFFEE_SHOP,2025-01-01,01/01/2025,2,January,1
4,80792.0,1.95,18555,2025-01-01,01/01/2025,2,January,1
...,...,...,...,...,...,...,...,...
10148275,32185.0,2.65,COFFEE_SHOP,2025-12-31,12/31/2025,2,December,5
10148276,32185.0,2.45,COFFEE_SHOP,2025-12-31,12/31/2025,2,December,5
10148277,57569.0,14.00,WINE_BAR,2025-12-31,12/31/2025,2,December,5
10148278,1490.0,9.00,57569,2025-12-31,12/31/2025,2,December,5


#### SENDER IN INTEGER TYPE AND PULLING LENGTH OF ACCOUNT NUMBERS


In [6]:
# The sender ids are read as floats (e.g. 10371.0); store them as integers.
data['Sender_Account_Num'] = data['Sender_Account_Num'].astype(int)

# Number of characters in each sender id once rendered as text.
sender_id_text = data['Sender_Account_Num'].astype(str)
data['Sender_Account_Num_Length'] = sender_id_text.str.len()

# Distinct id lengths present in the data, in order of first appearance.
unique_lengths = data['Sender_Account_Num_Length'].unique()
print(unique_lengths)


[5 4 6]


#### RECEIVER IN INTEGER TYPE, RECEIVER NAME IN ANOTHER COLUMN AND PULLING LENGTH OF ACCOUNT NUMBERS


In [7]:
# `Receiver` mixes numeric account ids with text names (e.g. "CINEMA").
# Anything that does not parse as a number becomes NaN here.
receiver_numeric = pd.to_numeric(data['Receiver'], errors='coerce')

# True for rows whose receiver is not a number, i.e. carries a name.
mask = receiver_numeric.isna()

# Numeric receivers keep their id; non-numeric receivers get the sentinel 0.
data['Receiver_Account_Num'] = receiver_numeric.fillna(0).astype(int)

# Copy the text label for non-numeric receivers only; other rows stay missing
# at this point.
data.loc[mask, 'Receiver_Name'] = data.loc[mask, 'Receiver']

# NOTE(review): casting to str turns the missing entries into the literal text
# 'nan' (with the object dtype this notebook reports), so a later
# isna()/dropna() on `Receiver_Name` will not detect them.
data['Receiver_Name'] = data['Receiver_Name'].astype(str)

# Character count of the receiver id; the sentinel 0 yields a length of 1.
data['Receiver_Account_Num_Length'] = data['Receiver_Account_Num'].astype(str).str.len()


#### SEEING THE RECEIVER NAMES LIST IN A CSV FILE


In [8]:
# Dump the distinct receiver names (first occurrence kept) for manual review.
receiver_names = data.loc[:, 'Receiver_Name'].drop_duplicates(keep='first')
receiver_names.to_csv(path_or_buf='receiver_names.csv', index=False)

# Same for the distinct receiver account numbers (includes the 0 sentinel
# written for non-numeric receivers).
receiver_nums = data.loc[:, 'Receiver_Account_Num'].drop_duplicates(keep='first')
receiver_nums.to_csv(path_or_buf='receiver_nums.csv', index=False)

#### CHECK TYPE OF DATA AND NULLS


In [9]:
# Column dtypes plus non-null counts. On a frame this large pandas omits the
# non-null counts unless they are requested explicitly, so ask for them --
# otherwise this cell cannot answer the "nulls" half of the check.
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10148280 entries, 0 to 10148279
Data columns (total 12 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   Sender_Account_Num           int64         
 1   Amount                       float64       
 2   Receiver                     object        
 3   Transaction_date             datetime64[ns]
 4   Transaction_Date             object        
 5   Day_of_Week                  int32         
 6   Month                        object        
 7   Week_of_Month                int64         
 8   Sender_Account_Num_Length    int64         
 9   Receiver_Account_Num         int64         
 10  Receiver_Name                object        
 11  Receiver_Account_Num_Length  int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(5), object(4)
memory usage: 890.4+ MB


In [10]:
# Number of distinct values in each column of `data`.
data.nunique()

Sender_Account_Num              8142
Amount                          5236
Receiver                       14300
Transaction_date                 364
Transaction_Date                 364
Day_of_Week                        7
Month                             12
Week_of_Month                      5
Sender_Account_Num_Length          3
Receiver_Account_Num           14222
Receiver_Name                     80
Receiver_Account_Num_Length        4
dtype: int64

#### CREATING A TABLE WITH DUPLICATE ROWS SUMMED (reduces the number of transactions; TODO: verify this aggregation is correct)


In [11]:
# Columns that must all match for two rows to count as the same transaction.
duplicate_keys = [
    'Sender_Account_Num',
    'Receiver_Account_Num',
    'Receiver_Name',
    'Transaction_Date',
    'Amount',
]
# keep=False flags every member of a duplicate group, not only the repeats.
data['is_duplicate'] = data.duplicated(subset=duplicate_keys, keep=False)

# `data_check` is another name for the same DataFrame object (not a copy);
# it is only used to write the flagged rows out for inspection.
data_check = data
data_check.to_csv('data_check.csv', index=False)

# Total amount per sender / receiver / day. `is_duplicate` is part of the key,
# so flagged and unflagged rows of the same pair and day are summed separately.
grouping_keys = [
    'Sender_Account_Num',
    'Receiver_Account_Num',
    'Receiver_Name',
    'Transaction_Date',
    'is_duplicate',
]
amount_by_group = data.groupby(grouping_keys, as_index=False)['Amount']
summed_data = amount_by_group.sum()

summed_data.to_csv('summed_data.csv', index=False)


#### CHECKING HOW MANY ACCOUNTS ARE SENDERS AND RECEIVERS IN THE DATA


In [12]:
# Distinct account ids seen on each side of a transaction.
# NOTE(review): `unique_receivers` also contains the 0 sentinel used for
# non-numeric receivers; it only matters if some sender id is 0 -- confirm.
unique_senders = set(data['Sender_Account_Num'])
unique_receivers = set(data['Receiver_Account_Num'])

# Accounts that appear both as a sender and as a receiver.
common_account_numbers = unique_senders & unique_receivers
num_common_account_numbers = len(common_account_numbers)

print("Number of sender account numbers also present as receiver account numbers:", num_common_account_numbers)
# Recorded run: of the 8142 distinct sender accounts, 8127 also appear as receivers.


Number of sender account numbers also present as receiver account numbers: 8127


#### SEPARATE INFLOW, OUTFLOW TABLES FOR ACCOUNTS AND BUSINESSES


In [13]:
# Money received by numbered accounts. Receiver_Account_Num == 0 is the
# sentinel written for non-numeric receivers, so those rows are excluded.
is_account_receiver = data['Receiver_Account_Num'] != 0
inflow_account = data.loc[
    is_account_receiver, ['Transaction_date', 'Receiver_Account_Num', 'Amount']
]

# Money received by named businesses. `Receiver_Name` was cast to str earlier,
# which turns missing names into the text 'nan', so dropna() on that column
# removes nothing and every account-to-account row would remain. Select the
# business rows from the raw `Receiver` column instead: present, but not
# parseable as a number.
is_business_receiver = (
    data['Receiver'].notna()
    & pd.to_numeric(data['Receiver'], errors='coerce').isna()
)
inflow_business = data.loc[
    is_business_receiver, ['Transaction_date', 'Receiver_Name', 'Amount']
]

# Money sent, one row per transaction, keyed by the sending account.
outflow = data[['Transaction_date', 'Sender_Account_Num', 'Amount']]
