In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# cap on the number of rows pandas renders before truncating a displayed frame
pd.options.display.max_rows = 500

In [2]:
# Load the simulated transactions and parse the 'Date' column as day-first datetimes.
df = pd.read_csv('simulated_transaction_2024.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True)
)

In [3]:
# preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
0,2023-01-01,00:00,678330503.0,2971.0,1584.0,,Westport Care Home
1,2023-01-01,00:00,472213568.0,3792.0,1950.0,,Barbiee Boutique
2,2023-01-01,00:00,472213568.0,3012.0,-780.0,283027736.0,
3,2023-01-01,00:00,283027736.0,1787.0,780.0,472213568.0,
4,2023-01-01,00:00,624500124.0,3226.0,1825.0,,Fat Face


## Missing balance EDA

In [4]:
# keep only the rows whose 'Balance' value is missing
bank_balance_missing = df.loc[df['Balance'].isna()]
bank_balance_missing.head(n=60)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
1392,2023-01-01,00:00,246753533.0,,-100.0,,Grand Union BJJ
4132,2023-01-01,00:00,726913482.0,,-5.99,,Mojang Studios
4593,2023-01-01,00:00,335698481.0,,-18.99,,PureGym
4771,2023-01-01,09:49,706250160.0,,-60.92,,The Works
4799,2023-01-01,10:01,647472692.0,,-27.96,,The Works
5628,2023-01-01,16:51,903468956.0,,-9.5,,The Crown
8364,2023-01-06,16:50,412823730.0,,-66.99,,Sports Direct
8860,2023-01-07,12:44,395284483.0,,-65.42,,Tesco
9290,2023-01-08,11:23,639549973.0,,-122.5,,Topshop
11956,2023-01-13,09:55,552980092.0,,-375.9,,Fat Face


- Balances are missing for some rows of both personal (account-to-account) and merchant transactions.

In [5]:
# all transactions for account no = 246753533.0
bank_user1 = df[df['Account No'] == 246753533.0]

# Only the last expression of a cell is displayed; a bare `.shape` placed
# mid-cell is evaluated and thrown away, so print it explicitly.
print(bank_user1.shape)

bank_user1.head(20)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
553,2023-01-01,00:00,246753533.0,1246.0,-676.0,,Halifax
1387,2023-01-01,00:00,246753533.0,1230.01,-15.99,,Netflix
1388,2023-01-01,00:00,246753533.0,1222.02,-7.99,,Disney
1389,2023-01-01,00:00,246753533.0,1207.03,-14.99,,Blizzard
1390,2023-01-01,00:00,246753533.0,1192.04,-14.99,,Blizzard
1391,2023-01-01,00:00,246753533.0,1177.05,-14.99,,Blizzard
1392,2023-01-01,00:00,246753533.0,,-100.0,,Grand Union BJJ
5110,2023-01-01,11:49,246753533.0,1073.55,-3.5,,Starbucks
5805,2023-01-02,08:31,246753533.0,1036.2435,-37.31,,Tesco
5998,2023-01-02,10:15,246753533.0,1031.0435,-5.2,,Starbucks


 - Within a single account's transactions (taken in row order): Balance[i] = Balance[i-1] + Amount[i]

In [6]:
# example of a missing bank balance on a personal (account-to-account) transaction:
# every transfer between account 240643705.0 and third-party account 895750759.0
bank_user2_personal = df[(df['Account No'] == 240643705.0) & (df['Third Party Account No'] == 895750759.0)]

# Only the last expression of a cell is displayed; a bare `.shape` placed
# mid-cell is evaluated and thrown away, so print it explicitly.
print(bank_user2_personal.shape)

bank_user2_personal.head(20)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
10820,2023-01-11,00:45,240643705.0,164.56915,310.0,895750759.0,
20696,2023-01-30,23:59,240643705.0,1853.00645,-960.0,895750759.0,
38378,2023-02-27,23:59,240643705.0,,-960.0,895750759.0,
56606,2023-03-30,23:59,240643705.0,1716.997731,-960.0,895750759.0,
74458,2023-04-29,23:59,240643705.0,1320.477481,-960.0,895750759.0,
82848,2023-05-06,05:04,240643705.0,25.06566,70.0,895750759.0,
83645,2023-05-07,12:06,240643705.0,-67.04324,40.0,895750759.0,
93500,2023-05-30,23:59,240643705.0,1770.57846,-960.0,895750759.0,
102581,2023-06-07,09:22,240643705.0,1.52021,-40.0,895750759.0,
113000,2023-06-29,23:59,240643705.0,1811.43106,-960.0,895750759.0,


In [7]:
# February transactions for account no = 240643705.0
# (the filter checks the month only, so February of any year would match)
bank_user2_feb = df.loc[
    df['Account No'].eq(240643705.0) & df['Date'].dt.month.eq(2)
]

print(bank_user2_feb.shape)

bank_user2_feb.head(n=20)

(20, 7)


Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
22279,2023-02-01,00:00,240643705.0,735.04645,-998.0,,LBG
26545,2023-02-01,09:30,240643705.0,527.12645,-36.97,,Brilliant Brushes
26575,2023-02-01,09:46,240643705.0,415.15645,-111.97,,Sports Direct
27682,2023-02-03,09:49,240643705.0,337.17645,-77.98,,Sports Direct
27772,2023-02-03,12:07,240643705.0,178.17645,-159.0,,Topshop
27778,2023-02-03,12:39,240643705.0,61.17645,-117.0,,Matalan
29413,2023-02-06,11:27,240643705.0,-96.9273,-158.1,,Tesco
36629,2023-02-23,09:19,240643705.0,-151.92945,-55.0,,Tesco
38377,2023-02-28,00:00,240643705.0,2766.07055,2918.0,,Remedy plus care
38378,2023-02-27,23:59,240643705.0,,-960.0,895750759.0,


- Again, even for personal transactions, the missing balance = the same account's previous balance + the current transaction amount.

## Balance Imputation Code
 

In [8]:
def impute_missing_balances(transactions):
    """Fill missing 'Balance' values from each account's own running balance.

    A missing balance is rebuilt as the previous balance of the *same account*
    (previous in row order) plus the current row's 'Amount'. The frame
    interleaves many accounts, so the row directly above a gap usually belongs
    to a different account and must not be used as the starting balance.

    A balance is left missing when it cannot be derived:
      * the row has no 'Account No', so it cannot be tied to an account;
      * the row is the first one seen for its account (no previous balance);
      * the previous balance or the current 'Amount' is itself missing.

    Parameters
    ----------
    transactions : pd.DataFrame
        Frame with 'Account No', 'Balance' and 'Amount' columns.

    Returns
    -------
    pd.DataFrame
        A copy of ``transactions`` with 'Balance' imputed where possible; the
        input frame is not modified.
    """
    imputed = transactions.copy()
    balances = imputed['Balance'].to_numpy(dtype=float, copy=True)
    amounts = imputed['Amount'].to_numpy(dtype=float)

    # For every row, the position of the previous row of the same account
    # (-1 when there is none, or when the account number is missing).
    row_positions = pd.Series(np.arange(len(imputed)))
    previous_position = (
        row_positions
        .groupby(imputed['Account No'].to_numpy())
        .shift()
        .fillna(-1)
        .astype(int)
        .to_numpy()
    )

    # Visit only the gaps, in ascending row order, so a run of consecutive
    # gaps within one account chains off the value filled just before it.
    for position in np.flatnonzero(np.isnan(balances)):
        previous = previous_position[position]
        if previous >= 0:
            balances[position] = balances[previous] + amounts[position]

    imputed['Balance'] = balances
    return imputed


bank = impute_missing_balances(df)

In [9]:
# count the remaining missing values in each column of the imputed frame
bank.isna().sum()

Date                         229
Timestamp                    251
Account No                   224
Balance                        0
Amount                       209
Third Party Account No    223764
Third Party Name            7079
dtype: int64