In [44]:
import pandas as pd
import numpy as np
import seaborn as sns

# Let pandas render up to 300 rows before truncating displayed output.
pd.options.display.max_rows = 300

In [2]:
# Load the second Lloyds dataset; the path is relative to the notebook's
# working directory.
bank = pd.read_csv(filepath_or_buffer='simulated_transaction_2024.csv')

In [33]:
# Preview the first ten rows of the raw transactions.
bank.head(n=10)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
0,01/01/2023,00:00,678330503.0,2971.0,1584.0,,Westport Care Home
1,01/01/2023,00:00,472213568.0,3792.0,1950.0,,Barbiee Boutique
2,01/01/2023,00:00,472213568.0,3012.0,-780.0,283027736.0,
3,01/01/2023,00:00,283027736.0,1787.0,780.0,472213568.0,
4,01/01/2023,00:00,624500124.0,3226.0,1825.0,,Fat Face
5,01/01/2023,00:00,203466392.0,4607.66,2841.66,,Lavender Primary
6,01/01/2023,00:00,768271776.0,3620.0,1950.0,,A Cut Above
7,01/01/2023,00:00,768271776.0,2840.0,-780.0,215404070.0,
8,01/01/2023,00:00,215404070.0,1965.0,780.0,768271776.0,
9,01/01/2023,00:00,456221621.0,2831.0,1675.0,,Tesco


## Investigate Categories

In [9]:
# Distinct values of 'Third Party Name'. unique() keeps NaN as its own entry,
# so this array also contains the missing-name marker.
merchants = bank['Third Party Name'].unique()
print(merchants)

# print number of unique merchants
# nunique() skips NaN by default, so rows with a missing name are not counted
# as a merchant (len(merchants) is one too high whenever the column has nulls).
print('Number of unique merchants: ', bank['Third Party Name'].nunique())

['Westport Care Home' 'Barbiee Boutique' nan 'Fat Face' 'Lavender Primary'
 'A Cut Above' 'Tesco' 'Brilliant Brushes' 'Costa Coffee'
 'Premier Finance' 'North Face' 'Craftastic' 'Fitted Stitch' 'Selfridges'
 'Starbucks' 'The Crown' 'Cass Art' 'Green Park Academy' 'Rose & Crown'
 'Kings Arms' 'Sunny Care Nursery' 'A Yarn Story' 'Pets Corner'
 'Lavender Fields' 'Victoria Park' 'Halifax' 'LBG' 'Blizzard' 'Xbox'
 'Mojang Studios' 'PureGym' 'Disney' 'Netflix' 'Grand Union BJJ' 'Amazon'
 'SquareOnix' 'Deliveroo' 'JustEat' 'Coop Local' 'Sainsbury' 'AMAZON'
 'The Works' "Blackwell's" 'Topshop' 'Matalan' 'Foyles' 'Wool'
 'Hobby Lobby' 'Revella' 'Sainsbury Local' 'Loosely Fitted'
 'Stitch By Stitch' 'Coffee #1' 'Hobbycraft' 'Lloyds Pharmacy'
 'Gamestation' 'CeX' 'Etsy' 'Five Senses Art' 'Sports Direct' 'Boots'
 'Reebok' 'JD Sports' 'Head' 'Frankie & Bennies' 'Gap Kids'
 'University College Hospital' 'CPA' 'Happy Days Home' 'Pets at Home'
 'Jollyes' 'Collector Cave' 'Vision Express' 'Green Park'


- Got roughly the same number of merchants as in the last dataset, but with actual business names.

#### Pricing distributions across similar merchants - supermarkets

In [29]:
# Spending at merchants = money leaving the account (negative 'Amount') with
# no counterparty account number (a populated 'Third Party Account No' marks a
# payment to a personal account).
# NOTE(review): rows whose 'Third Party Name' is also missing are not excluded
# by this filter -- confirm whether they should be.
is_outgoing = bank['Amount'] < 0
has_no_counterparty_account = bank['Third Party Account No'].isna()
merchant_spending = bank.loc[is_outgoing & has_no_counterparty_account]

merchant_spending.head(n=10)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
495,01/01/2023,00:00,355733816.0,526.0,-664.0,,Halifax
496,01/01/2023,00:00,472213568.0,2412.0,-600.0,,LBG
497,01/01/2023,00:00,624500124.0,2572.0,-654.0,,LBG
498,01/01/2023,00:00,203466392.0,3935.66,-672.0,,Halifax
499,01/01/2023,00:00,768271776.0,2208.0,-632.0,,LBG
500,01/01/2023,00:00,564744955.0,938.0,-572.0,,Halifax
501,01/01/2023,00:00,675806859.0,452.0,-648.0,,LBG
502,01/01/2023,00:00,456221621.0,2167.0,-664.0,,Halifax
503,01/01/2023,00:00,350921975.0,1333.5,-665.0,,Halifax
504,01/01/2023,00:00,331450202.0,776.0,-631.0,,LBG


In [30]:
# One frame of outgoing transactions per supermarket:
# 'Tesco', 'Sainsbury Local' and 'Sainsbury'.
tesco = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Tesco')]
sainsbury_local = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Sainsbury Local')]
sainsbury = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Sainsbury')]

# Side-by-side summary statistics of 'Amount'; `keys` supplies the column labels.
supermarkets = pd.concat(
    [
        sainsbury_local['Amount'].describe(),
        tesco['Amount'].describe(),
        sainsbury['Amount'].describe(),
    ],
    axis=1,
    keys=['Sainsbury Local', 'Tesco', 'Sainsbury'],
)
supermarkets

Unnamed: 0,Sainsbury Local,Tesco,Sainsbury
count,3155.0,33905.0,3117.0
mean,-42.450051,-75.041815,-43.679734
std,42.938658,50.944684,43.357148
min,-320.75,-348.35,-303.8
25%,-57.905,-102.44,-60.28
50%,-31.12,-62.98,-31.31
75%,-11.935,-38.27,-11.97
max,-0.01,-0.01,-0.01


#### Pricing distributions over coffee shops

In [31]:
# One frame of outgoing transactions per coffee chain.
starbucks = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Starbucks')]
costa_coffee = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Costa Coffee')]
coffee_1 = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Coffee #1')]

# Side-by-side summary statistics of 'Amount'; `keys` supplies the column labels.
coffee = pd.concat(
    [
        starbucks['Amount'].describe(),
        costa_coffee['Amount'].describe(),
        coffee_1['Amount'].describe(),
    ],
    axis=1,
    keys=['Starbucks', 'Costa Coffee', 'Coffee #1'],
)
coffee

Unnamed: 0,Starbucks,Costa Coffee,Coffee #1
count,4821.0,677.0,2298.0
mean,-5.984505,-5.988774,-5.995735
std,1.509033,0.94516,1.785399
min,-90.0,-8.3,-80.0
25%,-6.5,-6.5,-6.5
50%,-5.8,-5.8,-5.75
75%,-5.4,-5.35,-5.4
max,-2.8,-2.8,-2.8


- Beginning to see rough alignment in terms of spending mean and variance

## Investigate Missing Values

In [35]:
# Count of missing values in each column (isna is the alias of isnull).
bank.isna().sum()

Date                         229
Timestamp                    251
Account No                   224
Balance                      246
Amount                       209
Third Party Account No    223764
Third Party Name            7079
dtype: int64

#### Null third party account id

In [38]:
# Rows where 'Third Party Account No' is missing.
missing_third_party_account = bank['Third Party Account No'].isna()
null_beneficiary_accounts = bank.loc[missing_third_party_account]
null_beneficiary_accounts

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
0,01/01/2023,00:00,678330503.0,2971.000000,1584.00,,Westport Care Home
1,01/01/2023,00:00,472213568.0,3792.000000,1950.00,,Barbiee Boutique
4,01/01/2023,00:00,624500124.0,3226.000000,1825.00,,Fat Face
5,01/01/2023,00:00,203466392.0,4607.660000,2841.66,,Lavender Primary
6,01/01/2023,00:00,768271776.0,3620.000000,1950.00,,A Cut Above
...,...,...,...,...,...,...,...
230591,06/12/2023,20:54,581655972.0,45935.206861,-41.06,,Tesco
230592,06/12/2023,20:55,786141370.0,-244.837500,-62.35,,Sainsbury Local
230593,06/12/2023,21:05,824916823.0,9709.172159,-32.94,,Deliveroo
230594,06/12/2023,21:13,366550080.0,26834.165794,-19.25,,Amazon


- NaNs in 'Third Party Account No' indicate a transaction with a merchant. If the amount is negative, it is a payment to the merchant; if positive, it is a payment from the merchant - most likely a refund or salary.

#### Null third party name

In [39]:
# Rows where 'Third Party Name' (the merchant name) is missing.
missing_third_party_name = bank['Third Party Name'].isna()
null_merchant_name = bank.loc[missing_third_party_name]
null_merchant_name

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
2,01/01/2023,00:00,472213568.0,3012.000000,-780.00,283027736.0,
3,01/01/2023,00:00,283027736.0,1787.000000,780.00,472213568.0,
7,01/01/2023,00:00,768271776.0,2840.000000,-780.00,215404070.0,
8,01/01/2023,00:00,215404070.0,1965.000000,780.00,768271776.0,
11,01/01/2023,00:00,350921975.0,1998.500000,-460.00,633362865.0,
...,...,...,...,...,...,...,...
228822,04/12/2023,01:08,336171116.0,428.428400,380.00,973388795.0,
228859,04/12/2023,03:09,349614277.0,748.375100,-950.00,871200314.0,
228860,04/12/2023,03:09,871200314.0,539.105124,950.00,349614277.0,
229016,04/12/2023,08:34,215426302.0,1118.800404,-43.20,,


- NaNs in the third party name (merchant) indicate a personal payment to another account - e.g. rent, or potentially fraud?

#### Null values in balance

In [51]:
# Rows where 'Balance' is missing; preview the first five.
null_balance = bank.loc[bank['Balance'].isna()]
null_balance.head(n=5)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
1392,01/01/2023,00:00,246753533.0,,-100.0,,Grand Union BJJ
4132,01/01/2023,00:00,726913482.0,,-5.99,,Mojang Studios
4593,01/01/2023,00:00,335698481.0,,-18.99,,PureGym
4771,01/01/2023,09:49,706250160.0,,-60.92,,The Works
4799,01/01/2023,10:01,647472692.0,,-27.96,,The Works


#### Null sender account number

In [50]:
# Rows where the sending account ('Account No') is missing; preview the first five.
null_sender_account = bank.loc[bank['Account No'].isna()]
null_sender_account.head(n=5)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
527,01/01/2023,00:00,,813.0,-1143.0,,Halifax
1295,01/01/2023,00:00,,2284.03,-18.99,,PureGym
1335,01/01/2023,00:00,,1542.03,-18.99,,PureGym
1563,01/01/2023,00:00,,3351.01,-15.99,,Netflix
1703,01/01/2023,00:00,,3785.78,-18.99,,PureGym


#### Null timestamp

In [54]:
# Rows where 'Timestamp' is missing; preview the first five.
null_timestamp = bank.loc[bank['Timestamp'].isna()]
null_timestamp.head(n=5)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
2734,01/01/2023,,593191200.0,1490.02,-14.99,,Blizzard
6129,02/01/2023,,572384506.0,3193.04,-9.99,,The Works
6527,03/01/2023,,196289502.0,855.8792,-47.56,,Tesco
6693,03/01/2023,,421475159.0,1375.453,-99.98,,Gamestation
7227,04/01/2023,,760704118.0,2636.7748,-221.89,,Fat Face


#### Null dates

In [57]:
# Rows where 'Date' is missing; preview the first five.
null_dates = bank.loc[bank['Date'].isna()]
null_dates.head(n=5)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
3918,,00:00,554792076.0,575.02,-5.99,,Mojang Studios
4106,,00:00,581655972.0,6895.01,-15.99,,Netflix
4511,,00:00,711140907.0,1217.0,-7.0,,Xbox
4710,,09:23,310620484.0,992.08,-11.98,,The Works
5931,,09:33,550832673.0,631.12,-53.92,,Blackwell's
