In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# load the second Lloyds dataset from its CSV file into a DataFrame
bank = pd.read_csv(filepath_or_buffer='simulated_transaction_2024.csv')

In [25]:
# preview the final ten rows of the dataset
bank.iloc[-10:]

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
230586,06/12/2023,20:39,166019891.0,9482.471051,-62.47,,AMAZON
230587,06/12/2023,20:40,975345315.0,40907.910578,-73.71,,Tesco
230588,06/12/2023,20:43,106601471.0,11920.401863,-37.25,,Coop Local
230589,06/12/2023,20:48,522188082.0,8471.141437,-17.05,,AMAZON
230590,06/12/2023,20:50,256002321.0,7582.389937,-24.8,,Sainsbury Local
230591,06/12/2023,20:54,581655972.0,45935.206861,-41.06,,Tesco
230592,06/12/2023,20:55,786141370.0,-244.8375,-62.35,,Sainsbury Local
230593,06/12/2023,21:05,824916823.0,9709.172159,-32.94,,Deliveroo
230594,06/12/2023,21:13,366550080.0,26834.165794,-19.25,,Amazon
230595,06/12/2023,21:32,259015404.0,1145.349837,-37.64,,JustEat


## Investigate Categories

In [9]:
# distinct values of 'Third Party Name'; unique() keeps NaN as an entry, so
# rows with no third-party name show up in the printed array as nan
merchants = bank['Third Party Name'].unique()
print(merchants)

# print number of unique merchants
# nunique() ignores NaN, so the missing-value marker is not counted as a merchant
# (len(merchants) would be one higher whenever the column contains NaN)
print('Number of unique merchants: ', bank['Third Party Name'].nunique())

['Westport Care Home' 'Barbiee Boutique' nan 'Fat Face' 'Lavender Primary'
 'A Cut Above' 'Tesco' 'Brilliant Brushes' 'Costa Coffee'
 'Premier Finance' 'North Face' 'Craftastic' 'Fitted Stitch' 'Selfridges'
 'Starbucks' 'The Crown' 'Cass Art' 'Green Park Academy' 'Rose & Crown'
 'Kings Arms' 'Sunny Care Nursery' 'A Yarn Story' 'Pets Corner'
 'Lavender Fields' 'Victoria Park' 'Halifax' 'LBG' 'Blizzard' 'Xbox'
 'Mojang Studios' 'PureGym' 'Disney' 'Netflix' 'Grand Union BJJ' 'Amazon'
 'SquareOnix' 'Deliveroo' 'JustEat' 'Coop Local' 'Sainsbury' 'AMAZON'
 'The Works' "Blackwell's" 'Topshop' 'Matalan' 'Foyles' 'Wool'
 'Hobby Lobby' 'Revella' 'Sainsbury Local' 'Loosely Fitted'
 'Stitch By Stitch' 'Coffee #1' 'Hobbycraft' 'Lloyds Pharmacy'
 'Gamestation' 'CeX' 'Etsy' 'Five Senses Art' 'Sports Direct' 'Boots'
 'Reebok' 'JD Sports' 'Head' 'Frankie & Bennies' 'Gap Kids'
 'University College Hospital' 'CPA' 'Happy Days Home' 'Pets at Home'
 'Jollyes' 'Collector Cave' 'Vision Express' 'Green Park'


- Got roughly the same number of merchants as in the previous dataset, but this time with actual business names.

#### Pricing distributions across similar merchants - supermarkets

In [29]:
# merchant spending: keep a row only when both conditions hold
#   * 'Amount' is negative, i.e. money is being spent out of the account
#   * 'Third Party Account No' is null, i.e. it isn't a payment to a personal account
merchant_spending = bank.loc[
    (bank['Amount'] < 0) & bank['Third Party Account No'].isnull()
]

merchant_spending.head(10)

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
495,01/01/2023,00:00,355733816.0,526.0,-664.0,,Halifax
496,01/01/2023,00:00,472213568.0,2412.0,-600.0,,LBG
497,01/01/2023,00:00,624500124.0,2572.0,-654.0,,LBG
498,01/01/2023,00:00,203466392.0,3935.66,-672.0,,Halifax
499,01/01/2023,00:00,768271776.0,2208.0,-632.0,,LBG
500,01/01/2023,00:00,564744955.0,938.0,-572.0,,Halifax
501,01/01/2023,00:00,675806859.0,452.0,-648.0,,LBG
502,01/01/2023,00:00,456221621.0,2167.0,-664.0,,Halifax
503,01/01/2023,00:00,350921975.0,1333.5,-665.0,,Halifax
504,01/01/2023,00:00,331450202.0,776.0,-631.0,,LBG


In [30]:
# one frame of merchant spending per supermarket:
# 'Tesco', 'Sainsbury Local' and 'Sainsbury'
tesco = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Tesco')]
sainsbury_local = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Sainsbury Local')]
sainsbury = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Sainsbury')]

# summary statistics of 'Amount' side by side, one labelled column per supermarket
supermarkets = pd.concat(
    [frame['Amount'].describe() for frame in (sainsbury_local, tesco, sainsbury)],
    axis=1,
    keys=['Sainsbury Local', 'Tesco', 'Sainsbury'],
)
supermarkets

Unnamed: 0,Sainsbury Local,Tesco,Sainsbury
count,3155.0,33905.0,3117.0
mean,-42.450051,-75.041815,-43.679734
std,42.938658,50.944684,43.357148
min,-320.75,-348.35,-303.8
25%,-57.905,-102.44,-60.28
50%,-31.12,-62.98,-31.31
75%,-11.935,-38.27,-11.97
max,-0.01,-0.01,-0.01


#### Pricing distributions over coffee shops

In [31]:
# one frame of merchant spending per coffee shop
starbucks = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Starbucks')]
costa_coffee = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Costa Coffee')]
coffee_1 = merchant_spending.loc[merchant_spending['Third Party Name'].eq('Coffee #1')]

# summary statistics of 'Amount' side by side, one labelled column per coffee shop
coffee = pd.concat(
    [frame['Amount'].describe() for frame in (starbucks, costa_coffee, coffee_1)],
    axis=1,
    keys=['Starbucks', 'Costa Coffee', 'Coffee #1'],
)
coffee

Unnamed: 0,Starbucks,Costa Coffee,Coffee #1
count,4821.0,677.0,2298.0
mean,-5.984505,-5.988774,-5.995735
std,1.509033,0.94516,1.785399
min,-90.0,-8.3,-80.0
25%,-6.5,-6.5,-6.5
50%,-5.8,-5.8,-5.75
75%,-5.4,-5.35,-5.4
max,-2.8,-2.8,-2.8


- We are beginning to see rough alignment between similar merchants in terms of the mean and variance of their spending amounts.

## Investigate Missing Values