In [2]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Preprocess transactional data

We drop all personal-account-to-personal-account (C2C) transactions and keep only the payments made to businesses (C2B).

In [3]:
# Load the full transaction table; the relative path is resolved against the
# notebook's current working directory.
bank_max = pd.read_csv('mapped_df.csv')

In [4]:
# Preview the first five rows to check the columns and value formats.
bank_max.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,group
0,10371.0,4.0,CINEMA,01/01/2025,Cinema
1,88339.0,2.4,40544,01/01/2025,
2,18555.0,2.4,85149,01/01/2025,
3,18555.0,4.1,HIPSTER_COFFEE_SHOP,01/01/2025,Cafe
4,80792.0,1.95,18555,01/01/2025,


In [5]:
# Rows without a value in 'group' are customer-to-customer (C2C) transfers.
# Discard them so that only customer-to-business (C2B) payments remain.
bank = bank_max.dropna(axis='index', subset=['group'])

bank.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,group
0,10371.0,4.0,CINEMA,01/01/2025,Cinema
3,18555.0,4.1,HIPSTER_COFFEE_SHOP,01/01/2025,Cafe
5,18555.0,4.45,TOTALLY_A_REAL_COFFEE_SHOP,01/01/2025,Cafe
6,18555.0,1.45,COFFEE_SHOP,01/01/2025,Cafe
7,18555.0,5.0,CAFE,01/01/2025,Cafe


In [6]:
# Example account to inspect; account IDs are stored as floats in this data.
EXAMPLE_ACCOUNT_ID = 10371.0

# Keep only this account's business payments, then drop both the recipient
# ('to_randomly_generated_account') and the date ('not_happened_yet_date')
# columns, leaving the account, the amount and the spending category.
bank_id = (
    bank.loc[bank['from_totally_fake_account'] == EXAMPLE_ACCOUNT_ID]
    .drop(columns=['to_randomly_generated_account', 'not_happened_yet_date'])
)

bank_id.head(7)

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,group
0,10371.0,4.0,Cinema
8107,10371.0,1.45,Cafe
8109,10371.0,2.55,Cafe
8118,10371.0,2.65,Cafe
18537,10371.0,8.99,Streaming Service
29117,10371.0,2.2,Cafe
29120,10371.0,2.65,Cafe


In [7]:
# Show the index of the filtered frame: the labels are the original row labels
# carried over from the unfiltered data (the index has not been reset).
print(bank_id.index)

Index([       0,     8107,     8109,     8118,    18537,    29117,    29120,
          59058,    59059,    59060,
       ...
       10040626, 10040652, 10087270, 10103856, 10103857, 10121181, 10139127,
       10139128, 10139141, 10139159],
      dtype='int64', length=1347)


## Pivot to compute average spend per category - for a single account number

In [8]:
# Mean transaction amount per spending category for the single selected
# account: one row for the account, one column per category it has paid into.
bank_id_pivoted = pd.pivot_table(
    bank_id,
    values='monopoly_money_amount',
    index='from_totally_fake_account',
    columns='group',
    aggfunc='mean',
)

bank_id_pivoted.head(20)

group,Alcohol Store,Bar,Cafe,Cinema,Fast Food,Gaming,Home,Pets,Pub,Restaurant/Takeaway,Roasterie/Tea Shop,Streaming Service,Supermarket
from_totally_fake_account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10371.0,16.995,12.316062,2.413517,6.451613,3.928571,43.0,47.0,20.25,13.87063,30.227917,22.0425,8.99,96.84942


## Pivot for every account number

In [16]:
# Same aggregation as for the single account, now over every account:
# rows are accounts, columns are spending categories, cells are the mean
# transaction amount for that (account, category) pair.
bank_category_pivoted = bank.pivot_table(
    values='monopoly_money_amount',
    index='from_totally_fake_account',
    columns='group',
    aggfunc='mean',
)

# (number of accounts, number of categories)
bank_category_pivoted.shape

(8142, 26)

- The DataFrame has the expected shape: one row per account (8,142 rows) and one column per spending category (26 columns).

In [17]:
# Preview the first ten accounts; NaN cells are (account, category) pairs with
# no transactions to average.
bank_category_pivoted.head(10)

group,Accessory Shop,Alcohol Store,Bar,Book Store,Butcher,Cafe,Children's Shop,Cinema,Clothing Store,Cook Shop,...,Home,Pets,Pub,Restaurant/Takeaway,Roasterie/Tea Shop,Sports Store,Streaming Service,Supermarket,Tech Store,Turkey Farm
from_totally_fake_account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000.0,,38.326667,11.895753,8.0,,2.40492,80.0,6.146341,145.0,,...,46.5,25.190476,15.157426,34.327447,22.726667,,8.99,85.24125,,
1002.0,,51.656667,11.846698,,,2.348301,67.0,6.18,,,...,,,16.369496,31.992979,20.606667,87.5,8.99,95.055455,,
1006.0,,21.59,11.776316,11.444444,21.521449,2.428012,59.2,5.820513,65.0,16.555556,...,44.0,,15.5744,30.310769,31.36,,8.99,20.693333,,
1018.0,,19.995,5.955986,10.0,21.617021,2.507692,59.0,5.6,118.176471,,...,71.0,30.0,6.4,34.061923,26.365,84.0,,23.003929,,
1053.0,,59.99,11.77451,,18.0,2.688333,33.25,6.166667,111.347826,,...,42.454545,,13.153661,19.236889,31.8175,115.0,8.99,86.093846,79.0,
1059.0,,10.0,12.035294,10.434783,,2.392663,50.764706,4.0,128.0,,...,,,15.006667,35.029767,34.015,68.666667,,67.843125,,
1071.0,,36.99,6.47037,17.5,30.0,2.473515,13.0,6.96875,,23.0,...,,,10.407987,33.52439,22.12,68.384615,8.99,100.036667,235.5,
1078.0,,16.995,12.070621,,15.923077,2.397773,49.642857,6.52381,,13.333333,...,,,15.54747,31.175111,39.09,80.5,8.99,90.596557,,
1086.0,,56.79,5.773529,12.5,22.218182,2.470244,52.045455,6.243902,116.6,,...,51.75,60.0,11.298896,31.097143,38.033333,117.0,8.99,23.457083,,
1087.0,,29.99,12.090734,7.166667,,2.447219,55.5,4.75,53.0,25.0,...,,,14.815909,33.934681,17.273333,,,114.45,,


- NaNs appear wherever an account has no transactions in a category.
- Replace these values with £0, so that a category with no spending counts as an average spend of zero.

In [18]:
# An account with no transactions in a category has no mean (NaN);
# record that as an average spend of 0 instead.
bank_category_pivoted_final = bank_category_pivoted.fillna(value=0)

bank_category_pivoted_final.head(10)

group,Accessory Shop,Alcohol Store,Bar,Book Store,Butcher,Cafe,Children's Shop,Cinema,Clothing Store,Cook Shop,...,Home,Pets,Pub,Restaurant/Takeaway,Roasterie/Tea Shop,Sports Store,Streaming Service,Supermarket,Tech Store,Turkey Farm
from_totally_fake_account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000.0,0.0,38.326667,11.895753,8.0,0.0,2.40492,80.0,6.146341,145.0,0.0,...,46.5,25.190476,15.157426,34.327447,22.726667,0.0,8.99,85.24125,0.0,0.0
1002.0,0.0,51.656667,11.846698,0.0,0.0,2.348301,67.0,6.18,0.0,0.0,...,0.0,0.0,16.369496,31.992979,20.606667,87.5,8.99,95.055455,0.0,0.0
1006.0,0.0,21.59,11.776316,11.444444,21.521449,2.428012,59.2,5.820513,65.0,16.555556,...,44.0,0.0,15.5744,30.310769,31.36,0.0,8.99,20.693333,0.0,0.0
1018.0,0.0,19.995,5.955986,10.0,21.617021,2.507692,59.0,5.6,118.176471,0.0,...,71.0,30.0,6.4,34.061923,26.365,84.0,0.0,23.003929,0.0,0.0
1053.0,0.0,59.99,11.77451,0.0,18.0,2.688333,33.25,6.166667,111.347826,0.0,...,42.454545,0.0,13.153661,19.236889,31.8175,115.0,8.99,86.093846,79.0,0.0
1059.0,0.0,10.0,12.035294,10.434783,0.0,2.392663,50.764706,4.0,128.0,0.0,...,0.0,0.0,15.006667,35.029767,34.015,68.666667,0.0,67.843125,0.0,0.0
1071.0,0.0,36.99,6.47037,17.5,30.0,2.473515,13.0,6.96875,0.0,23.0,...,0.0,0.0,10.407987,33.52439,22.12,68.384615,8.99,100.036667,235.5,0.0
1078.0,0.0,16.995,12.070621,0.0,15.923077,2.397773,49.642857,6.52381,0.0,13.333333,...,0.0,0.0,15.54747,31.175111,39.09,80.5,8.99,90.596557,0.0,0.0
1086.0,0.0,56.79,5.773529,12.5,22.218182,2.470244,52.045455,6.243902,116.6,0.0,...,51.75,60.0,11.298896,31.097143,38.033333,117.0,8.99,23.457083,0.0,0.0
1087.0,0.0,29.99,12.090734,7.166667,0.0,2.447219,55.5,4.75,53.0,25.0,...,0.0,0.0,14.815909,33.934681,17.273333,0.0,0.0,114.45,0.0,0.0
