In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [97]:
#print max rows
pd.set_option('display.max_rows', 500)

bank = pd.read_csv('fake_transactional_data_24.csv')

In [145]:
bank.shape

(10148280, 5)

In [98]:
# assign a column to bank dataframe that stores 1 if the value in 'to_randomly_generated_account' is non-numerical and 0 otherwise
bank['test_string'] = bank['to_randomly_generated_account'].apply(lambda x: 1 if not x.isdigit() else 0)

# print unique values in 'to_randomly_generated_account' column where 'test_string' is 1
print(bank[bank['test_string'] == 1]['to_randomly_generated_account'].unique())

#print number of unique values in 'to_randomly_generated_account' column where 'test_string' is 1
print(bank[bank['test_string'] == 1]['to_randomly_generated_account'].nunique())

['CINEMA' 'HIPSTER_COFFEE_SHOP' 'TOTALLY_A_REAL_COFFEE_SHOP' 'COFFEE_SHOP'
 'CAFE' 'A_CAFE' 'LOCAL_RESTAURANT' 'A_LOCAL_COFFEE_SHOP'
 'GOURMET_COFFEE_SHOP' 'LOCAL_WATERING_HOLE' 'SANDWICH_SHOP' 'TOY_SHOP'
 'PRETENTIOUS_COFFEE_SHOP' 'BAR' 'PUB' 'COMIC_BOOK_SHOP' 'LUNCH_VAN'
 'DEPARTMENT_STORE' 'KEBAB_SHOP' 'WINE_BAR' 'ELECTRONICS_SHOP'
 'RESTAURANT' 'LOCAL_PUB' 'LUNCH_PLACE' 'FASHION_SHOP'
 'FASHIONABLE_SPORTSWARE_SHOP' 'SCHOOL_SUPPLY_STORE' 'LOCAL_BOOKSHOP'
 'TRAINER_SHOP' 'BOOKSHOP' 'KIDS_ACTIVITY_CENTRE' 'VIDEO_GAME_STORE'
 'CLOTHES_SHOP' 'TAKEAWAY_CURRY' 'TECH_SHOP' 'NERDY_BOOK_STORE'
 'WHISKEY_BAR' 'PET_TOY_SHOP' 'DVD_SHOP' 'CHILDRENDS_SHOP' 'GAME_SHOP'
 'INDIAN_RESTAURANT' 'COCKTAIL_BAR' 'RUNNING_SHOP' 'DIY_STORE' 'COOKSHOP'
 'HOME_IMPROVEMENT_STORE' 'PET_SHOP' 'CHINESE_TAKEAWAY' 'BUTCHERS'
 'SECOND_HAND_BOOKSHOP' 'G&T_BAR' 'GREENGROCER' 'JEWLLERY_SHOP'
 'ACCESSORY_SHOP' 'TAKEAWAY' 'KIDS_CLOTHING_SHOP' 'SPORT_SHOP'
 'STEAK_HOUSE' 'HIPSTER_ELECTRONICS_SHOP' 'CHINESE_RESTAURANT'
 'S

## Current categorisation

### Proposed categories:

- `Alcohol shop`
- `Coffee`
- `Restaurant`
- `Entertainment` - cinemas and online streaming
- `Home Goods`
- `Supermarkets`
- `Apparrel/Clothes`
- `Book stores`
- `Gym`  
- `Pub/bars`
- `Takeaway`
- `Sports shops`
- `Children`
- `Pet shops`   
- `Technology stores`
- `Greengrocer`
- `Butcher`
- `Game/DVD store`

In [99]:
# regex search patterns
val_replace_cafe = ['coffee', 'tea', 'cafe']
val_replace_book_store = ['book', 'book store', 'bookshop']  
val_replace_alcohol = ['WINE_CELLAR', 'alcohol', 'liquor','whiskey']
val_replace_pub = ['bar','pub','LOCAL_WATERING_HOLE']
val_replace_restaurant = ['restaurant','SEAFOOD_RESAURANT' , 'steak house', 'KEBAB_SHOP', 'SANDWICH_SHOP', 'roasterie','lunch']
val_replace_entertainment = ['cinema','streaming']
val_replace_home = ['DIY', 'home', 'COOKSHOP']
val_replace_supermarket = ['supermarket']
val_replace_green_grocer = ['GREENGROCER']
val_replace_clothing = ['clothing', 'fashion', 'clothes', 'ACCESSORY_SHOP']
val_replace_gym = ['gym']
val_replace_takeaway = ['takeaway']
val_replace_sports_store = ['sport','sportsware','running','trainer','FASHIONABLE_SPORTSWARE_SHOP']
val_replace_children = ['child','toy', 'kids', 'CHILDRENDS_SHOP']
val_replace_pets = ['pet', 'PET_TOY_SHOP']
val_replace_tech_store = ['tech','electronics']
val_replace_butcher = ['BUTCHER', 'TURKEY_FARM']
val_replace_game = ['game', 'DVD']
val_replace_jeweller = ['JEWLLERY_SHOP']

# replacement group categories
merchant_names = ['Cafe', 'Book Store','Alcohol', 'Pub', 'Restaurant', 'Entertainment', 'Home',
                  'Supermarket', 'Green Grocer', 'Clothing', 'Gym', 'Takeaway', 'Sports Store', 'Children','Pets', 'Tech Store', 'Butcher', 'Game', 'Jewellery']

In [100]:
grouping_conditions = [
    
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_cafe), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_book_store), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_alcohol), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_pub), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_restaurant), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_entertainment), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_home), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_supermarket), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_green_grocer), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_clothing), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_gym), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_takeaway), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_sports_store), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_children), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_pets), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_tech_store), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_butcher), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_game), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_jeweller), case=False, regex=True)

]


In [101]:
# assign new merchant categories
bank['business_name'] = np.select(grouping_conditions, merchant_names, default='')

## Spending distributions for kids categories

`KIDS_ACTIVITY_CENTRE`, `KIDS_CLOTHING_SHOP`,`SCHOOL_SUPPLY_STORE`,`TOY_SHOP`,`CHILDRENDS_SHOP`

In [102]:
# drop 'test_string' column
bank = bank.drop('test_string', axis=1)

In [103]:
# select rows where 'to_randomly_generated_account'== 'CHILDRENDS_SHOP'
bank_childrends_shop=bank[bank['to_randomly_generated_account'] == 'CHILDRENDS_SHOP']
bank_kids_activity_centre=bank[bank['to_randomly_generated_account'] == 'KIDS_ACTIVITY_CENTRE']
bank_kids_clothing_shop=bank[bank['to_randomly_generated_account'] == 'KIDS_CLOTHING_SHOP']
bank_toy_shop=bank[bank['to_randomly_generated_account'] == 'TOY_SHOP']
bank_school_supply_store=bank[bank['to_randomly_generated_account'] == 'SCHOOL_SUPPLY_STORE']


# print the first 5 rows of the bank_childrends_shop dataframe
bank_childrends_shop.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
372,73526.0,12.0,CHILDRENDS_SHOP,01/01/2025,Children
402,63045.0,70.0,CHILDRENDS_SHOP,01/01/2025,Children
833,86227.0,46.0,CHILDRENDS_SHOP,01/01/2025,Children
1259,55112.0,40.0,CHILDRENDS_SHOP,01/01/2025,Children
1447,23234.0,25.0,CHILDRENDS_SHOP,01/01/2025,Children


In [104]:
bank_kids_activity_centre.head()    

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
282,16155.0,97.0,KIDS_ACTIVITY_CENTRE,01/01/2025,Children
514,97810.0,84.0,KIDS_ACTIVITY_CENTRE,01/01/2025,Children
854,4518.0,55.0,KIDS_ACTIVITY_CENTRE,01/01/2025,Children
1085,15315.0,63.0,KIDS_ACTIVITY_CENTRE,01/01/2025,Children
1274,47699.0,72.0,KIDS_ACTIVITY_CENTRE,01/01/2025,Children


In [105]:
kids_shop_distri = pd.concat([bank_childrends_shop['monopoly_money_amount'].describe(), bank_kids_activity_centre['monopoly_money_amount'].describe(), bank_kids_clothing_shop['monopoly_money_amount'].describe(), bank_toy_shop['monopoly_money_amount'].describe(), bank_school_supply_store['monopoly_money_amount'].describe()], axis=1)

col_names = ['bank_childrends_shop', 'bank_kids_activity_centre', 'bank_kids_clothing_shop', 'bank_toy_shop', 'bank_school_supply_store']

kids_shop_distri.columns = col_names

kids_shop_distri

Unnamed: 0,bank_childrends_shop,bank_kids_activity_centre,bank_kids_clothing_shop,bank_toy_shop,bank_school_supply_store
count,16638.0,16519.0,16582.0,16422.0,16383.0
mean,55.06,55.299,55.057,55.433,54.713
std,26.017,26.121,26.007,25.985,26.168
min,10.0,10.0,10.0,10.0,10.0
25%,33.0,33.0,33.0,33.0,32.0
50%,55.0,55.0,55.0,56.0,54.0
75%,78.0,78.0,78.0,78.0,77.0
max,100.0,100.0,100.0,100.0,100.0


- Can see clearly all the means and variances are roughly equal.
- It is safe to combine these merchants into a single 'kids' category.

## Spending categories for entertainment

`CINEMA` `GAME_SHOP` `DVD_SHOP` `STREAMING_SERVICE` `VIDEO_GAME_STORE`

In [106]:
# select rows where 'to_randomly_generated_account'== 'CHILDRENDS_SHOP'
bank_cinema=bank[bank['to_randomly_generated_account'] == 'CINEMA']
bank_game_shop=bank[bank['to_randomly_generated_account'] == 'GAME_SHOP']
bank_dvd_shop=bank[bank['to_randomly_generated_account'] == 'DVD_SHOP']
bank_streaming_service=bank[bank['to_randomly_generated_account'] == 'STREAMING_SERVICE']
bank_video_game=bank[bank['to_randomly_generated_account'] == 'VIDEO_GAME_STORE']


bank_streaming_service.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
13568,66821.0,8.99,STREAMING_SERVICE,01/01/2025,Entertainment
13569,20801.0,8.99,STREAMING_SERVICE,01/01/2025,Entertainment
13570,15896.0,8.99,STREAMING_SERVICE,01/01/2025,Entertainment
13571,90153.0,8.99,STREAMING_SERVICE,01/01/2025,Entertainment
13572,36202.0,8.99,STREAMING_SERVICE,01/01/2025,Entertainment


In [107]:
entertainment_distri = pd.concat([bank_cinema['monopoly_money_amount'].describe(),
                                  bank_game_shop['monopoly_money_amount'].describe(),
                                  bank_dvd_shop['monopoly_money_amount'].describe(),
                                  bank_streaming_service['monopoly_money_amount'].describe(),
                                  bank_video_game['monopoly_money_amount'].describe()], axis=1)

col_names = ['CINEMA', 'GAME_SHOP', 'DVD_SHOP', 'STREAMING_SERVICE', 'VIDEO_GAME_STORE']

entertainment_distri.columns = col_names

pd.options.display.float_format = '{:.3f}'.format
entertainment_distri

Unnamed: 0,CINEMA,GAME_SHOP,DVD_SHOP,STREAMING_SERVICE,VIDEO_GAME_STORE
count,67715.0,19138.0,67686.0,39444.0,19210.0
mean,6.401,40.071,6.398,8.99,40.105
std,2.065,11.669,2.056,0.0,11.747
min,4.0,20.0,4.0,8.99,20.0
25%,5.0,30.0,5.0,8.99,30.0
50%,6.0,40.0,6.0,8.99,40.0
75%,7.0,50.0,7.0,8.99,50.0
max,10.0,60.0,10.0,8.99,60.0


- Cinema and DVD_shop can be grouped together.
- Game shop and video game shop can be grouped together.
- Streaming service should be an independant category.

## Spending categories for home goods


`HOME_IMPROVEMENT_STORE` `DIY_STORE` 

In [108]:
bank_home_improvement = bank[bank['to_randomly_generated_account'] == 'HOME_IMPROVEMENT_STORE']
bank_diy = bank[bank['to_randomly_generated_account'] == 'DIY_STORE']

bank_home_improvement.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
687,25168.0,44.0,HOME_IMPROVEMENT_STORE,01/01/2025,Home
1303,27649.0,26.0,HOME_IMPROVEMENT_STORE,01/01/2025,Home
1439,50741.0,90.0,HOME_IMPROVEMENT_STORE,01/01/2025,Home
1674,66817.0,28.0,HOME_IMPROVEMENT_STORE,01/01/2025,Home
5846,51408.0,42.0,HOME_IMPROVEMENT_STORE,01/01/2025,Home


In [109]:
home = pd.concat([bank_home_improvement['monopoly_money_amount'].describe(),
                  bank_diy['monopoly_money_amount'].describe()], axis=1)

col_names = ['HOME_IMPROVEMENT_STORE', 'DIY_STORE']

home.columns = col_names

pd.options.display.float_format = '{:.3f}'.format

home

Unnamed: 0,HOME_IMPROVEMENT_STORE,DIY_STORE
count,6350.0,6522.0
mean,47.365,46.774
std,24.749,24.896
min,5.0,5.0
25%,26.0,25.0
50%,47.0,47.0
75%,69.0,69.0
max,90.0,90.0


- The variances and means are aligned well for these categories so we can justify these merchants being included into a single category.

## Spending categories for sports apparrel and clothing shops

`RUNNING_SHOP` `SPORT_SHOP` `FASHIONABLE_SPORTSWARE_SHOP` `DEPARTMENT_STORE` `FASHION_SHOP` `CLOTHES_SHOP` `TRAINER_SHOP` `JEWLLERY_SHOP`

In [110]:
bank_running_shop = bank[bank['to_randomly_generated_account'] == 'RUNNING_SHOP']
bank_sports_shop = bank[bank['to_randomly_generated_account'] == 'SPORT_SHOP']
bank_fash_sports = bank[bank['to_randomly_generated_account'] == 'FASHIONABLE_SPORTSWARE_SHOP']
bank_dep_store = bank[bank['to_randomly_generated_account'] == 'DEPARTMENT_STORE']
bank_fashion_shop = bank[bank['to_randomly_generated_account'] == 'FASHION_SHOP']
bank_clothes_shop = bank[bank['to_randomly_generated_account'] == 'CLOTHES_SHOP']
bank_trainers_shop = bank[bank['to_randomly_generated_account'] == 'TRAINER_SHOP']
bank_accessory_shop = bank[bank['to_randomly_generated_account'] == 'ACCESSORY_SHOP']
bank_jewellery_shop = bank[bank['to_randomly_generated_account'] == 'JEWLLERY_SHOP']

bank_sports_shop.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
1369,95243.0,61.0,SPORT_SHOP,01/01/2025,Sports Store
6694,95000.0,48.0,SPORT_SHOP,01/01/2025,Sports Store
7080,70215.0,94.0,SPORT_SHOP,01/01/2025,Sports Store
11719,20952.0,115.0,SPORT_SHOP,01/01/2025,Sports Store
15902,45355.0,98.0,SPORT_SHOP,01/01/2025,Sports Store


In [111]:
clothes = pd.concat([bank_running_shop['monopoly_money_amount'].describe(),
                    bank_sports_shop['monopoly_money_amount'].describe(),
                    bank_fash_sports['monopoly_money_amount'].describe(), 
                    bank_dep_store['monopoly_money_amount'].describe(),
                    bank_fashion_shop['monopoly_money_amount'].describe(),
                    bank_clothes_shop['monopoly_money_amount'].describe(),
                    bank_trainers_shop['monopoly_money_amount'].describe(),
                    bank_accessory_shop['monopoly_money_amount'].describe(),
                    bank_jewellery_shop['monopoly_money_amount'].describe()],
                    axis=1)

col_names = ['RUNNING_SHOP','SPORT_SHOP', 'FASHIONABLE_SPORTSWARE_SHOP', 'DEPARTMENT_STORE','FASHION_SHOP', 'CLOTHES_SHOP', 'TRAINER_SHOP', 'ACCESSORY_SHOP', 'JEWLLERY_SHOP']

clothes.columns = col_names

pd.options.display.float_format = '{:.3f}'.format

clothes

Unnamed: 0,RUNNING_SHOP,SPORT_SHOP,FASHIONABLE_SPORTSWARE_SHOP,DEPARTMENT_STORE,FASHION_SHOP,CLOTHES_SHOP,TRAINER_SHOP,ACCESSORY_SHOP,JEWLLERY_SHOP
count,3858.0,3876.0,3825.0,24893.0,24816.0,24723.0,3913.0,3325.0,3247.0
mean,74.254,74.925,75.714,115.561,114.588,114.417,75.002,22.515,21.961
std,25.681,26.024,25.955,48.862,49.206,49.332,25.987,10.192,10.24
min,30.0,30.0,30.0,30.0,30.0,30.0,30.0,5.0,5.0
25%,52.0,52.0,53.0,74.0,72.0,72.0,53.0,14.0,13.0
50%,73.0,76.0,76.0,116.0,114.0,114.0,75.0,22.0,22.0
75%,96.0,98.0,98.0,158.0,157.0,157.0,98.0,32.0,31.0
max,120.0,120.0,120.0,200.0,200.0,200.0,120.0,40.0,40.0


- Sports shops should a distinct category - and running shop + trainer shop can be included in this category
- Department store can be combined with regular clothes/fashion stores.
- Jewellery shop and accessory shop can be potentially combined into a single category.

## Spending categories for fresh meat stores

`BUTCHERS` `BUTCHER` `TURKEY_FARM`

In [112]:
bank_butcher1 = bank[bank['to_randomly_generated_account'] == 'BUTCHER']
bank_butcher2 = bank[bank['to_randomly_generated_account'] == 'BUTCHERS']
bank_turkey_farm = bank[bank['to_randomly_generated_account'] == 'TURKEY_FARM']

In [113]:
fresh_meat = pd.concat([bank_sports_shop['monopoly_money_amount'].describe(),
                    bank_fash_sports['monopoly_money_amount'].describe(), 
                    bank_dep_store['monopoly_money_amount'].describe()], axis=1)

col_names = ['BUTCHER', 'BUTCHERS', 'TURKEY_FARM']

fresh_meat.columns = col_names

pd.options.display.float_format = '{:.3f}'.format

fresh_meat

Unnamed: 0,BUTCHER,BUTCHERS,TURKEY_FARM
count,3876.0,3825.0,24893.0
mean,74.925,75.714,115.561
std,26.024,25.955,48.862
min,30.0,30.0,30.0
25%,52.0,53.0,74.0
50%,76.0,76.0,116.0
75%,98.0,98.0,158.0
max,120.0,120.0,200.0


- Both butchers have roughly the same variance and average spend. 
- Turkey farm should be separated out into another category.

## Spending categories for supermarkets

`GREENGROCER` `EXPRESS_SUPERMARKET` `LARGE_SUPERMARKET` `THE_SUPERMARKET` `A_SUPERMARKET` 

In [114]:
bank_greengrocer = bank[bank['to_randomly_generated_account'] == 'GREENGROCER']
bank_express_supermarket = bank[bank['to_randomly_generated_account'] == 'EXPRESS_SUPERMARKET']
bank_large_supermarket = bank[bank['to_randomly_generated_account'] == 'LARGE_SUPERMARKET']
bank_the_supermarket = bank[bank['to_randomly_generated_account'] == 'THE_SUPERMARKET']
bank_a_supermarket = bank[bank['to_randomly_generated_account'] == 'A_SUPERMARKET']

bank_greengrocer.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
829,6782.0,28.0,GREENGROCER,01/01/2025,Green Grocer
976,41707.0,11.0,GREENGROCER,01/01/2025,Green Grocer
1291,53618.0,27.0,GREENGROCER,01/01/2025,Green Grocer
1659,26819.0,26.0,GREENGROCER,01/01/2025,Green Grocer
1941,66899.0,14.0,GREENGROCER,01/01/2025,Green Grocer


- All the supermarkets can be grouped together due to similar means and variances.
- Greengrocer should be kept as a distinct category - due to its lower average spend and lower variance in spending.

## Spending categories for coffee/tea shops

`TOTALLY_A_REAL_COFFEE_SHOP` 
`PRETENTIOUS_COFFEE_SHOP` 
`LOCAL_WATERING_HOLE` 
`A_CAFE` 
`HIPSTER_COFFEE_SHOP` 
`COFFEE_SHOP` 
`CAFE` 
`GOURMET_COFFEE_SHOP` 
`A_LOCAL_COFFEE_SHOP`
`TEA_SHOP`

In [115]:
bank_TOTALLY_A_REAL_COFFEE_SHOP = bank[bank['to_randomly_generated_account'] == 'TOTALLY_A_REAL_COFFEE_SHOP']
bank_PRETENTIOUS_COFFEE_SHOP = bank[bank['to_randomly_generated_account'] == 'PRETENTIOUS_COFFEE_SHOP']
bank_LOCAL_WATERING_HOLE = bank[bank['to_randomly_generated_account'] == 'LOCAL_WATERING_HOLE']
bank_A_CAFE= bank[bank['to_randomly_generated_account'] == 'A_CAFE']
bank_HIPSTER_COFFEE_SHOP = bank[bank['to_randomly_generated_account'] == 'HIPSTER_COFFEE_SHOP']
bank_COFFEE_SHOP = bank[bank['to_randomly_generated_account'] == 'COFFEE_SHOP']
bank_CAFE = bank[bank['to_randomly_generated_account'] == 'CAFE']
bank_GOURMET_COFFEE_SHOP = bank[bank['to_randomly_generated_account'] == 'GOURMET_COFFEE_SHOP']
bank_A_LOCAL_COFFEE_SHOP = bank[bank['to_randomly_generated_account'] == 'A_LOCAL_COFFEE_SHOP']
bank_TEA_SHOP = bank[bank['to_randomly_generated_account'] == 'TEA_SHOP']

bank_CAFE.tail()    

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
10148131,43221.0,3.5,CAFE,31/12/2025,Cafe
10148139,56527.0,6.0,CAFE,31/12/2025,Cafe
10148142,61756.0,4.5,CAFE,31/12/2025,Cafe
10148150,59583.0,3.5,CAFE,31/12/2025,Cafe
10148216,3102.0,3.0,CAFE,31/12/2025,Cafe


In [116]:
cafe = pd.concat([bank_TOTALLY_A_REAL_COFFEE_SHOP['monopoly_money_amount'].describe(),
                bank_PRETENTIOUS_COFFEE_SHOP['monopoly_money_amount'].describe(),
                bank_LOCAL_WATERING_HOLE['monopoly_money_amount'].describe(),
                bank_A_CAFE['monopoly_money_amount'].describe(),
                bank_HIPSTER_COFFEE_SHOP['monopoly_money_amount'].describe(),
                bank_COFFEE_SHOP['monopoly_money_amount'].describe(),
                bank_CAFE['monopoly_money_amount'].describe(),
                bank_GOURMET_COFFEE_SHOP['monopoly_money_amount'].describe(),
                bank_A_LOCAL_COFFEE_SHOP['monopoly_money_amount'].describe(),
                bank_TEA_SHOP['monopoly_money_amount'].describe(),], axis=1)

col_names = ['TOTALLY_A_REAL_COFFEE_SHOP', 
            'PRETENTIOUS_COFFEE_SHOP', 
            'LOCAL_WATERING_HOLE',
            'A_CAFE','HIPSTER_COFFEE_SHOP',
            'COFFEE_SHOP', 'CAFE', 'GOURMET_COFFEE_SHOP', 'A_LOCAL_COFFEE_SHOP', 'TEA_SHOP']

cafe.columns = col_names

pd.options.display.float_format = '{:.3f}'.format

cafe

Unnamed: 0,TOTALLY_A_REAL_COFFEE_SHOP,PRETENTIOUS_COFFEE_SHOP,LOCAL_WATERING_HOLE,A_CAFE,HIPSTER_COFFEE_SHOP,COFFEE_SHOP,CAFE,GOURMET_COFFEE_SHOP,A_LOCAL_COFFEE_SHOP,TEA_SHOP
count,476064.0,232872.0,649043.0,475154.0,232588.0,476099.0,111139.0,232462.0,474159.0,8811.0
mean,2.407,2.417,10.965,2.409,2.416,2.407,4.402,2.417,2.408,27.639
std,0.688,0.701,3.501,0.689,0.701,0.687,1.07,0.7,0.69,13.031
min,1.45,1.45,1.5,1.45,1.45,1.45,3.0,1.45,1.45,5.0
25%,2.15,2.15,10.0,2.15,2.15,2.15,3.5,2.15,2.15,16.36
50%,2.4,2.4,11.0,2.4,2.4,2.4,4.5,2.4,2.4,27.73
75%,2.55,2.55,13.5,2.55,2.55,2.55,5.0,2.55,2.55,39.09
max,5.3,5.3,18.0,5.3,5.3,5.3,6.0,5.3,5.3,50.0


- 'CAFE' has slightly higher mean spend compared to the other coffee shops.
- Tea shop is not an actualy tea cafe, rather a place to buy teabags or other types of tea most likely.
- Watering hole is not a type of coffee shop.

#### Investigation of coffee bean and tea shops

In [117]:
col_names = ['TEA_SHOP', 'TO_BEAN_OR_NOT_TO_BEAN','WE_HAVE_BEAN_WEIGHTING', 'ROASTERIE']

bank_TEA_SHOP = bank[bank['to_randomly_generated_account'] == 'TEA_SHOP']
bank_TO_BEAN_OR_NOT_TO_BEAN = bank[bank['to_randomly_generated_account'] == 'TO_BEAN_OR_NOT_TO_BEAN']
bank_WE_HAVE_BEAN_WEIGHTING = bank[bank['to_randomly_generated_account'] == 'WE_HAVE_BEAN_WEIGHTING']
bank_ROASTERIE = bank[bank['to_randomly_generated_account'] == 'ROASTERIE']

bank_WE_HAVE_BEAN_WEIGHTING.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
15584,39993.0,28.64,WE_HAVE_BEAN_WEIGHTING,01/01/2025,
15648,81402.0,48.64,WE_HAVE_BEAN_WEIGHTING,01/01/2025,
15711,43924.0,23.64,WE_HAVE_BEAN_WEIGHTING,01/01/2025,
15820,29798.0,29.55,WE_HAVE_BEAN_WEIGHTING,01/01/2025,
15837,88363.0,26.36,WE_HAVE_BEAN_WEIGHTING,01/01/2025,


In [118]:
coffee_tea_shop = pd.concat([bank_TEA_SHOP['monopoly_money_amount'].describe(),
                bank_TO_BEAN_OR_NOT_TO_BEAN['monopoly_money_amount'].describe(),
                bank_WE_HAVE_BEAN_WEIGHTING['monopoly_money_amount'].describe(),
                bank_ROASTERIE['monopoly_money_amount'].describe()],axis=1)

coffee_tea_shop.columns = col_names

pd.options.display.float_format = '{:.3f}'.format
coffee_tea_shop

Unnamed: 0,TEA_SHOP,TO_BEAN_OR_NOT_TO_BEAN,WE_HAVE_BEAN_WEIGHTING,ROASTERIE
count,8811.0,8610.0,8596.0,8755.0
mean,27.639,27.459,27.617,27.478
std,13.031,13.202,13.106,13.11
min,5.0,5.0,5.0,5.0
25%,16.36,15.91,16.36,16.36
50%,27.73,27.73,27.73,27.27
75%,39.09,39.09,39.09,38.64
max,50.0,50.0,50.0,50.0


- Coffee and tea shops have the same mean and variance.
- Roasterie actually seems to be a coffee bean shop!

## Spending categories for alcohol/pubs

In [141]:
col_names = ['LOCAL_WATERING_HOLE','BAR','PUB', 
            'G&T_BAR','WINE_BAR','WHISKEY_BAR','LOCAL_PUB','COCKTAIL_BAR']

In [142]:
bank_LOCAL_WATERING_HOLE = bank[bank['to_randomly_generated_account'] == 'LOCAL_WATERING_HOLE']
bank_BAR = bank[bank['to_randomly_generated_account'] == 'BAR']
bank_PUB = bank[bank['to_randomly_generated_account'] == 'PUB']
bank_G_AND_T_BAR = bank[bank['to_randomly_generated_account'] == 'G&T_BAR']
bank_WINE_BAR = bank[bank['to_randomly_generated_account'] == 'WINE_BAR']
bank_WHISKEY_BAR = bank[bank['to_randomly_generated_account'] == 'WHISKEY_BAR']
bank_LOCAL_PUB = bank[bank['to_randomly_generated_account'] == 'LOCAL_PUB']
bank_COCKTAIL_BAR = bank[bank['to_randomly_generated_account'] == 'COCKTAIL_BAR']

bank_LOCAL_PUB.tail()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date,business_name
10147320,60555.0,29.39,LOCAL_PUB,31/12/2025,Pub
10147358,68618.0,50.82,LOCAL_PUB,31/12/2025,Pub
10147483,27082.0,19.18,LOCAL_PUB,31/12/2025,Pub
10147764,8334.0,39.59,LOCAL_PUB,31/12/2025,Pub
10148178,30161.0,18.16,LOCAL_PUB,31/12/2025,Pub


In [143]:
pub = pd.concat([bank_LOCAL_WATERING_HOLE['monopoly_money_amount'].describe(),
                bank_BAR['monopoly_money_amount'].describe(),
                bank_PUB['monopoly_money_amount'].describe(),
                bank_G_AND_T_BAR['monopoly_money_amount'].describe(),
                bank_WINE_BAR['monopoly_money_amount'].describe(),
                bank_WHISKEY_BAR['monopoly_money_amount'].describe(),
                bank_LOCAL_PUB['monopoly_money_amount'].describe(),
                bank_COCKTAIL_BAR['monopoly_money_amount'].describe()], axis=1)

pub.columns = col_names

pd.options.display.float_format = '{:.3f}'.format
pub

Unnamed: 0,LOCAL_WATERING_HOLE,BAR,PUB,G&T_BAR,WINE_BAR,WHISKEY_BAR,LOCAL_PUB,COCKTAIL_BAR
count,649043.0,770414.0,834380.0,123219.0,121995.0,122900.0,60667.0,123076.0
mean,10.965,10.979,12.751,11.107,11.092,11.096,35.048,11.106
std,3.501,3.514,8.137,3.458,3.457,3.442,14.714,3.467
min,1.5,1.5,1.5,1.5,1.5,1.5,10.0,1.5
25%,10.0,10.0,10.0,10.0,10.0,10.0,22.24,10.0
50%,11.0,11.0,11.5,11.5,11.5,11.5,34.49,11.5
75%,13.5,13.5,14.0,13.5,13.5,13.5,47.76,13.5
max,18.0,18.0,60.0,18.0,18.0,18.0,60.0,18.0


- All bars essentially can be grouped together.
- Pubs seem to have slightly different spending statistics.
- 'PUB' seems to have slightly greater (but similar) mean spend compared to the bars but higher variance in spend and 'LOCAL_PUB' seems to have even higher mean spend and variance.

#### Spending categories for alcohol shops/stores

In [122]:
col_names = ['WHISKEY_SHOP','LIQUOR_STORE','WINE_CELLAR']

In [123]:
bank_WHISKEY_SHOP = bank[bank['to_randomly_generated_account'] == 'WHISKEY_SHOP']
bank_LIQUOR_STORE = bank[bank['to_randomly_generated_account'] == 'LIQUOR_STORE']
bank_WINE_CELLAR = bank[bank['to_randomly_generated_account'] == 'WINE_CELLAR']

In [124]:
alcohol_store = pd.concat([bank_WHISKEY_SHOP['monopoly_money_amount'].describe(),
                bank_LIQUOR_STORE['monopoly_money_amount'].describe(),
                bank_WINE_CELLAR['monopoly_money_amount'].describe()], axis=1)

alcohol_store.columns = col_names

pd.options.display.float_format = '{:.3f}'.format
alcohol_store

Unnamed: 0,WHISKEY_SHOP,LIQUOR_STORE,WINE_CELLAR
count,8800.0,8662.0,8670.0
mean,26.66,26.551,26.821
std,21.245,20.927,21.33
min,8.99,8.99,8.99
25%,10.0,10.0,10.0
50%,19.99,19.99,19.99
75%,29.99,29.99,29.99
max,74.99,74.99,74.99


- Can clearly be seen that these 3 merchants have more or less the same spending/sales characteristics and so can be grouped together safely.

## Spending categories for restaurants and takeaways

In [125]:
col_names1 = ['SANDWICH_SHOP','STEAK_HOUSE',
            'RESTAURANT','LUNCH_PLACE','COOKSHOP', 'KEBAB_SHOP',
            'LOCAL_RESTAURANT','LUNCH_VAN' ,'ROASTERIE',
            'INDIAN_RESTAURANT', 'CHINESE_RESTAURANT',
            'RESTAURANT_VOUCHER','SEAFOOD_RESAURANT' ]

In [126]:
bank_sandwich_shop = bank[bank['to_randomly_generated_account'] == 'SANDWICH_SHOP']
bank_steak_house = bank[bank['to_randomly_generated_account'] == 'STEAK_HOUSE']
bank_restaurant = bank[bank['to_randomly_generated_account'] == 'RESTAURANT']
bank_lunch_place = bank[bank['to_randomly_generated_account'] == 'LUNCH_PLACE']
bank_cookshop = bank[bank['to_randomly_generated_account'] == 'COOKSHOP']
bank_kebab_shop = bank[bank['to_randomly_generated_account'] == 'KEBAB_SHOP']
bank_local_restaurant = bank[bank['to_randomly_generated_account'] == 'LOCAL_RESTAURANT']
bank_lunch_van = bank[bank['to_randomly_generated_account'] == 'LUNCH_VAN']
bank_roasterie = bank[bank['to_randomly_generated_account'] == 'ROASTERIE']
bank_indian_restaurant = bank[bank['to_randomly_generated_account'] == 'INDIAN_RESTAURANT']
bank_chinese_restaurant = bank[bank['to_randomly_generated_account'] == 'CHINESE_RESTAURANT']
bank_restaurant_voucher = bank[bank['to_randomly_generated_account'] == 'RESTAURANT_VOUCHER']
bank_seafood_resaurant = bank[bank['to_randomly_generated_account'] == 'SEAFOOD_RESAURANT']


In [127]:
restaurant = pd.concat([bank_sandwich_shop['monopoly_money_amount'].describe(),
                bank_steak_house['monopoly_money_amount'].describe(),
                bank_restaurant['monopoly_money_amount'].describe(),
                bank_lunch_place['monopoly_money_amount'].describe(),
                bank_cookshop['monopoly_money_amount'].describe(),
                bank_kebab_shop['monopoly_money_amount'].describe(),
                bank_local_restaurant['monopoly_money_amount'].describe(),
                bank_lunch_van['monopoly_money_amount'].describe(),
                bank_roasterie['monopoly_money_amount'].describe(),
                bank_indian_restaurant['monopoly_money_amount'].describe(),
                bank_chinese_restaurant['monopoly_money_amount'].describe(),
                bank_restaurant_voucher['monopoly_money_amount'].describe(),
                bank_seafood_resaurant['monopoly_money_amount'].describe()], axis=1)

restaurant.columns = col_names1

# reorder columns in terms of increasing mean row values
restaurant = restaurant.reindex(restaurant.mean().sort_values().index, axis=1)


pd.options.display.float_format = '{:.3f}'.format
restaurant


Unnamed: 0,STEAK_HOUSE,INDIAN_RESTAURANT,CHINESE_RESTAURANT,SEAFOOD_RESAURANT,ROASTERIE,COOKSHOP,RESTAURANT_VOUCHER,RESTAURANT,LUNCH_VAN,SANDWICH_SHOP,LUNCH_PLACE,KEBAB_SHOP,LOCAL_RESTAURANT
count,7353.0,7429.0,7468.0,7580.0,8755.0,14338.0,14627.0,68053.0,110946.0,111143.0,111305.0,111339.0,172030.0
mean,35.158,35.287,35.163,35.489,27.478,17.423,27.372,34.95,4.395,4.4,4.393,4.402,15.233
std,15.499,15.518,15.39,15.429,13.11,7.182,13.109,14.859,1.067,1.067,1.07,1.067,17.087
min,10.0,10.0,10.0,10.0,5.0,5.0,10.0,10.0,3.0,3.0,3.0,3.0,3.0
25%,22.24,22.24,22.24,22.24,16.36,11.0,15.0,22.24,3.5,3.5,3.5,3.5,3.5
50%,34.49,34.49,34.49,35.51,27.27,17.0,25.0,34.49,4.5,4.5,4.5,4.5,5.0
75%,47.76,47.76,47.76,47.76,38.64,24.0,40.0,47.76,5.0,5.0,5.0,5.0,24.29
max,117.96,120.0,120.0,120.0,50.0,30.0,50.0,120.0,6.0,6.0,6.0,6.0,60.0


- sandwich shop, lunch place, lunch van, kebab shop have a lower mean and variance in spend compared to restaurants. These places are also frequented far more and should designated as fast food.
- Steak house should included with restaurants.
- Restaurant voucher seems to have slightly lower average spend.
- Roasterie should be included in the coffee bean shop group!

In [128]:
col_names2 = ['CHINESE_TAKEAWAY','TAKEAWAY_CURRY','TAKEAWAY']

In [129]:
bank_chinese_takeaway = bank[bank['to_randomly_generated_account'] == 'CHINESE_TAKEAWAY']
bank_takeaway_curry = bank[bank['to_randomly_generated_account'] == 'TAKEAWAY_CURRY']
bank_takeaway = bank[bank['to_randomly_generated_account'] == 'TAKEAWAY']

In [130]:
takeaway = pd.concat([bank_chinese_takeaway['monopoly_money_amount'].describe(),
                bank_takeaway_curry['monopoly_money_amount'].describe(),
                bank_takeaway['monopoly_money_amount'].describe()], axis=1)

takeaway.columns = col_names2

pd.options.display.float_format = '{:.3f}'.format
takeaway

Unnamed: 0,CHINESE_TAKEAWAY,TAKEAWAY_CURRY,TAKEAWAY
count,60929.0,61485.0,60757.0
mean,35.127,35.025,35.061
std,14.747,14.673,14.718
min,10.0,10.0,10.0
25%,22.24,22.24,22.24
50%,35.51,34.49,35.51
75%,47.76,47.76,47.76
max,60.0,60.0,60.0


- Takeaways can safely be grouped together

## Spending categories for technology stores

In [131]:
col_names = ['TECH_SHOP','ELECTRONICS_SHOP','HIPSTER_ELECTRONICS_SHOP']

In [132]:
bank_tech_shop = bank[bank['to_randomly_generated_account'] == 'TECH_SHOP']
bank_electronics_shop = bank[bank['to_randomly_generated_account'] == 'ELECTRONICS_SHOP']
bank_hipster_electronics_shop = bank[bank['to_randomly_generated_account'] == 'HIPSTER_ELECTRONICS_SHOP']

In [133]:
tech_shop = pd.concat([bank_tech_shop['monopoly_money_amount'].describe(),
                bank_electronics_shop['monopoly_money_amount'].describe(),
                bank_hipster_electronics_shop['monopoly_money_amount'].describe()], axis=1)

tech_shop.columns = col_names

pd.options.display.float_format = '{:.3f}'.format
tech_shop

Unnamed: 0,TECH_SHOP,ELECTRONICS_SHOP,HIPSTER_ELECTRONICS_SHOP
count,1569.0,1543.0,1604.0
mean,167.041,162.501,164.972
std,79.516,78.769,77.928
min,30.0,30.0,30.0
25%,98.0,94.0,98.0
50%,171.0,159.0,165.0
75%,237.0,231.0,231.0
max,300.0,299.0,300.0


- TECH_SHOP has a slightly higher average spend but lower standard deviation in spend compared
to the other two places but the spending statistics are very similar. It is therefore reasonable to group these 3 merchants together

## Spending categories for book shops

In [134]:
col_names = ['LOCAL_BOOKSHOP','COMIC_BOOK_SHOP',
            'NERDY_BOOK_STORE', 'SECOND_HAND_BOOKSHOP','BOOKSHOP']

In [135]:
bank_local_bookshop = bank[bank['to_randomly_generated_account'] == 'LOCAL_BOOKSHOP']
bank_comic_book_shop = bank[bank['to_randomly_generated_account'] == 'COMIC_BOOK_SHOP']
bank_nerdy_book_store = bank[bank['to_randomly_generated_account'] == 'NERDY_BOOK_STORE']
bank_second_hand_bookshop = bank[bank['to_randomly_generated_account'] == 'SECOND_HAND_BOOKSHOP']
bank_bookshop = bank[bank['to_randomly_generated_account'] == 'BOOKSHOP']


In [136]:
book = pd.concat([bank_local_bookshop['monopoly_money_amount'].describe(),
                bank_comic_book_shop['monopoly_money_amount'].describe(),
                bank_nerdy_book_store['monopoly_money_amount'].describe(),
                bank_second_hand_bookshop['monopoly_money_amount'].describe(),
                bank_bookshop['monopoly_money_amount'].describe()], axis=1)

book.columns = col_names

pd.options.display.float_format = '{:.3f}'.format
book

Unnamed: 0,LOCAL_BOOKSHOP,COMIC_BOOK_SHOP,NERDY_BOOK_STORE,SECOND_HAND_BOOKSHOP,BOOKSHOP
count,26180.0,25211.0,25167.0,26147.0,26186.0
mean,12.426,10.695,10.727,12.414,12.393
std,5.011,5.842,5.819,4.962,5.012
min,5.0,2.0,2.0,5.0,5.0
25%,10.0,5.0,5.0,10.0,10.0
50%,12.0,10.0,10.0,12.0,12.0
75%,15.0,15.0,15.0,15.0,15.0
max,20.0,20.0,20.0,20.0,20.0


- Safe to group all these together

## Spending categories for toy shops/pet stuff

In [137]:
col_names = ['TOY_SHOP','PET_TOY_SHOP','PET_SHOP']

In [138]:
bank_toy_shop = bank[bank['to_randomly_generated_account'] == 'TOY_SHOP']
bank_pet_toy_shop = bank[bank['to_randomly_generated_account'] == 'PET_TOY_SHOP']
bank_pet_shop = bank[bank['to_randomly_generated_account'] == 'PET_SHOP']

In [139]:
toy_pet = pd.concat([bank_toy_shop['monopoly_money_amount'].describe(),
                bank_pet_toy_shop['monopoly_money_amount'].describe(),
                bank_pet_shop['monopoly_money_amount'].describe()], axis=1)

toy_pet.columns = col_names

pd.options.display.float_format = '{:.3f}'.format
toy_pet

Unnamed: 0,TOY_SHOP,PET_TOY_SHOP,PET_SHOP
count,16422.0,9185.0,9287.0
mean,55.433,20.179,19.962
std,25.985,16.571,16.419
min,10.0,2.0,2.0
25%,33.0,10.0,10.0
50%,56.0,15.0,15.0
75%,78.0,22.0,22.0
max,100.0,60.0,60.0


- Pet toy shop shouldn't be grouped into a toy category but instead grouped with the pet stuff.
- Toy shop has the same summary spending characteristics as the children's store so should be grouped with this.