In [7]:
import pandas as pd
import numpy as np

In [8]:
#print max rows
pd.set_option('display.max_rows', 500)

bank = pd.read_csv('fake_transactional_data_24.csv')

## take 0.5% sample of the data
## Only do this so you can easily view the entire 'bank' dataframe otherwise it's too big and slow to load

# bank = bank.sample(frac=0.005, random_state=1)

Can see below the 79 different vendors that are listed as various different business names in the beneficiary column

In [9]:
# assign a column to bank dataframe that stores 1 if the value in 'to_randomly_generated_account' is non-numerical and 0 otherwise
bank['test_string'] = bank['to_randomly_generated_account'].apply(lambda x: 1 if not x.isdigit() else 0)

# print unique values in 'to_randomly_generated_account' column where 'test_string' is 1
print(bank[bank['test_string'] == 1]['to_randomly_generated_account'].unique())

#print number of unique values in 'to_randomly_generated_account' column where 'test_string' is 1
print(bank[bank['test_string'] == 1]['to_randomly_generated_account'].nunique())

['CINEMA' 'HIPSTER_COFFEE_SHOP' 'TOTALLY_A_REAL_COFFEE_SHOP' 'COFFEE_SHOP'
 'CAFE' 'A_CAFE' 'LOCAL_RESTAURANT' 'A_LOCAL_COFFEE_SHOP'
 'GOURMET_COFFEE_SHOP' 'LOCAL_WATERING_HOLE' 'SANDWICH_SHOP' 'TOY_SHOP'
 'PRETENTIOUS_COFFEE_SHOP' 'BAR' 'PUB' 'COMIC_BOOK_SHOP' 'LUNCH_VAN'
 'DEPARTMENT_STORE' 'KEBAB_SHOP' 'WINE_BAR' 'ELECTRONICS_SHOP'
 'RESTAURANT' 'LOCAL_PUB' 'LUNCH_PLACE' 'FASHION_SHOP'
 'FASHIONABLE_SPORTSWARE_SHOP' 'SCHOOL_SUPPLY_STORE' 'LOCAL_BOOKSHOP'
 'TRAINER_SHOP' 'BOOKSHOP' 'KIDS_ACTIVITY_CENTRE' 'VIDEO_GAME_STORE'
 'CLOTHES_SHOP' 'TAKEAWAY_CURRY' 'TECH_SHOP' 'NERDY_BOOK_STORE'
 'WHISKEY_BAR' 'PET_TOY_SHOP' 'DVD_SHOP' 'CHILDRENDS_SHOP' 'GAME_SHOP'
 'INDIAN_RESTAURANT' 'COCKTAIL_BAR' 'RUNNING_SHOP' 'DIY_STORE' 'COOKSHOP'
 'HOME_IMPROVEMENT_STORE' 'PET_SHOP' 'CHINESE_TAKEAWAY' 'BUTCHERS'
 'SECOND_HAND_BOOKSHOP' 'G&T_BAR' 'GREENGROCER' 'JEWLLERY_SHOP'
 'ACCESSORY_SHOP' 'TAKEAWAY' 'KIDS_CLOTHING_SHOP' 'SPORT_SHOP'
 'STEAK_HOUSE' 'HIPSTER_ELECTRONICS_SHOP' 'CHINESE_RESTAURANT'
 'S

In [10]:
#get column names
print(bank.columns)

Index(['from_totally_fake_account', 'monopoly_money_amount',
       'to_randomly_generated_account', 'not_happened_yet_date',
       'test_string'],
      dtype='object')


In [12]:
# assign a column to bank dataframe that stores 1 if the value in 'from' is non-numerical and 0 otherwise
#bank['test_string_sender'] = bank['from_totally_fake_account'].apply(lambda x: 1 if not x.isdigit() else 0)

# only get an error if we try to convert the entire column to a string - means there are only numerical values in the column - i.e. account numbers not any businesses.
# therefore there's no business payments/refunds/salaries being paid to consumers.
bank['test_string_sender'] = bank['from_totally_fake_account'].astype(str).apply(lambda x: 1 if not x.isdigit() else 0)

# print unique values in 'to_randomly_generated_account' column where 'test_string' is 1
print(bank[bank['test_string_sender'] == 1]['from_totally_fake_account'].unique())

#print number of unique values in 'to_randomly_generated_account' column where 'test_string' is 1
print(bank[bank['test_string_sender'] == 1]['from_totally_fake_account'].nunique())

[10371. 88339. 18555. ... 30084. 21021. 22429.]
8142


Have 8142 unique accounts

### Proposed categories:

- `Alcohol shop`
- `Coffee`
- `Restaurant`
- `Entertainment` - cinemas and online streaming
- `Home Goods`
- `Supermarkets`
- `Apparrel/Clothes`
- `Book stores`
- `Gym`  
- `Pub/bars`
- `Takeaway`
- `Sports shops`
- `Children`
- `Pet shops`   
- `Technology stores`
- `Greengrocer`
- `Butcher`
- `Game/DVD store`

In [27]:
# regex search patterns
val_replace_cafe = ['coffee', 'tea', 'cafe']
val_replace_book_store = ['book', 'book store', 'bookshop']  
val_replace_alcohol = ['WINE_CELLAR', 'alcohol', 'liquor','whiskey']
val_replace_pub = ['bar','pub','LOCAL_WATERING_HOLE']
val_replace_restaurant = ['restaurant','SEAFOOD_RESAURANT' , 'steak house', 'KEBAB_SHOP', 'SANDWICH_SHOP', 'roasterie','lunch']
val_replace_entertainment = ['cinema','streaming']
val_replace_home = ['DIY', 'home', 'COOKSHOP']
val_replace_supermarket = ['supermarket']
val_replace_green_grocer = ['GREENGROCER']
val_replace_clothing = ['clothing', 'fashion', 'clothes', 'ACCESSORY_SHOP']
val_replace_gym = ['gym']
val_replace_takeaway = ['takeaway']
val_replace_sports_store = ['sport','sportsware','running','trainer','FASHIONABLE_SPORTSWARE_SHOP']
val_replace_children = ['child','toy', 'kids', 'CHILDRENDS_SHOP']
val_replace_pets = ['pet', 'PET_TOY_SHOP']
val_replace_tech_store = ['tech','electronics']
val_replace_butcher = ['BUTCHER', 'TURKEY_FARM']
val_replace_game = ['game', 'DVD']
val_replace_jeweller = ['JEWLLERY_SHOP']

# replacement group categories
merchant_names = ['Cafe', 'Book Store','Alcohol', 'Pub', 'Restaurant', 'Entertainment', 'Home',
                  'Supermarket', 'Green Grocer', 'Clothing', 'Gym', 'Takeaway', 'Sports Store', 'Children','Pets', 'Tech Store', 'Butcher', 'Game', 'Jewellery']

In [28]:
grouping_conditions = [
    
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_cafe), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_book_store), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_alcohol), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_pub), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_restaurant), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_entertainment), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_home), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_supermarket), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_green_grocer), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_clothing), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_gym), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_takeaway), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_sports_store), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_children), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_pets), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_tech_store), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_butcher), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_game), case=False, regex=True),
    bank['to_randomly_generated_account'].str.contains('|'.join(val_replace_jeweller), case=False, regex=True)

]


In [29]:
# assign new merchant categories
bank['business_name'] = np.select(grouping_conditions, merchant_names, default='')

If you now look inside the `bank` dataframe - you can see another column in the dataframe which categorises the different businesses where money has been spent at.