## Sorting the new data into spending sectors

In [2]:
import pandas as pd
import numpy as np

In [67]:
# Location of the simulated transaction export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
transactions_csv = 'C:/Users/camer/OneDrive/Documents/Data Science MSc/DSMP/simulated_transaction_2024.csv'
transaction_df = pd.read_csv(transactions_csv)
transaction_df

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name
0,01/01/2023,00:00,678330503.0,2971.000000,1584.00,,Westport Care Home
1,01/01/2023,00:00,472213568.0,3792.000000,1950.00,,Barbiee Boutique
2,01/01/2023,00:00,472213568.0,3012.000000,-780.00,283027736.0,
3,01/01/2023,00:00,283027736.0,1787.000000,780.00,472213568.0,
4,01/01/2023,00:00,624500124.0,3226.000000,1825.00,,Fat Face
...,...,...,...,...,...,...,...
230591,06/12/2023,20:54,581655972.0,45935.206861,-41.06,,Tesco
230592,06/12/2023,20:55,786141370.0,-244.837500,-62.35,,Sainsbury Local
230593,06/12/2023,21:05,824916823.0,9709.172159,-32.94,,Deliveroo
230594,06/12/2023,21:13,366550080.0,26834.165794,-19.25,,Amazon


Selecting only rows with Third party names

In [68]:
# Keep only the rows that carry a third party name, then discard the columns
# that are not used in the rest of this section.
has_third_party_name = transaction_df['Third Party Name'].notna()
business_transactions = transaction_df.loc[has_third_party_name].drop(
    columns=['Third Party Account No', 'Balance', 'Timestamp', 'Date']
)

# How often each third party name occurs, followed by the distinct names.
name_counts = business_transactions['Third Party Name'].value_counts()
print(f"{name_counts}")
distinct_names = business_transactions['Third Party Name'].unique()
print(f"\n\n Names of third party accounts: \n{distinct_names}")

Third Party Name
Tesco                 34108
Sports Direct         16259
Topshop               10332
Fat Face               9543
PureGym                8293
                      ...  
Millets                   1
Gap Kids                  1
Specsavers                1
RugbyFields               1
Mountain Warehouse        1
Name: count, Length: 83, dtype: int64


 Names of third party accounts: 
['Westport Care Home' 'Barbiee Boutique' 'Fat Face' 'Lavender Primary'
 'A Cut Above' 'Tesco' 'Brilliant Brushes' 'Costa Coffee'
 'Premier Finance' 'North Face' 'Craftastic' 'Fitted Stitch' 'Selfridges'
 'Starbucks' 'The Crown' 'Cass Art' 'Green Park Academy' 'Rose & Crown'
 'Kings Arms' 'Sunny Care Nursery' 'A Yarn Story' 'Pets Corner'
 'Lavender Fields' 'Victoria Park' 'Halifax' 'LBG' 'Blizzard' 'Xbox'
 'Mojang Studios' 'PureGym' 'Disney' 'Netflix' 'Grand Union BJJ' 'Amazon'
 'SquareOnix' 'Deliveroo' 'JustEat' 'Coop Local' 'Sainsbury' 'AMAZON'
 'The Works' "Blackwell's" 'Topshop' 'Matalan' 'Fo

In [69]:
# Rows that still contain at least one missing value in any remaining column.
row_has_missing = business_transactions.isna().any(axis='columns')
null_values = business_transactions.loc[row_has_missing]
null_values

Unnamed: 0,Account No,Amount,Third Party Name
527,,-1143.00,Halifax
949,857754342.0,,LBG
1295,,-18.99,PureGym
1335,,-18.99,PureGym
1563,,-15.99,Netflix
...,...,...,...
227463,460463595.0,,Tesco
227645,587225252.0,,Sainsbury Local
229119,,-225.37,Topshop
229970,497772347.0,,Matalan


The null values left in the data can be seen above. This is only 420 out of 230000 rows, so they can be dropped with minimal effect.

In [70]:
# Remove every row that has a missing value in any column.
business_transactions = business_transactions.dropna(axis=0, how='any')
business_transactions

Unnamed: 0,Account No,Amount,Third Party Name
0,678330503.0,1584.00,Westport Care Home
1,472213568.0,1950.00,Barbiee Boutique
4,624500124.0,1825.00,Fat Face
5,203466392.0,2841.66,Lavender Primary
6,768271776.0,1950.00,A Cut Above
...,...,...,...
230591,581655972.0,-41.06,Tesco
230592,786141370.0,-62.35,Sainsbury Local
230593,824916823.0,-32.94,Deliveroo
230594,366550080.0,-19.25,Amazon


Some of the transactions are positive, meaning people are being paid. This analysis focuses on spending, so these transactions need to be removed.

In [71]:
# Keep rows whose amount is zero or negative; positive amounts are excluded.
is_not_positive = business_transactions['Amount'].le(0)
business_spending = business_transactions.loc[is_not_positive]
business_spending

Unnamed: 0,Account No,Amount,Third Party Name
495,355733816.0,-664.00,Halifax
496,472213568.0,-600.00,LBG
497,624500124.0,-654.00,LBG
498,203466392.0,-672.00,Halifax
499,768271776.0,-632.00,LBG
...,...,...,...
230591,581655972.0,-41.06,Tesco
230592,786141370.0,-62.35,Sainsbury Local
230593,824916823.0,-32.94,Deliveroo
230594,366550080.0,-19.25,Amazon


In [72]:
# Distinct account numbers that appear in the spending rows.
spending_accounts = business_spending['Account No']
spending_accounts.unique()

array([3.55733816e+08, 4.72213568e+08, 6.24500124e+08, 2.03466392e+08,
       7.68271776e+08, 5.64744955e+08, 6.75806859e+08, 4.56221621e+08,
       3.50921975e+08, 3.31450202e+08, 4.55831435e+08, 8.32564361e+08,
       1.72084472e+08, 5.50169791e+08, 1.41521177e+08, 4.42070890e+08,
       7.44968736e+08, 8.65748375e+08, 5.22188082e+08, 4.73820682e+08,
       7.86141370e+08, 8.12839908e+08, 5.03575533e+08, 9.79135922e+08,
       1.34461671e+08, 4.30910686e+08, 6.18514666e+08, 3.61201224e+08,
       3.70757362e+08, 6.37433131e+08, 7.54141345e+08, 5.59995067e+08,
       1.19993184e+08, 4.78986718e+08, 2.98000165e+08, 4.16060965e+08,
       6.73025413e+08, 2.41459011e+08, 4.03902116e+08, 6.04288474e+08,
       6.56781737e+08, 8.71866387e+08, 4.25964226e+08, 2.00080192e+08,
       5.19649533e+08, 1.83888402e+08, 6.80773155e+08, 6.41764023e+08,
       2.15426302e+08, 3.86729293e+08, 2.85428103e+08, 9.25037906e+08,
       8.76890798e+08, 8.97714622e+08, 1.20634201e+08, 8.92354433e+08,
      

In [79]:
# Distinct third party names remaining in the spending rows.
spending_names = business_spending['Third Party Name'].unique()
print(f"\n\n Names of third party accounts: \n{spending_names}")



 Names of third party accounts: 
['Halifax' 'LBG' 'Blizzard' 'Xbox' 'Mojang Studios' 'PureGym' 'Disney'
 'Netflix' 'Grand Union BJJ' 'Amazon' 'SquareOnix' 'Deliveroo' 'JustEat'
 'Coop Local' 'Selfridges' 'Sainsbury' 'AMAZON' 'The Works' "Blackwell's"
 'Fat Face' 'Topshop' 'Matalan' 'Foyles' 'Tesco' 'Wool' 'Hobby Lobby'
 'Revella' 'Sainsbury Local' 'Starbucks' 'Loosely Fitted'
 'Stitch By Stitch' 'Coffee #1' 'Hobbycraft' 'A Yarn Story' 'Craftastic'
 'Kings Arms' 'Costa Coffee' 'The Crown' 'Lloyds Pharmacy' 'Rose & Crown'
 'Fitted Stitch' 'Gamestation' 'CeX' 'Etsy' 'Five Senses Art'
 'Sports Direct' 'Cass Art' 'Brilliant Brushes' 'Boots' 'Reebok'
 'JD Sports' 'Head' 'Frankie & Bennies' 'Gap Kids' 'North Face'
 'Town High' 'Collector Cave' 'Barbiee Boutique' 'RugbyFields'
 'Mamas & Papas' 'Lavender Primary' 'Remedy plus care' 'Specsavers'
 'Kew House' 'HMV' 'Vision Express' 'Millets' 'Pets Corner' 'Mothercare'
 'A Cut Above' 'Happy Days Home' 'Mountain Warehouse' 'Victoria Park'
 'Unive

Listing the categories to split the third party accounts into

In [None]:
# Mapping of spending sector -> list of third party names assigned to it.
# Keys are the sector labels; values are the exact 'Third Party Name' strings
# as they appear in the data (note that 'Amazon' and 'AMAZON' are both listed).
#
# NOTE(review): 'SquareOnix' and 'RugbyFields' appear in the spending data's
# third party names but are not assigned to any sector here, and they are not
# among the names dropped as uncategorisable -- confirm how they should be
# handled.
categories = {
    'Financial Services': ['Halifax', 'LBG'],
    'Gaming': ['Blizzard', 'Xbox', 'Mojang Studios', 'Gamestation'],
    'Streaming Services': ['Netflix', 'Disney'],
    'Misc/General Retail': ['Etsy', 'Amazon', 'AMAZON', 'Selfridges', 'The Works', 'CeX', 'HMV'],
    'Sport/Fitness': ['Sports Direct', 'PureGym', 'Grand Union BJJ', 'Reebok', 'JD Sports', 'Head', 'Mountain Warehouse', 'Millets'],
    'Food Delivery': ['Deliveroo', 'JustEat'],
    'Supermarket': ['Coop Local', 'Tesco', 'Sainsbury', 'Sainsbury Local'],
    'Hospitality': ['Kings Arms', 'The Crown', 'Rose & Crown', 'Frankie & Bennies'],
    'Art/Hobby Supplies': ['Collector Cave', 'Craftastic', 'Cass Art', 'Hobbycraft', 'Wool', 'A Yarn Story', 'Hobby Lobby', 'Five Senses Art', 'Brilliant Brushes'],
    'Pet Supplies': ['Pets Corner'],
    'Bookshop': ["Blackwell's", 'Foyles'],
    # The trailing comma on the next entry was missing, which made the whole
    # dict literal a SyntaxError.
    'Children': ['Lavender Primary', 'Gap Kids', 'Mamas & Papas', 'Mothercare', 'Kew House'],
    'Fashion': ['Fat Face', 'Topshop', 'Matalan', 'Revella', 'Fitted Stitch', 'Loosely Fitted', 'Stitch By Stitch', 'North Face', 'Barbiee Boutique'],
    'Coffee': ['Costa Coffee', 'Starbucks', 'Coffee #1'],
    'Healthcare': ['University College Hospital', 'Specsavers', 'Vision Express', 'Lloyds Pharmacy', 'Boots', 'Remedy plus care', 'Happy Days Home'],
}






Some of the third party names do not fit into any category, or it is not obvious what they represent:
'Town High'
'Victoria Park'
'A Cut Above'
These all have two or fewer transactions, so they can simply be removed.

In [110]:
# Third party names that are excluded from the spending data.
values_to_drop = ['Town High', 'Victoria Park', 'A Cut Above']
is_excluded = business_spending['Third Party Name'].isin(values_to_drop)
business_spending = business_spending.loc[~is_excluded]


In [113]:
# Transaction count per third party name, most frequent first; show the top 60.
businesses = business_spending.loc[:, 'Third Party Name'].value_counts()
businesses.iloc[:60]


Third Party Name
Tesco                33871
Sports Direct        16237
Topshop              10311
Fat Face              9303
PureGym               8278
Matalan               7837
Netflix               7767
Amazon                7563
JustEat               7145
Deliveroo             6961
The Works             6692
Five Senses Art       5948
Revella               5352
Starbucks             4817
Blizzard              4432
SquareOnix            4375
Xbox                  4365
Mojang Studios        4364
Disney                3948
Blackwell's           3844
Loosely Fitted        3711
Grand Union BJJ       3653
Sainsbury Local       3154
Sainsbury             3117
Halifax               3048
Selfridges            3041
LBG                   2958
Fitted Stitch         2921
Gamestation           2821
Coop Local            2605
Coffee #1             2295
Boots                 2045
AMAZON                2024
Wool                  1995
Lloyds Pharmacy       1995
Etsy                  1971
Reebok     