In [6]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

In [2]:
#load data
data = pd.read_csv('fake_transactional_data_24.csv')

In [3]:
#data restructure
food_and_drinks = ['A_CAFE', 'A_LOCAL_COFFEE_SHOP', 'CAFE', 'CHINESE_RESTAURANT', 'CHINESE_TAKEAWAY', 'COFFEE_SHOP', 'GOURMET_COFFEE_SHOP', 'HIPSTER_COFFEE_SHOP', 'INDIAN_RESTAURANT', 'KEBAB_SHOP', 'LOCAL_RESTAURANT', 'LUNCH_PLACE', 'LUNCH_VAN', 'PRETENTIOUS_COFFEE_SHOP', 'RESTAURANT', 'RESTAURANT_VOUCHER', 'ROASTERIE', 'SANDWICH_SHOP', 'SEAFOOD_RESAURANT', 'STEAK_HOUSE', 'TAKEAWAY', 'TAKEAWAY_CURRY', 'TOTALLY_A_REAL_COFFEE_SHOP']
groceries_and_ds = ['A_SUPERMARKET', 'BUTCHER', 'BUTCHERS', 'DEPARTMENT_STORE', 'EXPRESS_SUPERMARKET', 'GREENGROCER', 'LARGE_SUPERMARKET', 'THE_SUPERMARKET', 'TO_BEAN_OR_NOT_TO_BEAN', 'WE_HAVE_BEAN_WEIGHTING']
clothing_and_jewellery = ['ACCESSORY_SHOP', 'CLOTHES_SHOP', 'FASHION_SHOP', 'FASHIONABLE_SPORTSWARE_SHOP', 'JEWLLERY_SHOP', 'KIDS_CLOTHING_SHOP']
outdoor = ['BAR', 'CINEMA', 'COCKTAIL_BAR', 'G&T_BAR', 'LIQUOR_STORE', 'LOCAL_PUB', 'LOCAL_WATERING_HOLE', 'PUB', 'WHISKEY_BAR', 'WHISKEY_SHOP', 'WINE_BAR', 'WINE_CELLAR']
indoor = ['BOOKSHOP', 'COMIC_BOOK_SHOP', 'DVD_SHOP', 'GAME_SHOP', 'LOCAL_BOOKSHOP', 'NERDY_BOOK_STORE', 'SECOND_HAND_BOOKSHOP', 'STREAMING_SERVICE', 'TOY_SHOP', 'VIDEO_GAME_STORE']
daily_implement = ['CHILDRENDS_SHOP', 'COOKSHOP', 'DIY_STORE', 'ELECTRONICS_SHOP', 'GYM', 'HIPSTER_ELECTRONICS_SHOP', 'HOME_IMPROVEMENT_STORE', 'KIDS_ACTIVITY_CENTRE', 'PET_SHOP', 'PET_TOY_SHOP', 'RUNNING_SHOP', 'SCHOOL_SUPPLY_STORE', 'SPORT_SHOP', 'TEA_SHOP', 'TECH_SHOP', 'TRAINER_SHOP']

In [4]:
#replace account to their type
def replace_account_names(name, a, b, c, d, e, f):
    if re.fullmatch(r'\d+', name):
        return 'transaction'
    
    elif name in a:
        return 'Food & Drink'
    
    elif name in b:
        return 'Groceries & Department Store'
    
    elif name in c:
        return 'Clothing & Jewellery'
    
    elif name in d:
        return 'Outdoor Entertainment'
    
    elif name in e:
        return 'Indoor Entertainment'
    
    elif name in f:
        return 'Daily Purchase'

    else:
        return name

#restructure
data['to_randomly_generated_account'] = data['to_randomly_generated_account'].apply(lambda x: replace_account_names(x, food_and_drinks, groceries_and_ds, clothing_and_jewellery, outdoor, indoor, daily_implement))

account_type_cost = data.groupby(['from_totally_fake_account', 'to_randomly_generated_account'])['monopoly_money_amount'].sum().reset_index()

# print(account_type_cost.head(10))

#show purchase proportion
total_spending_per_account = data.groupby('from_totally_fake_account')['monopoly_money_amount'].sum().reset_index()
total_spending_per_account.rename(columns={'monopoly_money_amount': 'total_spending'}, inplace=True)

merged_data = pd.merge(account_type_cost, total_spending_per_account, on='from_totally_fake_account')

merged_data['category_percentage'] = (merged_data['monopoly_money_amount'] / merged_data['total_spending'] * 100).round(2)

print(merged_data.head(10))



   from_totally_fake_account to_randomly_generated_account  \
0                       1000                Daily Purchase   
1                       1000                  Food & Drink   
2                       1000  Groceries & Department Store   
3                       1000          Indoor Entertainment   
4                       1000         Outdoor Entertainment   
5                       1000                   transaction   
6                       1002                Daily Purchase   
7                       1002                  Food & Drink   
8                       1002  Groceries & Department Store   
9                       1002          Indoor Entertainment   

   monopoly_money_amount  total_spending  category_percentage  
0                  54.00         2824.34                 1.91  
1                 389.21         2824.34                13.78  
2                  13.18         2824.34                 0.47  
3                  32.98         2824.34                 1.17

In [14]:
#set the decision labels
merged_data['Primary_F&D'] = merged_data.apply(lambda x: 1 if x['to_randomly_generated_account'] == 'Food & Drink' and x['category_percentage'] > 20 else 0, axis=1)
merged_data['Primary_DP'] = merged_data.apply(lambda x: 1 if x['to_randomly_generated_account'] == 'Daily Purchase' and x['category_percentage'] > 20 else 0, axis=1)
merged_data['Primary_G&DS'] = merged_data.apply(lambda x: 1 if x['to_randomly_generated_account'] == 'Groceries & Department Store' and x['category_percentage'] > 20 else 0, axis=1)
merged_data['Primary_C&J'] = merged_data.apply(lambda x: 1 if x['to_randomly_generated_account'] == 'Clothing & Jewellery' and x['category_percentage'] > 20 else 0, axis=1)
merged_data['Primary_IE'] = merged_data.apply(lambda x: 1 if x['to_randomly_generated_account'] == 'Indoor Entertainment' and x['category_percentage'] > 20 else 0, axis=1)
merged_data['Primary_OE'] = merged_data.apply(lambda x: 1 if x['to_randomly_generated_account'] == 'Outdoor Entertainment' and x['category_percentage'] > 20 else 0, axis=1)
merged_data['Primary_T'] = merged_data.apply(lambda x: 1 if x['to_randomly_generated_account'] == 'transaction' and x['category_percentage'] > 60 else 0, axis=1)

#restructure
high_ratio_features = merged_data.groupby('from_totally_fake_account').sum().reset_index()


  high_ratio_features = merged_data.groupby('from_totally_fake_account').sum().reset_index()


In [15]:
#dataset setup
X = high_ratio_features.drop(['from_totally_fake_account', 'total_spending', 'monopoly_money_amount', 'category_percentage'], axis=1)
Y = high_ratio_features[['Primary_F&D', 'Primary_DP', 'Primary_G&DS', 'Primary_C&J', 'Primary_IE', 'Primary_OE', 'Primary_T']]

#train and test dataset setup, lucky number 17
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=17)


#decision tree
dt_clf = MultiOutputClassifier(DecisionTreeClassifier(random_state=17))

#train model
dt_clf.fit(X_train, Y_train)

predictions = dt_clf.predict(X)
predictions_df = pd.DataFrame(predictions, columns=Y_test.columns)
customer_counts = predictions_df.sum().sort_values(ascending=False)

unique_customers = data['from_totally_fake_account'].nunique()

print("Unique customers:", unique_customers)
print("Customers type and sum:")
print(customer_counts)

no_category_customers = predictions_df[(predictions_df.T == 0).all()].index
print("No category customers:", len(no_category_customers))

Unique customers: 8142
Customers type and sum:
Primary_OE      4060
Primary_G&DS    2783
Primary_F&D     1154
Primary_T       1145
Primary_C&J      200
Primary_DP        61
Primary_IE         4
dtype: int64
No category customers: 1465
