## Import Libraries

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

## Loading the Dataset

In [2]:
# Loading in and setting the data
# You can't download these files from the github, you have to drop them in the folder
tx_data = pd.read_csv('fake_transactional_data_24.csv')
tx_df = pd.DataFrame(tx_data)

In [3]:
tx_df.head()

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date
0,10371.0,4.0,CINEMA,01/01/2025
1,88339.0,2.4,40544,01/01/2025
2,18555.0,2.4,85149,01/01/2025
3,18555.0,4.1,HIPSTER_COFFEE_SHOP,01/01/2025
4,80792.0,1.95,18555,01/01/2025


In [4]:
print(len(tx_df))

10148280


### Creating Senders and Receivers List

In [5]:
# Creating senders and receivers list
senders_list = sorted(tx_df['from_totally_fake_account'])
receivers_list = sorted(tx_df['to_randomly_generated_account'])

print(tx_df['to_randomly_generated_account'])

0                        CINEMA
1                         40544
2                         85149
3           HIPSTER_COFFEE_SHOP
4                         18555
                   ...         
10148275            COFFEE_SHOP
10148276            COFFEE_SHOP
10148277               WINE_BAR
10148278                  57569
10148279    A_LOCAL_COFFEE_SHOP
Name: to_randomly_generated_account, Length: 10148280, dtype: object


## Business Data Extraction and Organization

In [6]:
# Creating business and users list
receivers_biz_list = []
receivers_user_list = []

for receiver in receivers_list:
    if receiver.isnumeric() == True:
        receivers_user_list.append(receiver)
    else:
        receivers_biz_list.append(receiver)

In [7]:
unique_senders_list = sorted(list(set(senders_list)))
unique_receivers_list = sorted(list(set(receivers_list)))
unique_receivers_biz_list = sorted(list(set(receivers_biz_list)))
unique_receivers_user_list = sorted(list(set(receivers_user_list)))

### Extracting Valuable Business Data

In [8]:
# Creating business table
# Pruning data from original dataframe
biz_tx_df = tx_df[tx_df['to_randomly_generated_account'].isin(unique_receivers_biz_list)]
biz_tx_df = biz_tx_df.sort_values('to_randomly_generated_account')

# Getting list of ordered names
biz_name_list = list(biz_tx_df['to_randomly_generated_account'].unique())

In [9]:
print(len(tx_df))

10148280


In [10]:
# Getting Descriptive Stats
biz_tx_count = biz_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount'].count()
biz_tx_sum = biz_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount'].sum()

biz_tx_mean = biz_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount'].mean()
biz_tx_median = biz_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount'].median()

In [11]:
# Needed to count words
from collections import Counter

# Custom aggregation function for sorted list
def sorted_list_agg(input_list):
    return sorted(input_list)

# Custom aggregation function for sorted unique list
def sorted_unique_list_agg(input_list):
    return sorted(input_list.unique())

def sorted_list_count_agg(input_list):
    # Sort the list
    input_list = sorted(input_list)
    
    # Count the frequencies
    input_word_freqs = Counter(input_list)
    
    # Generate the output list
    # dict.fromkeys iterates over the words in the order they occur, it ensures the words remain in order
    output_list = [input_word_freqs[word] for word in dict.fromkeys(input_list)]
    
    return output_list

In [12]:
# Getting Business Sorted Customer Details
biz_tx_customers_sorted = biz_tx_df.groupby('to_randomly_generated_account')['from_totally_fake_account'].agg(sorted_list_agg)

# Getting Business Unique Sorted Customer Details
biz_tx_customers_sorted_unique = biz_tx_df.groupby('to_randomly_generated_account')['from_totally_fake_account'].agg(sorted_unique_list_agg)

# Getting Business Sorted Customer Details Count
biz_tx_customers_sorted_unique_count = biz_tx_df.groupby('to_randomly_generated_account')['from_totally_fake_account'].agg(sorted_list_count_agg)

### Building Business Dataframes

In [13]:
# Building business accounts table
biz_slim_data = {
    'name': biz_name_list,
    'tx_count': list(biz_tx_count),
    'tx_sum_gbp': list(biz_tx_sum),
    'tx_mean_gbp': list(biz_tx_mean),
    'tx_median_gbp': list(biz_tx_median)
}

# Building business accounts table
biz_large_data = {
    'name': biz_name_list,
    'tx_count': list(biz_tx_count),
    'tx_sum_gbp': list(biz_tx_sum),
    'tx_mean_gbp': list(biz_tx_mean),
    'tx_median_gbp': list(biz_tx_median),
    'customers_sorted_unique': list(biz_tx_customers_sorted_unique),
    'customers_tx_count': list(biz_tx_customers_sorted_unique_count)
}

# Create small dataframe
biz_slim_df = pd.DataFrame(biz_slim_data)

# Create large dataframe
biz_large_df = pd.DataFrame(biz_large_data)

### Categorizing Industries by Data

In [14]:
### Importing Mapping
from industryMap import industry_categories

# Using the industry_categories dictionary, map each business in the DataFrame to its industry category
biz_large_df['industry_category'] = biz_large_df['name'].map(industry_categories)

In [15]:
biz_large_df.head()

Unnamed: 0,name,tx_count,tx_sum_gbp,tx_mean_gbp,tx_median_gbp,customers_sorted_unique,customers_tx_count,industry_category
0,ACCESSORY_SHOP,3325,74862.0,22.514887,22.0,"[1357.0, 1607.0, 1623.0, 1828.0, 1931.0, 2090....","[6, 10, 10, 5, 14, 6, 6, 7, 7, 5, 11, 9, 9, 9,...",Accessories
1,A_CAFE,475154,1144734.25,2.409186,2.4,"[1000.0, 1002.0, 1006.0, 1053.0, 1059.0, 1071....","[116, 53, 118, 55, 45, 55, 65, 57, 143, 50, 58...",Cafes
2,A_LOCAL_COFFEE_SHOP,474159,1141932.3,2.408332,2.4,"[1000.0, 1002.0, 1006.0, 1018.0, 1053.0, 1059....","[117, 46, 137, 4, 55, 40, 53, 55, 46, 130, 54,...",Cafes
3,A_SUPERMARKET,81335,5652778.58,69.499952,65.22,"[1000.0, 1002.0, 1006.0, 1018.0, 1053.0, 1059....","[2, 5, 16, 15, 3, 5, 4, 18, 16, 10, 11, 7, 4, ...",Supermarkets
4,BAR,770414,8458081.5,10.978619,11.0,"[1000.0, 1002.0, 1006.0, 1018.0, 1053.0, 1059....","[72, 110, 104, 110, 98, 82, 117, 71, 139, 65, ...",Pubs / Bars


In [16]:
# Saves the dataframes to csv files, in the specific folder
biz_slim_df.to_csv('SavedData/biz_slim.csv', index=False)
biz_large_df.to_csv('SavedData/biz_large.csv', index=False)

## User Data Extraction and Organization

In [17]:
# User List
# user_id X

# total_tx_count X
# gross_tx_sum_gbp X

# sent_tx_count X
# gross_tx_sent_sum_gbp X
# gross_tx_sent_mean X
# gross_tx_sent_med X

# receive_tx_count X
# gross_tx_receive_sum_gbp X
# gross_tx_receive_mean X
# gross_tx_receive_med X

# net_tx_count X
# net_tx_sum_gbp X

### Create User ID List

In [18]:
# Combine the unique senders, and unique customer receivers
customer_list_combination = unique_senders_list + unique_receivers_user_list

# Convert all items to strings of integers
# Turned into a set as this removed duplicates
# Values are sorted alphabetically
customer_unique_list = sorted(set([str(int(cust)) for cust in customer_list_combination]))

### Create Send Tx Count List

In [19]:
# Dictionary featuring all the unique customer id's
user_send_tx_count_dict = dict.fromkeys(customer_unique_list, 0)

# User send tx's, first converted to int to get rid of decimals, then converted to string
user_send_tx_count = tx_df['from_totally_fake_account'].astype(int).astype(str).value_counts()

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in user_send_tx_count:
        user_send_tx_count_dict[user] = user_send_tx_count[user]

# Converting this to a list
user_send_tx_count_list = list(user_send_tx_count_dict.values())

In [20]:
# User id checker, for send tx count
user_id_tx_checker = '1000' # Have to set to String to see count, have to set integer to see value
spec_user_send_tx_count = user_send_tx_count.get(user_id_tx_checker, 0)
#print(spec_user_send_tx_count)

# Ensuring value is or isn't in list
#print(user_id_tx_checker in unique_senders_list)

### Create Receive Tx Count List

In [21]:
# Dictionary featuring all the unique customer id's
user_receive_tx_count_dict = dict.fromkeys(customer_unique_list, 0)

# Creating tx_df that features no company names
user_tx_df = tx_df[~tx_df['to_randomly_generated_account'].isin(biz_name_list)]

# User send tx's, first converted to int to get rid of decimals, then converted to string
user_receive_tx_count = user_tx_df['to_randomly_generated_account'].astype(int).astype(str).value_counts()

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in user_receive_tx_count:
        user_receive_tx_count_dict[user] = user_receive_tx_count[user]

# Converting this to a list
user_received_tx_count_list = list(user_receive_tx_count_dict.values())

In [22]:
# User id checker, for send tx count
user_id_tx_checker = 100384 # Have to set to String to see count, have to set integer to see value
spec_receive_send_tx_count = user_receive_tx_count.get(user_id_tx_checker, 0)
#print(spec_receive_send_tx_count)

# Ensuring value is or isn't in list
#print(user_id_tx_checker in unique_receivers_list)

### Create Total Tx Count List

In [23]:
user_total_tx_count_dict = {}

for user in user_send_tx_count_dict:
    user_total_tx_count_dict[user] =  user_send_tx_count_dict[user] + user_receive_tx_count_dict[user]

user_total_tx_count_list = list(user_total_tx_count_dict.values())

### Create Net Tx Count List

In [24]:
user_net_tx_count_dict = {}

for user in user_send_tx_count_dict:
    user_net_tx_count_dict[user] =  user_send_tx_count_dict[user] - user_receive_tx_count_dict[user]

user_net_tx_count_list = list(user_net_tx_count_dict.values())

### Gross Tx Sent Sum GBP

In [25]:
# Groups the user with all the entries in which they have sent money
# The ".sum()" adds up all these entries
grouped_user_id_sent = tx_df.groupby('from_totally_fake_account')['monopoly_money_amount'].sum()

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_sent.index = grouped_user_id_sent.index.astype(int).astype(str)

# Dictionary featuring all the unique customer id's
user_sent_tx_sum_dict = dict.fromkeys(customer_unique_list, 0)

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in grouped_user_id_sent:
        user_sent_tx_sum_dict[user] = grouped_user_id_sent[user]

# Converting this to a list
user_sent_tx_sum_list = list(user_sent_tx_sum_dict.values())

### Gross Tx Receive Sum GBP

In [26]:
# Gets the tx_df, filters the dataframe for only rows with business accounts, and putting '~' in front does the reverse
user_tx_df = tx_df[~tx_df['to_randomly_generated_account'].isin(biz_name_list)]

# Same piece of code used in previous cell, but now finding received money
grouped_user_id_received = user_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount'].sum()

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_received.index = grouped_user_id_received.index.astype(int).astype(str)

# Dictionary featuring all the unique customer id's
user_received_tx_sum_dict = dict.fromkeys(customer_unique_list, 0)

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in grouped_user_id_received:
        user_received_tx_sum_dict[user] = grouped_user_id_received[user]

# Converting this to a list
user_received_tx_sum_list = list(user_received_tx_sum_dict.values())

### Gross Tx Sum GBP

In [27]:
user_total_tx_sum_dict = {}

for user in user_sent_tx_sum_dict:
    user_total_tx_sum_dict[user] =  user_sent_tx_sum_dict[user] + user_received_tx_sum_dict[user]

user_total_tx_sum_list = list(user_total_tx_sum_dict.values())
#print(user_total_tx_sum_list)

### Net Tx Sum GBP

In [28]:
user_net_tx_sum_dict = {}

for user in user_sent_tx_sum_dict:
    user_net_tx_sum_dict[user] =  user_sent_tx_sum_dict[user] - user_received_tx_sum_dict[user]

user_net_tx_sum_list = list(user_net_tx_sum_dict.values())
#print(user_net_tx_sum_list)

### Gross Tx Sent Mean GBP

In [33]:
# Groups the user with all the entries in which they have sent money
# The ".sum()" adds up all these entries
grouped_user_id_sent_mean = tx_df.groupby('from_totally_fake_account')['monopoly_money_amount'].mean()

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_sent_mean.index = grouped_user_id_sent_mean.index.astype(int).astype(str)

# Dictionary featuring all the unique customer id's
user_sent_tx_mean_dict = dict.fromkeys(customer_unique_list, 0)

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in grouped_user_id_sent_mean:
        user_sent_tx_mean_dict[user] = grouped_user_id_sent_mean[user]

# Converting this to a list
user_sent_tx_mean_list = list(user_sent_tx_mean_dict.values())

### Gross Tx Sent Median GBP

In [37]:
# Groups the user with all the entries in which they have sent money
# The ".sum()" adds up all these entries
grouped_user_id_sent_median = tx_df.groupby('from_totally_fake_account')['monopoly_money_amount'].median()

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_sent_median.index = grouped_user_id_sent_median.index.astype(int).astype(str)

# Dictionary featuring all the unique customer id's
user_sent_tx_median_dict = dict.fromkeys(customer_unique_list, 0)

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in grouped_user_id_sent_median:
        user_sent_tx_median_dict[user] = grouped_user_id_sent_median[user]

# Converting this to a list
user_sent_tx_median_list = list(user_sent_tx_median_dict.values())

### Gross Tx Received Mean GBP

In [38]:
# Groups the user with all the entries in which they have sent money
# The ".sum()" adds up all these entries
grouped_user_id_received_mean = user_tx_df.groupby('to_randomly_generated_account')['monopoly_money_amount'].mean()

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_received_mean.index = grouped_user_id_received_mean.index.astype(int).astype(str)

# Dictionary featuring all the unique customer id's
user_received_tx_mean_dict = dict.fromkeys(customer_unique_list, 0)

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in grouped_user_id_received_mean:
        user_received_tx_mean_dict[user] = grouped_user_id_received_mean[user]

# Converting this to a list
user_received_tx_mean_list = list(user_received_tx_mean_dict.values())

### Gross Tx Received Mean GBP

In [39]:
# Groups the user with all the entries in which they have sent money
# The ".sum()" adds up all these entries
grouped_user_id_received_median = user_tx_df.groupby('from_totally_fake_account')['monopoly_money_amount'].median()

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_received_median.index = grouped_user_id_received_median.index.astype(int).astype(str)

# Dictionary featuring all the unique customer id's
user_received_tx_median_dict = dict.fromkeys(customer_unique_list, 0)

# Setting these values to the values in the dictionary
for user in customer_unique_list:
    if user in grouped_user_id_received_median:
        user_received_tx_median_dict[user] = grouped_user_id_received_median[user]

# Converting this to a list
user_received_tx_median_list = list(user_received_tx_median_dict.values())

### User Business Interaction Lists

In [40]:
# Businesses users interacted with
# Dictionary featuring all the unique customer id's
user_biz_interaction_dict = {user: [] for user in customer_unique_list}

grouped_user_id_biz_interaction = biz_tx_df.groupby('from_totally_fake_account')['to_randomly_generated_account'].agg(lambda x: sorted(list(set(x))))

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_biz_interaction.index = grouped_user_id_biz_interaction.index.astype(int).astype(str)

grouped_user_id_biz_interaction_dict = grouped_user_id_biz_interaction.to_dict()

# Setting these values to the values in the dictionary
for user in grouped_user_id_biz_interaction_dict:
    if user in user_biz_interaction_dict:
        user_biz_interaction_dict[user] = grouped_user_id_biz_interaction_dict[user]
        
# Converting this to a list
user_biz_interaction_list = list(user_biz_interaction_dict.values())

### User Business Interaction Frequency Lists

In [41]:
# Businesses users interacted with frequency
# Dictionary featuring all the unique customer id's
user_biz_freq_interaction_dict = {user: [] for user in customer_unique_list}

grouped_user_id_biz_freq_interaction = biz_tx_df.groupby('from_totally_fake_account')['to_randomly_generated_account'].agg(lambda x: sorted(list(x)))

# Convert the user id to int to get rid of zero, then to string
grouped_user_id_biz_freq_interaction.index = grouped_user_id_biz_freq_interaction.index.astype(int).astype(str)

grouped_user_id_biz_freq_interaction = grouped_user_id_biz_freq_interaction.apply(sorted_list_count_agg)

grouped_user_id_biz_freq_interaction_dict = grouped_user_id_biz_freq_interaction.to_dict()

# Setting these values to the values in the dictionary
for user in grouped_user_id_biz_freq_interaction_dict:
    if user in user_biz_freq_interaction_dict:
        user_biz_freq_interaction_dict[user] = grouped_user_id_biz_freq_interaction_dict[user]
        
# Converting this to a list
user_biz_freq_interaction_list = list(user_biz_freq_interaction_dict.values())

### User Business Spend Lists

In [42]:
# Dictionary featuring all the unique customer id's
user_biz_spend_dict = {user: [] for user in customer_unique_list}

# Businesses users interacted with spend
grouped_user_id_biz_spend = biz_tx_df.groupby(['from_totally_fake_account', 'to_randomly_generated_account'])['monopoly_money_amount'].sum()

# Changes index from user_id to a set of numbers, user_id (from_totally_fake_account) becomes a column
grouped_user_id_biz_spend = grouped_user_id_biz_spend.reset_index()

# Sorting 'from_totally_fake_account' and 'to_randomly_generated_account' alphabetically
# This may not change anything, but ensures order is kept
sorted_grouped_user_id_biz_spend = grouped_user_id_biz_spend.sort_values(by=['from_totally_fake_account', 'to_randomly_generated_account'])

# Convert the user id to int to get rid of zero, then to string
sorted_grouped_user_id_biz_spend['from_totally_fake_account'] = sorted_grouped_user_id_biz_spend['from_totally_fake_account'].astype(int).astype(str)

# Drop the 'to_randomly_generated_account' column
sorted_grouped_user_id_biz_spend = sorted_grouped_user_id_biz_spend.drop(columns=['to_randomly_generated_account'])

sorted_grouped_lists_user_id_biz_spend = sorted_grouped_user_id_biz_spend.groupby('from_totally_fake_account')['monopoly_money_amount'].agg(lambda x: list(x))

sorted_grouped_lists_user_id_biz_spend_dict = sorted_grouped_lists_user_id_biz_spend.to_dict()

# Setting these values to the values in the dictionary
for user in sorted_grouped_lists_user_id_biz_spend_dict:
    if user in user_biz_spend_dict:
        user_biz_spend_dict[user] = sorted_grouped_lists_user_id_biz_spend_dict[user]
        
# Converting this to a list
user_biz_spend_list = list(user_biz_spend_dict.values())

### User User Interaction Lists

In [43]:
# Users users interacted with frequency
# Dictionary featuring all the unique customer id's
user_user_interaction_dict = {user: [] for user in customer_unique_list}

grouped_user_user_interaction = user_tx_df.groupby('from_totally_fake_account')['to_randomly_generated_account'].agg(lambda x: sorted([str(int(item)) for item in set(x)]))

# Convert the user id to int to get rid of zero, then to string
grouped_user_user_interaction.index = grouped_user_user_interaction.index.astype(int).astype(str)

grouped_user_user_interaction_dict = grouped_user_user_interaction.to_dict()

# Setting these values to the values in the dictionary
for user in grouped_user_user_interaction_dict:
    if user in user_user_interaction_dict:
        user_user_interaction_dict[user] = grouped_user_user_interaction_dict[user]
        
# Converting this to a list
user_user_interaction_list = list(user_user_interaction_dict.values())

### User User Interaction Frequency Lists

In [44]:
# Businesses users interacted with frequency
# Dictionary featuring all the unique customer id's
user_user_freq_interaction_dict = {user: [] for user in customer_unique_list}

grouped_user_user_freq_interaction = user_tx_df.groupby('from_totally_fake_account')['to_randomly_generated_account'].agg(lambda x: sorted(list(x)))

# Convert the user id to int to get rid of zero, then to string
grouped_user_user_freq_interaction.index = grouped_user_user_freq_interaction.index.astype(int).astype(str)

grouped_user_user_freq_interaction = grouped_user_user_freq_interaction.apply(sorted_list_count_agg)

grouped_user_user_freq_interaction_dict = grouped_user_user_freq_interaction.to_dict()

# Setting these values to the values in the dictionary
for user in grouped_user_user_freq_interaction_dict:
    if user in user_user_freq_interaction_dict:
        user_user_freq_interaction_dict[user] = grouped_user_user_freq_interaction_dict[user]
        
# Converting this to a list
user_user_freq_interaction_list = list(user_user_freq_interaction_dict.values())

### User User Spend Lists

In [45]:
# Dictionary featuring all the unique customer id's
user_user_spend_dict = {user: [] for user in customer_unique_list}

# Businesses users interacted with spend
grouped_user_id_user_spend = user_tx_df.groupby(['from_totally_fake_account', 'to_randomly_generated_account'])['monopoly_money_amount'].sum()

# Changes index from user_id to a set of numbers, user_id (from_totally_fake_account) becomes a column
grouped_user_id_user_spend = grouped_user_id_user_spend.reset_index()

# Sorting 'from_totally_fake_account' and 'to_randomly_generated_account' alphabetically
# This may not change anything, but ensures order is kept
sorted_grouped_user_id_user_spend = grouped_user_id_user_spend.sort_values(by=['from_totally_fake_account', 'to_randomly_generated_account'])

# Convert the user id to int to get rid of zero, then to string
sorted_grouped_user_id_user_spend['from_totally_fake_account'] = sorted_grouped_user_id_user_spend['from_totally_fake_account'].astype(int).astype(str)

# Drop the 'to_randomly_generated_account' column
sorted_grouped_user_id_user_spend = sorted_grouped_user_id_user_spend.drop(columns=['to_randomly_generated_account'])

sorted_grouped_lists_user_id_user_spend = sorted_grouped_user_id_user_spend.groupby('from_totally_fake_account')['monopoly_money_amount'].agg(lambda x: list(x))

sorted_grouped_lists_user_id_user_spend_dict = sorted_grouped_lists_user_id_user_spend.to_dict()

# Setting these values to the values in the dictionary
for user in sorted_grouped_lists_user_id_user_spend_dict:
    if user in user_user_spend_dict:
        user_user_spend_dict[user] = sorted_grouped_lists_user_id_user_spend_dict[user]
        
# Converting this to a list
user_user_spend_list = list(user_user_spend_dict.values())

### User Industry Interaction Lists

In [46]:
# Add industry column to tx_df
def tx_df_industry_mapping(key):
    if key in industry_categories:
        return industry_categories[key]
    else:  # All values not in industry_categories are set to user
        return 'User'
    
# Apply the mapping
tx_df['industry_category'] = tx_df['to_randomly_generated_account'].apply(tx_df_industry_mapping)

In [47]:
# Industries users interacted with
# Dictionary featuring all the unique customer id's
user_industry_interaction_dict = {user: [] for user in customer_unique_list}

grouped_user_industry_interaction = tx_df.groupby('from_totally_fake_account')['industry_category'].agg(lambda x: sorted(list(set(x))))

# Convert the user id to int to get rid of zero, then to string
grouped_user_industry_interaction.index = grouped_user_industry_interaction.index.astype(int).astype(str)

grouped_user_industry_interaction_dict = grouped_user_industry_interaction.to_dict()

# Setting these values to the values in the dictionary
for user in grouped_user_industry_interaction_dict:
    if user in user_industry_interaction_dict:
        user_industry_interaction_dict[user] = grouped_user_industry_interaction_dict[user]
        
# Converting this to a list
user_industry_interaction_list = list(user_industry_interaction_dict.values())

### User Industry Interaction Frequency Lists

In [48]:
# Indsutries users interacted with frequency
# Dictionary featuring all the unique customer id's
user_industry_freq_interaction_dict = {user: [] for user in customer_unique_list}

grouped_user_industry_freq_interaction = tx_df.groupby('from_totally_fake_account')['industry_category'].agg(lambda x: sorted(list(x)))

# Convert the user id to int to get rid of zero, then to string
grouped_user_industry_freq_interaction.index = grouped_user_industry_freq_interaction.index.astype(int).astype(str)

grouped_user_industry_freq_interaction = grouped_user_industry_freq_interaction.apply(sorted_list_count_agg)

grouped_user_industry_freq_interaction_dict = grouped_user_industry_freq_interaction.to_dict()

# Setting these values to the values in the dictionary
for user in grouped_user_industry_freq_interaction_dict:
    if user in user_industry_freq_interaction_dict:
        user_industry_freq_interaction_dict[user] = grouped_user_industry_freq_interaction_dict[user]
        
# Converting this to a list
user_industry_freq_interaction_list = list(user_industry_freq_interaction_dict.values())

### User Industry Spend Lists

In [49]:
# Dictionary featuring all the unique customer id's
user_industry_spend_dict = {user: [] for user in customer_unique_list}

# Businesses users interacted with spend
grouped_user_id_industry_spend = tx_df.groupby(['from_totally_fake_account', 'industry_category'])['monopoly_money_amount'].sum()

# Changes index from user_id to a set of numbers, user_id (from_totally_fake_account) becomes a column
grouped_user_id_industry_spend = grouped_user_id_industry_spend.reset_index()

# Sorting 'from_totally_fake_account' and 'to_randomly_generated_account' alphabetically
# This may not change anything, but ensures order is kept
sorted_grouped_user_id_industry_spend = grouped_user_id_industry_spend.sort_values(by=['from_totally_fake_account', 'industry_category'])

# Convert the user id to int to get rid of zero, then to string
sorted_grouped_user_id_industry_spend['from_totally_fake_account'] = sorted_grouped_user_id_industry_spend['from_totally_fake_account'].astype(int).astype(str)

# Drop the 'to_randomly_generated_account' column
sorted_grouped_user_id_industry_spend = sorted_grouped_user_id_industry_spend.drop(columns=['industry_category'])

sorted_grouped_lists_user_id_industry_spend = sorted_grouped_user_id_industry_spend.groupby('from_totally_fake_account')['monopoly_money_amount'].agg(lambda x: list(x))

sorted_grouped_lists_user_id_industry_spend_dict = sorted_grouped_lists_user_id_industry_spend.to_dict()

# Setting these values to the values in the dictionary
for user in sorted_grouped_lists_user_id_industry_spend_dict:
    if user in user_industry_spend_dict:
        user_industry_spend_dict[user] = sorted_grouped_lists_user_id_industry_spend_dict[user]
        
# Converting this to a list
user_industry_spend_list = list(user_industry_spend_dict.values())

## Building the User Dataframe

In [50]:
user_slim_data = {
    'user_id': customer_unique_list,
    'sent_tx_count': user_send_tx_count_list,
    'sent_tx_sum_gbp': user_sent_tx_sum_list,
    'sent_tx_median_gbp': user_sent_tx_median_list,
    'received_tx_count': user_received_tx_count_list,
    'received_tx_sum_gbp': user_received_tx_sum_list,
    'received_tx_median_gbp': user_received_tx_median_list
    
}

user_large_data = {
    'user_id': customer_unique_list,
    'total_tx_count': user_total_tx_count_list,
    'net_tx_count': user_net_tx_count_list,
    'total_tx_sum_gbp': user_total_tx_sum_list,
    'net_tx_sum_gbp': user_net_tx_sum_list,
    'sent_tx_count': user_send_tx_count_list,
    'sent_tx_sum_gbp': user_sent_tx_sum_list,
    'sent_tx_mean_gbp': user_sent_tx_mean_list,
    'sent_tx_median_gbp': user_sent_tx_median_list,
    'received_tx_count': user_received_tx_count_list,
    'received_tx_sum_gbp': user_received_tx_sum_list,
    'received_tx_mean_gbp': user_received_tx_mean_list,
    'received_tx_median_gbp': user_received_tx_median_list,
    'user_biz_interaction': user_biz_interaction_list,
    'user_biz_interaction_frequency': user_biz_freq_interaction_list,
    'user_biz_spend': user_biz_spend_list,
    'user_user_interaction': user_user_interaction_list,
    'user_user_interaction_frequency': user_user_freq_interaction_list,
    'user_user_spend': user_user_spend_list,
    'user_industry_interaction': user_industry_interaction_list,
    'user_industry_interaction_frequency': user_industry_freq_interaction_list,
    'user_industry_spend': user_industry_spend_list
    
    
}

# Create small dataframe
user_slim_df = pd.DataFrame(user_slim_data)

# Create large dataframe
user_large_df = pd.DataFrame(user_large_data)

In [51]:
user_large_df.head(50)

Unnamed: 0,user_id,total_tx_count,net_tx_count,total_tx_sum_gbp,net_tx_sum_gbp,sent_tx_count,sent_tx_sum_gbp,sent_tx_mean_gbp,sent_tx_median_gbp,received_tx_count,...,received_tx_median_gbp,user_biz_interaction,user_biz_interaction_frequency,user_biz_spend,user_user_interaction,user_user_interaction_frequency,user_user_spend,user_industry_interaction,user_industry_interaction_frequency,user_industry_spend
0,1000,1903,1517,37720.11,19676.91,1710,28698.51,16.782754,4.5,193,...,5.5,"[A_CAFE, A_LOCAL_COFFEE_SHOP, A_SUPERMARKET, B...","[116, 117, 2, 72, 5, 9, 19, 1, 35, 119, 1, 2, ...","[278.15, 269.25, 217.86, 850.5, 19.5, 347.15, ...","[16354, 18013, 30208, 31489, 36647, 38289, 411...","[55, 14, 6, 12, 1, 14, 5, 14, 1, 18, 21, 1, 27...","[315.4, 86.5, 35.0, 16130.0, 5.0, 94.0, 24.0, ...","[Alcohol, Apparel, Books, Cafes, Children, Cof...","[2, 1, 1, 813, 2, 3, 37, 1, 7, 2, 30, 21, 353,...","[104.97999999999999, 145.0, 8.0, 1955.2, 160.0..."
1,10000,1114,720,21360.39,18944.39,917,20152.39,21.976434,11.0,197,...,6.0,"[A_CAFE, A_LOCAL_COFFEE_SHOP, A_SUPERMARKET, B...","[6, 1, 16, 71, 59, 3, 9, 7, 33, 2, 22, 7, 9, 2...","[13.65, 2.25, 349.77, 791.0, 1275.25, 12.5, 27...","[15896, 28620, 33124, 33732, 34123, 5277, 5439...","[39, 49, 1, 19, 10, 9, 59, 17, 29, 22, 19, 13]","[253.65, 5228.46, 50.0, 106.85, 55.2, 49.9, 32...","[Alcohol, Apparel, Books, Cafes, Coffee Beans ...","[1, 23, 34, 23, 1, 51, 2, 1, 14, 312, 119, 3, ...","[19.99, 3002.0, 370.0, 56.25, 33.64, 1685.48, ..."
2,100000,1919,1433,23455.27,20720.37,1676,22087.82,13.17889,4.5,243,...,6.0,"[ACCESSORY_SHOP, A_CAFE, A_LOCAL_COFFEE_SHOP, ...","[5, 106, 129, 16, 95, 5, 11, 1, 122, 20, 1, 5,...","[113.0, 256.2, 318.4, 1596.43, 1142.5, 23.0, 4...","[12753, 24398, 24951, 28719, 34678, 39736, 489...","[19, 14, 24, 54, 17, 21, 6, 19, 9, 30, 27, 30,...","[135.5, 83.3, 146.5, 4857.0, 109.0, 124.0, 35....","[Accessories, Alcohol, Apparel, Cafes, Coffee ...","[15, 6, 1, 828, 10, 58, 5, 2, 18, 1, 334, 65, ...","[369.0, 136.96999999999997, 37.0, 2015.95, 300..."
3,100002,1561,1233,22393.96,20303.16,1397,21348.56,15.281718,4.5,164,...,6.0,"[A_CAFE, A_LOCAL_COFFEE_SHOP, A_SUPERMARKET, B...","[50, 59, 18, 155, 1, 48, 1, 4, 26, 9, 14, 58, ...","[113.35, 142.65, 1896.42, 965.5, 10.0, 212.5, ...","[10432, 14001, 3296, 37438, 37918, 46953, 5047...","[8, 1, 5, 1, 6, 10, 11, 7, 27, 2, 2, 1, 17, 4,...","[76.08, 2.45, 35.0, 1.45, 46.0, 61.5, 52.55, 4...","[Alcohol, Apparel, Books, Cafes, Children, Cof...","[7, 32, 2, 281, 2, 3, 33, 4, 2, 10, 269, 511, ...","[136.95, 3464.0, 15.0, 767.9, 99.0, 96.81, 117..."
4,100019,1157,719,44632.75,16232.85,938,30432.8,32.44435,9.0,219,...,6.0,"[A_CAFE, A_LOCAL_COFFEE_SHOP, A_SUPERMARKET, B...","[50, 54, 15, 91, 1, 1, 2, 11, 1, 9, 7, 68, 9, ...","[118.95, 129.0, 1163.37, 1031.0, 60.0, 3.0, 12...","[22634, 38580, 40565, 44911, 61350, 62299, 626...","[21, 33, 18, 4, 24, 11, 28, 25, 27, 58]","[125.5, 221.0, 113.0, 24.5, 163.0, 15060.0, 16...","[Alcohol, Apparel, Books, Cafes, Children, Cof...","[1, 25, 1, 214, 4, 1, 38, 1, 12, 13, 314, 1, 6...","[12.0, 2701.0, 15.0, 514.55, 250.0, 26.36, 136..."
5,100021,140,-140,874.85,-874.85,0,0.0,0.0,0.0,140,...,0.0,[],[],[],[],[],[],[],[],[]
6,100023,166,-166,10566.5,-10566.5,0,0.0,0.0,0.0,166,...,0.0,[],[],[],[],[],[],[],[],[]
7,100027,109,-109,740.27,-740.27,0,0.0,0.0,0.0,109,...,0.0,[],[],[],[],[],[],[],[],[]
8,10004,1862,1636,18868.67,17654.27,1749,18261.47,10.441092,4.5,113,...,5.5,"[A_CAFE, A_LOCAL_COFFEE_SHOP, A_SUPERMARKET, B...","[118, 114, 5, 76, 15, 1, 6, 8, 19, 1, 32, 135,...","[284.4, 270.2, 439.27000000000004, 917.5, 202....","[1079, 14243, 1757, 26534, 33117, 35257, 36109...","[3, 10, 12, 13, 2, 61, 8, 7, 8, 32, 7, 3, 36, ...","[18.0, 51.0, 60.0, 80.5, 11.0, 376.92, 52.5, 3...","[Alcohol, Apparel, Books, Cafes, Coffee Beans ...","[2, 3, 46, 869, 7, 35, 9, 2, 3, 26, 338, 2, 1,...","[31.99, 449.0, 593.0, 2116.5, 156.82, 1249.99,..."
9,100053,1652,1134,40478.88,19796.54,1393,30137.71,21.635111,6.0,259,...,6.0,"[A_CAFE, A_LOCAL_COFFEE_SHOP, A_SUPERMARKET, B...","[59, 37, 11, 65, 2, 44, 7, 22, 37, 48, 1, 1, 2...","[143.04999999999998, 91.64999999999999, 927.01...","[100274, 12782, 1476, 15938, 25098, 35916, 421...","[9, 11, 4, 15, 9, 44, 22, 15, 12, 10, 18, 21, ...","[51.5, 62.0, 25.5, 91.0, 61.5, 304.04, 149.7, ...","[Alcohol, Apparel, Books, Cafes, Coffee Beans ...","[1, 3, 1, 255, 7, 35, 6, 1, 262, 1, 357, 5, 46...","[9.0, 337.0, 15.0, 688.2, 166.35999999999999, ..."


In [52]:
# Saves the dataframes to csv files, in the specific folder
user_slim_df.to_csv('SavedData/user_slim.csv', index=False)
user_large_df.to_csv('SavedData/user_large.csv', index=False)