# 1. Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the raw transaction export into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./data/fake_transactional_data_24.csv")

In [4]:
# Replace the verbose source column names with short ones. The result is
# reassigned rather than mutated in place; the outcome for `data` is the same.
data = data.rename(
    columns={
        'from_totally_fake_account': 'from_account',
        'monopoly_money_amount': 'money_amount',
        'to_randomly_generated_account': 'to_account',
        'not_happened_yet_date': 'date',
    }
)
display(data.head())

Unnamed: 0,from_account,money_amount,to_account,date
0,10371.0,4.0,CINEMA,01/01/2025
1,88339.0,2.4,40544,01/01/2025
2,18555.0,2.4,85149,01/01/2025
3,18555.0,4.1,HIPSTER_COFFEE_SHOP,01/01/2025
4,80792.0,1.95,18555,01/01/2025


In [5]:
from datetime import datetime

In [6]:
# Account ids are read as floats (e.g. 10371.0); store them as integers.
data["from_account"] = data["from_account"].astype(int)

# A recipient made up only of digits is another account, i.e. a transfer
# between people rather than a purchase. Collapse all of those into a single
# label and leave merchant names untouched.
is_personal_transfer = data["to_account"].astype(str).str.isdigit()
data["to_account"] = data["to_account"].where(~is_personal_transfer, "PERSONAL_TRANSFER")

# Parse day/month/year strings with pandas instead of `datetime.strptime`.
# This keeps the cell independent of what the name `datetime` is bound to (a
# later cell runs `import datetime`, which rebinds it to the module), and lets
# the cell be re-run on an already-parsed column without raising.
data["date"] = pd.to_datetime(data["date"], format="%d/%m/%Y")

In [7]:
# Keep the four analysis columns and only the rows paid to a merchant
# (anything that is not a personal transfer), renumbering the rows from zero.
consumption = (
    data.loc[
        data["to_account"] != "PERSONAL_TRANSFER",
        ['from_account', 'money_amount', 'to_account', 'date'],
    ]
    .reset_index(drop=True)
)

In [8]:
# Every distinct merchant name that appears as a payment recipient.
type_set = set(consumption['to_account'].unique())

In [9]:
# Merchants whose name mentions coffee or a cafe.
coffee_set = {
    merchant
    for merchant in type_set
    if "COFFEE" in merchant or "CAFE" in merchant
}

In [10]:
# Merchants whose name contains any of the bar-related keywords.
bar_keywords = {"BAR", "PUB",'LOCAL_WATERING_HOLE'}
bar_set = {
    merchant
    for merchant in type_set
    if any(keyword in merchant for keyword in bar_keywords)
}

In [11]:
# Merchants matched by a restaurant keyword, plus a few names that are added
# explicitly because no keyword matches them.
restaurant_keywords = {"RESTAURANT","KEBAB","SANDWICH","LUNCH","TAKEAWAY"}
restaurant_set = {
    merchant
    for merchant in type_set
    if any(keyword in merchant for keyword in restaurant_keywords)
}
restaurant_set = restaurant_set | {'TURKEY_FARM','TO_BEAN_OR_NOT_TO_BEAN','STEAK_HOUSE','SEAFOOD_RESAURANT'}

In [12]:
# Butchers are found by keyword; the other food and drink retailers are
# listed by name.
food_retail_set = {merchant for merchant in type_set if "BUTCHER" in merchant}
food_retail_set = food_retail_set | {
    'LIQUOR_STORE', 'WINE_CELLAR', 'WHISKEY_SHOP', 'ROASTERIE',
    'GREENGROCER', 'TEA_SHOP', 'WE_HAVE_BEAN_WEIGHTING',
}

In [13]:
# Hand-curated merchant groups. The names are matched literally against the
# values in `to_account`, so the spellings below are left exactly as written.
toy_game_set = {"TOY_SHOP", "VIDEO_GAME_STORE", "CHILDRENDS_SHOP", "GAME_SHOP"}
clothing_set = {
    "FASHION_SHOP", "FASHIONABLE_SPORTSWARE_SHOP", "CLOTHES_SHOP", 
    "TRAINER_SHOP", "JEWLLERY_SHOP", "ACCESSORY_SHOP", "KIDS_CLOTHING_SHOP"
}
book_set = {
    "COMIC_BOOK_SHOP", "BOOKSHOP", "NERDY_BOOK_STORE", "LOCAL_BOOKSHOP", "SECOND_HAND_BOOKSHOP"
}
pet_set = {"PET_TOY_SHOP", "PET_SHOP"}
movie_TV_set = {"CINEMA",'STREAMING_SERVICE',"DVD_SHOP"}
electronic_set = {'ELECTRONICS_SHOP','HIPSTER_ELECTRONICS_SHOP','TECH_SHOP'}
sport_set = {'SPORT_SHOP', 'GYM','RUNNING_SHOP'}
supermarket_set = {"A_SUPERMARKET", "EXPRESS_SUPERMARKET", "LARGE_SUPERMARKET", "THE_SUPERMARKET", 'DEPARTMENT_STORE'}

# Union of every named group. `restaurant_set` and `book_set` are included so
# that `other_set` holds only merchants that belong to no group at all; leaving
# them out would put restaurants and bookshops into `other_set` as well.
non_other_set = (
    coffee_set | bar_set | restaurant_set | food_retail_set
    | toy_game_set | clothing_set | book_set | pet_set
    | movie_TV_set | electronic_set | sport_set | supermarket_set
)
other_set = type_set - non_other_set

In [14]:
# Ordered (merchant group, label) rules. The first group that contains a
# merchant decides its sub-category, so the order here is significant.
sub_category_rules = [
    (coffee_set, "Coffee Shop"),
    (bar_set, "Bar"),
    (restaurant_set, "Restaurant"),
    (food_retail_set, "Food Retail"),
    (toy_game_set, "Toy and Game"),
    (clothing_set, "Clothing Store"),
    (book_set, "Bookstore"),
    (pet_set, "Pet"),
    (movie_TV_set, "Movie and TV"),
    (electronic_set, "Electronic Product"),
    (sport_set, "Sport"),
    (supermarket_set, "Supermarket"),
    (other_set, "Other"),
]

# Merchant name -> sub-category label.
map_I_dict = {}
for merchant in type_set:
    for members, label in sub_category_rules:
        if merchant in members:
            map_I_dict[merchant] = label
            break

# Label every transaction with the sub-category of its merchant.
consumption["sub_category"] = consumption["to_account"].apply(lambda merchant: map_I_dict[merchant])

In [15]:
# Top-level categories and the sub-categories that roll up into each of them.
category_members = {
    "Food and Beverage": ["Coffee Shop", "Bar", "Restaurant", "Food Retail"],
    "Entertainment": ["Movie and TV", "Bookstore", "Toy and Game"],
    "Clothing": ["Clothing Store"],
    "Electronics": ["Electronic Product"],
    "Supermarket": ["Supermarket"],
    "Sport": ["Sport"],
    "Pet": ["Pet"],
    "Other": ["Other"],
}

# Sub-category -> category lookup, obtained by inverting the table above.
map_II_dict = {
    sub_category: category
    for category, sub_categories in category_members.items()
    for sub_category in sub_categories
}
consumption["category"] = consumption["sub_category"].apply(lambda sub_category: map_II_dict[sub_category])

In [None]:
# Calendar features derived from the transaction date:
#   weekday      full weekday name (strftime "%A")
#   week_number  strftime "%U" week of the year, shifted by one
#   month        full month name (strftime "%B")
consumption['weekday'] = consumption['date'].dt.strftime("%A")
consumption['week_number'] = consumption['date'].dt.strftime("%U").astype(int) + 1
consumption['month'] = consumption['date'].dt.strftime("%B")

In [None]:
# consumption.rename(columns={
#     "sub_category": "sub_category",
#     "category": "category"
# },inplace=True)
# display(consumption.head())

In [None]:
# Persist the prepared table (without the row index) for the analysis section.
consumption.to_csv(path_or_buf='./data/consumption.csv', index=False)

# 2. Analysis

## 2.1 Analysis on Individuals

In [None]:
# Reload the prepared table written at the end of the data-preparation section.
consumption = pd.read_csv(filepath_or_buffer='./data/consumption.csv')

In [None]:
# Mean transaction amount per account (rows) and sub-category (columns);
# combinations with no transactions are filled with 0.
inidividual_mean_consumption = pd.pivot_table(
    consumption,
    values='money_amount',
    index='from_account',
    columns='sub_category',
    aggfunc='mean',
    fill_value=0,
)
display(inidividual_mean_consumption)

In [None]:
# Total amount spent per account (rows) and sub-category (columns);
# combinations with no transactions are filled with 0.
inidividual_sum_consumption = pd.pivot_table(
    consumption,
    values='money_amount',
    index='from_account',
    columns='sub_category',
    aggfunc='sum',
    fill_value=0,
)
display(inidividual_sum_consumption)

## 2.2 Analysis on Weekdays

In [None]:
# Reload the prepared table and discard the payer account, which the
# time-based analyses below do not use.
consumption = pd.read_csv('./data/consumption.csv').drop(columns=['from_account'])

In [None]:
# Columns needed for the weekday analysis.
consumption_weekday = consumption.loc[:, ['money_amount', 'to_account', 'sub_category', 'category', 'weekday']]

# Calendar order used when sorting and plotting by weekday.
reorder_list = ['Sunday', 'Monday', 'Tuesday','Wednesday','Thursday', 'Friday', 'Saturday']

# Declare all seven weekdays as the categories up front. Converting with
# astype('category') and then calling reorder_categories raises a ValueError
# whenever some weekday does not occur in the data, because the new category
# list must match the existing categories exactly.
consumption_weekday['weekday'] = pd.Categorical(consumption_weekday['weekday'], categories=reorder_list)

In [None]:
# Print the column dtypes and non-null counts of the weekday frame.
consumption_weekday.info()

In [None]:
import datetime

In [None]:
# Count how many times each weekday occurs between the two dates (inclusive).
start_date = datetime.date(2025, 1, 1)
end_date = datetime.date(2025, 12, 31)

day_count = dict.fromkeys(
    ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'], 0
)

total_days = (end_date - start_date).days + 1
for offset in range(total_days):
    weekday_name = (start_date + datetime.timedelta(days=offset)).strftime("%A")
    day_count[weekday_name] += 1

In [None]:
# Summary statistics of the transaction amount for every (weekday, category)
# pair. `observed=False` is spelled out because `weekday` is categorical:
# relying on the implicit default is deprecated in recent pandas releases, and
# stating it keeps every weekday in the result. Named aggregation yields the
# final column names directly, so no rename step is needed.
consumption_weekday_category = (
    consumption_weekday
    .groupby(['weekday', 'category'], observed=False)['money_amount']
    .agg(
        total_transaction_amount='sum',
        average_transaction_amount='mean',
        std_transaction_amount='std',
        max_transaction_amount='max',
        min_transaction_amount='min',
        total_transaction_volume='count',
    )
    .sort_index()
)

# How often each row's weekday occurs in the year. The level is converted to
# plain strings first so the dict lookup returns ordinary integers whatever
# the dtype of the index level is.
weekday_occurrences = (
    consumption_weekday_category.index
    .get_level_values('weekday')
    .astype(str)
    .map(day_count)
)

# Transactions per occurrence of that weekday, truncated to a whole number.
consumption_weekday_category['average_transaction_volume'] = (
    consumption_weekday_category['total_transaction_volume'] / weekday_occurrences.to_numpy()
).astype(int)
display(consumption_weekday_category)

In [None]:
# One bar chart per metric on a 2x2 grid, with weekdays on the x-axis and one
# bar colour per category.
titles = ['total_transaction_amount','average_transaction_amount','total_transaction_volume', 'average_transaction_volume']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 18))

# Flatten the index once; every panel draws from the same frame.
weekday_plot_data = consumption_weekday_category.reset_index()
for ax, metric in zip(axes.flat, titles):
    sns.barplot(data=weekday_plot_data, x='weekday', y=metric, hue='category', ax=ax)
    ax.set_title(metric.replace('_', ' ').title())
    ax.set_xlabel('Weekday')

plt.tight_layout()
plt.savefig('./figs/weekdays.png')
plt.show()

According to the diagrams above, the `total transaction amount`, the `total transaction volume` and the `average transaction volume` of `Food and Beverage` are the highest on every weekday, and consumption is concentrated on `Fridays`, `Saturdays`, and `Sundays`. However, `Electronics` has the highest `average_transaction_amount`, followed by `Clothing` and `Supermarket`.

### 2.2.1 Further Analysis on `Food and Beverage`

In [None]:
# Occurrences of each weekday between the two dates (inclusive), used to turn
# total volumes into per-day averages.
start_date = datetime.date(2025, 1, 1)
end_date = datetime.date(2025, 12, 31)

weekday_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
day_count = {name: 0 for name in weekday_names}

for ordinal in range(start_date.toordinal(), end_date.toordinal() + 1):
    day_count[datetime.date.fromordinal(ordinal).strftime("%A")] += 1

In [None]:
# Restrict to Food and Beverage transactions, then summarise the transaction
# amount for every (weekday, sub_category) pair. `observed=False` is spelled
# out because `weekday` is categorical: relying on the implicit default is
# deprecated in recent pandas releases. Named aggregation yields the final
# column names directly, so no rename step is needed.
consumption_food_weekday = (
    consumption_weekday[consumption_weekday['category'] == 'Food and Beverage']
    .groupby(['weekday', 'sub_category'], observed=False)['money_amount']
    .agg(
        total_transaction_amount='sum',
        average_transaction_amount='mean',
        std_transaction_amount='std',
        max_transaction_amount='max',
        min_transaction_amount='min',
        total_transaction_volume='count',
    )
    .sort_index()
)

# How often each row's weekday occurs in the year. The level is converted to
# plain strings first so the dict lookup returns ordinary integers whatever
# the dtype of the index level is.
food_weekday_occurrences = (
    consumption_food_weekday.index
    .get_level_values('weekday')
    .astype(str)
    .map(day_count)
)

# Transactions per occurrence of that weekday, truncated to a whole number.
consumption_food_weekday['average_transaction_volume'] = (
    consumption_food_weekday['total_transaction_volume'] / food_weekday_occurrences.to_numpy()
).astype(int)
display(consumption_food_weekday)

In [None]:
# One bar chart per metric on a 2x2 grid, with weekdays on the x-axis and one
# bar colour per Food and Beverage sub-category.
titles = ['total_transaction_amount','average_transaction_amount','total_transaction_volume', 'average_transaction_volume']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 18))

# Flatten the index once; every panel draws from the same frame.
food_plot_data = consumption_food_weekday.reset_index()
for ax, metric in zip(axes.flat, titles):
    sns.barplot(data=food_plot_data, x='weekday', y=metric, hue='sub_category', ax=ax)
    ax.set_title(metric.replace('_', ' ').title())
    ax.set_xlabel('Weekday')

plt.tight_layout()
plt.savefig('./figs/foodweekdays.png')
plt.show()


According to the diagrams above, the consumption of `Bar` is concentrated on `Fridays`, `Saturdays`, and `Sundays`, when it also reaches the highest `Total Transaction Amount`. However, `Food Retail` has the highest `Average Transaction Amount` on each weekday.

## 2.3 Analysis on Months

In [None]:
# Columns needed for the monthly analysis.
consumption_month = consumption.loc[:, ['money_amount', 'to_account', 'sub_category', 'category', 'month']]

# Calendar order used when sorting and plotting by month.
reorder_list = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

# Abbreviate the month names to three letters and declare all twelve months as
# the categories up front. Converting with astype('category') and then calling
# reorder_categories raises a ValueError whenever some month does not occur in
# the data, because the new category list must match the existing categories
# exactly.
consumption_month['month'] = pd.Categorical(consumption_month['month'].str[:3], categories=reorder_list)

In [None]:
# Number of days in each month (February has 28), keyed by the three-letter
# abbreviation used in the `month` column.
month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
day_count = dict(zip(month_abbreviations, month_lengths))

In [None]:
# Summary statistics of the transaction amount for every (month, category)
# pair. `observed=False` is spelled out because `month` is categorical:
# relying on the implicit default is deprecated in recent pandas releases, and
# stating it keeps every month in the result. Named aggregation yields the
# final column names directly, so no rename step is needed.
consumption_month_category = (
    consumption_month
    .groupby(['month', 'category'], observed=False)['money_amount']
    .agg(
        total_transaction_amount='sum',
        average_transaction_amount='mean',
        std_transaction_amount='std',
        max_transaction_amount='max',
        min_transaction_amount='min',
        total_transaction_volume='count',
    )
    .sort_index()
)

# Number of days in each row's month. The level is converted to plain strings
# first so the dict lookup returns ordinary integers whatever the dtype of the
# index level is.
days_in_month = (
    consumption_month_category.index
    .get_level_values('month')
    .astype(str)
    .map(day_count)
)

# Transactions per day of that month, truncated to a whole number.
consumption_month_category['average_transaction_volume'] = (
    consumption_month_category['total_transaction_volume'] / days_in_month.to_numpy()
).astype(int)
display(consumption_month_category)

In [None]:
# One line chart per metric on a 2x2 grid, with months on the x-axis and one
# line per category.
titles = ['total_transaction_amount','average_transaction_amount','total_transaction_volume', 'average_transaction_volume']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 18))

# Flatten the index once; every panel draws from the same frame.
month_plot_data = consumption_month_category.reset_index()
for ax, metric in zip(axes.flat, titles):
    sns.lineplot(data=month_plot_data, x='month', y=metric, hue='category', ax=ax)
    ax.set_title(metric.replace('_', ' ').title())
    ax.set_xlabel('Month')
    ax.set_ylabel('Value')

# Tighten the layout BEFORE saving so the PNG on disk matches the figure shown
# in the notebook (saving first writes the figure with the untightened layout).
plt.tight_layout()
plt.savefig('./figs/month.png')
plt.show()

According to the line charts above, except for `Food and Beverage`, the categories do not change sharply from month to month. In December, almost all categories experience an increase.