# Tally of Merchant Clusters

In [62]:
import pandas as pd
pd.options.display.max_colwidth = 200

In [122]:
merchant_details = pd.read_parquet("../data/tables/tbl_merchants.parquet").reset_index('merchant_abn')
merchant_clusters = pd.read_csv("../data/curated/merchant_clusters.csv")

display(merchant_details.head(3))
display(merchant_clusters.head(3))

print('# distinct abn in merchant_clusters:', len(merchant_clusters.merchant_abn.unique()))
print('# distinct abn in merchant_details:', len(merchant_details.merchant_abn.unique()))

new_merchant = sum([1 for abn in set(merchant_clusters.merchant_abn.unique()) if abn not in set(merchant_details.merchant_abn.unique())])
print('# new merchant appears in merchant_clusters:', new_merchant)

Unnamed: 0,merchant_abn,name,tags
0,10023283211,Felis Limited,"((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))"
1,10142254217,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])"
2,10165489824,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])"


Unnamed: 0,merchant_abn,label
0,10023283211,2
1,10342410215,2
2,10346855916,0


# distinct abn in merchant_clusters: 4380
# distinct abn in merchant_details: 4026
# new merchant appears in merchant_clusters: 381


## Extract the store categories

**Clean the tags**

In [123]:
# split the product description, revenue level and take rate
merchant_details[['prod_desc','revenue_level','take_rate']] = merchant_details.tags.str.split("\],|\), ",expand=True)
# only keep the product description
merchant_details = merchant_details.drop(columns=['tags','revenue_level','take_rate', 'name'])
# clean the strings
merchant_details.prod_desc = merchant_details.prod_desc.map(lambda x: x.lstrip(' ,([').rstrip(' )]').lower().replace('  ', ' '))
display(merchant_details.head())

Unnamed: 0,merchant_abn,prod_desc
0,10023283211,"furniture, home furnishings and equipment shops, and manufacturers, except appliances"
1,10142254217,"cable, satellite, and other pay television and radio services"
2,10165489824,"jewelry, watch, clock, and silverware shops"
3,10187291046,"watch, clock, and jewelry repair shops"
4,10192359162,"music shops - musical instruments, pianos, and sheet music"


**We only have 25 distinct product descriptions**

In [124]:
display(merchant_details.describe())
display(set(merchant_details.prod_desc.values))

Unnamed: 0,merchant_abn
count,4026.0
mean,54461870000.0
std,25989390000.0
min,10023280000.0
25%,31648280000.0
50%,54329220000.0
75%,76627730000.0
max,99990540000.0


{'antique shops - sales, repairs, and restoration services',
 'art dealers and galleries',
 'artist supply and craft shops',
 'bicycle shops - sales and service',
 'books, periodicals, and newspapers',
 'cable, satellite, and other pay television and radio services',
 'computer programming , data processing, and integrated systems design services',
 'computers, computer peripheral equipment, and software',
 'digital goods: books, movies, music',
 'equipment, tool, furniture, and appliance rent al and leasing',
 'florists supplies, nursery stock, and flowers',
 'furniture, home furnishings and equipment shops, and manufacturers, except appliances',
 'gift, card, novelty, and souvenir shops',
 'health and beauty spas',
 'hobby, toy and game shops',
 'jewelry, watch, clock, and silverware shops',
 'lawn and garden supply outlets, including nurseries',
 'motor vehicle supplies and new parts',
 'music shops - musical instruments, pianos, and sheet music',
 'opticians, optical goods, and eye

## Build the Tally!

**Inner join the product description and clustered labels**

In [162]:
prod_label_merged = merchant_clusters.merge(merchant_details, on='merchant_abn', how='inner')\
    .drop(columns=['merchant_abn'])
display(prod_label_merged.head(3))

Unnamed: 0,label,prod_desc
0,2,"furniture, home furnishings and equipment shops, and manufacturers, except appliances"
1,2,"computers, computer peripheral equipment, and software"
2,0,"equipment, tool, furniture, and appliance rent al and leasing"


**Create an empty table**

In [233]:
my_tally = pd.DataFrame(index=set(merchant_details.prod_desc.unique()), columns=['cluster_' + str(cluster) for cluster in set(merchant_clusters.label.unique())]).fillna(0)
my_tally.index.name = 'product_description'
#display(my_tally.head(3))

Unnamed: 0_level_0,cluster_0,cluster_1,cluster_2
product_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"florists supplies, nursery stock, and flowers",0,0,0
bicycle shops - sales and service,0,0,0
"antique shops - sales, repairs, and restoration services",0,0,0


**Fill the frequency in!**

In [234]:
for cluster in set(merchant_clusters.label.unique()):
    for prod_desc in prod_label_merged[prod_label_merged.label == cluster].prod_desc.values:
        my_tally.loc[prod_desc, 'cluster_'+str(cluster)] += 1

my_tally['sum'] = my_tally.sum(axis=1)    
display(my_tally)

Unnamed: 0_level_0,cluster_0,cluster_1,cluster_2,sum
product_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"florists supplies, nursery stock, and flowers",106,12,62,180
bicycle shops - sales and service,97,11,62,170
"antique shops - sales, repairs, and restoration services",85,7,21,113
tent and awning shops,74,43,61,178
"stationery, office supplies and printing and writing paper",109,3,49,161
"watch, clock, and jewelry repair shops",88,29,53,170
shoe shops,103,17,65,185
"lawn and garden supply outlets, including nurseries",85,14,54,153
"gift, card, novelty, and souvenir shops",65,47,70,182
"computer programming , data processing, and integrated systems design services",111,19,61,191


In [222]:
my_tally.sum()

cluster_0    2260
cluster_1     433
cluster_2    1306
dtype: int64

In [248]:
for i in range(3):
    display(my_tally.sort_values([f'cluster_{i}'], ascending=False)[f'cluster_{i}'].head(5))

product_description
music shops - musical instruments, pianos, and sheet music                        119
artist supply and craft shops                                                     116
computer programming , data processing, and integrated systems design services    111
stationery, office supplies and printing and writing paper                        109
florists supplies, nursery stock, and flowers                                     106
Name: cluster_0, dtype: int64

product_description
digital goods: books, movies, music                              55
gift, card, novelty, and souvenir shops                          47
cable, satellite, and other pay television and radio services    46
tent and awning shops                                            43
watch, clock, and jewelry repair shops                           29
Name: cluster_1, dtype: int64

product_description
digital goods: books, movies, music                                                      74
gift, card, novelty, and souvenir shops                                                  70
artist supply and craft shops                                                            67
cable, satellite, and other pay television and radio services                            65
furniture, home furnishings and equipment shops, and manufacturers, except appliances    65
Name: cluster_2, dtype: int64

## Take a look at the percentages

**By product descriptions**

In [249]:
byProd_tally = my_tally.iloc[:,0:3].copy()
for col in byProd_tally.columns:
    byProd_tally[col] = byProd_tally[col]/my_tally['sum']


In [250]:
byProd_tally

Unnamed: 0_level_0,cluster_0,cluster_1,cluster_2
product_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"florists supplies, nursery stock, and flowers",0.588889,0.066667,0.344444
bicycle shops - sales and service,0.570588,0.064706,0.364706
"antique shops - sales, repairs, and restoration services",0.752212,0.061947,0.185841
tent and awning shops,0.41573,0.241573,0.342697
"stationery, office supplies and printing and writing paper",0.677019,0.018634,0.304348
"watch, clock, and jewelry repair shops",0.517647,0.170588,0.311765
shoe shops,0.556757,0.091892,0.351351
"lawn and garden supply outlets, including nurseries",0.555556,0.091503,0.352941
"gift, card, novelty, and souvenir shops",0.357143,0.258242,0.384615
"computer programming , data processing, and integrated systems design services",0.581152,0.099476,0.319372


In [251]:
for i in range(3):
    display(byProd_tally.sort_values([f'cluster_{i}'], ascending=False)[f'cluster_{i}'].head(5))

product_description
jewelry, watch, clock, and silverware shops                   0.901235
antique shops - sales, repairs, and restoration services      0.752212
telecom                                                       0.728000
art dealers and galleries                                     0.720721
music shops - musical instruments, pianos, and sheet music    0.712575
Name: cluster_0, dtype: float64

product_description
digital goods: books, movies, music                              0.282051
cable, satellite, and other pay television and radio services    0.262857
gift, card, novelty, and souvenir shops                          0.258242
tent and awning shops                                            0.241573
opticians, optical goods, and eyeglasses                         0.178808
Name: cluster_1, dtype: float64

product_description
gift, card, novelty, and souvenir shops                          0.384615
digital goods: books, movies, music                              0.379487
cable, satellite, and other pay television and radio services    0.371429
health and beauty spas                                           0.365854
bicycle shops - sales and service                                0.364706
Name: cluster_2, dtype: float64

**By cluster labels**

In [252]:
byCluster_tally = my_tally.iloc[:,0:3].copy()
for col in byCluster_tally.columns:
    byCluster_tally[col] = byCluster_tally[col]/my_tally[col].sum()

In [253]:
byCluster_tally

Unnamed: 0_level_0,cluster_0,cluster_1,cluster_2
product_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"florists supplies, nursery stock, and flowers",0.046903,0.027714,0.047473
bicycle shops - sales and service,0.04292,0.025404,0.047473
"antique shops - sales, repairs, and restoration services",0.037611,0.016166,0.01608
tent and awning shops,0.032743,0.099307,0.046708
"stationery, office supplies and printing and writing paper",0.04823,0.006928,0.037519
"watch, clock, and jewelry repair shops",0.038938,0.066975,0.040582
shoe shops,0.045575,0.039261,0.04977
"lawn and garden supply outlets, including nurseries",0.037611,0.032333,0.041348
"gift, card, novelty, and souvenir shops",0.028761,0.108545,0.053599
"computer programming , data processing, and integrated systems design services",0.049115,0.04388,0.046708


In [254]:
for i in range(3):
    display(byCluster_tally.sort_values([f'cluster_{i}'], ascending=False)[f'cluster_{i}'].head(5))

product_description
music shops - musical instruments, pianos, and sheet music                        0.052655
artist supply and craft shops                                                     0.051327
computer programming , data processing, and integrated systems design services    0.049115
stationery, office supplies and printing and writing paper                        0.048230
florists supplies, nursery stock, and flowers                                     0.046903
Name: cluster_0, dtype: float64

product_description
digital goods: books, movies, music                              0.127021
gift, card, novelty, and souvenir shops                          0.108545
cable, satellite, and other pay television and radio services    0.106236
tent and awning shops                                            0.099307
watch, clock, and jewelry repair shops                           0.066975
Name: cluster_1, dtype: float64

product_description
digital goods: books, movies, music                                                      0.056662
gift, card, novelty, and souvenir shops                                                  0.053599
artist supply and craft shops                                                            0.051302
cable, satellite, and other pay television and radio services                            0.049770
furniture, home furnishings and equipment shops, and manufacturers, except appliances    0.049770
Name: cluster_2, dtype: float64

22/09/28 06:10:29 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1065669 ms exceeds timeout 120000 ms
22/09/28 06:10:29 WARN SparkContext: Killing executors is not supported by current scheduler.


## Save to CSV

In [178]:
my_tally.to_csv('../data/curated/tally_merchant_cluster_prod_desc.csv')