# Tally of Merchant Clusters

In [62]:
import pandas as pd
pd.options.display.max_colwidth = 200

In [122]:
merchant_details = pd.read_parquet("../data/tables/tbl_merchants.parquet").reset_index('merchant_abn')
merchant_clusters = pd.read_csv("../data/curated/merchant_clusters.csv")

display(merchant_details.head(3))
display(merchant_clusters.head(3))

print('# distinct abn in merchant_clusters:', len(merchant_clusters.merchant_abn.unique()))
print('# distinct abn in merchant_details:', len(merchant_details.merchant_abn.unique()))

new_merchant = sum([1 for abn in set(merchant_clusters.merchant_abn.unique()) if abn not in set(merchant_details.merchant_abn.unique())])
print('# new merchant appears in merchant_clusters:', new_merchant)

Unnamed: 0,merchant_abn,name,tags
0,10023283211,Felis Limited,"((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))"
1,10142254217,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])"
2,10165489824,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])"


Unnamed: 0,merchant_abn,label
0,10023283211,2
1,10342410215,2
2,10346855916,0


# distinct abn in merchant_clusters: 4380
# distinct abn in merchant_details: 4026
# new merchant appears in merchant_clusters: 381


## Extract the store categories

**Clean the tags**

In [123]:
# split the product description, revenue level and take rate
merchant_details[['prod_desc','revenue_level','take_rate']] = merchant_details.tags.str.split("\],|\), ",expand=True)
# only keep the product description
merchant_details = merchant_details.drop(columns=['tags','revenue_level','take_rate', 'name'])

# clean the strings
merchant_details.prod_desc = merchant_details.prod_desc.map(lambda x: x.lstrip(' ,([').rstrip(' )]').lower().replace('  ', ' '))
display(merchant_details.head())

Unnamed: 0,merchant_abn,prod_desc
0,10023283211,"furniture, home furnishings and equipment shops, and manufacturers, except appliances"
1,10142254217,"cable, satellite, and other pay television and radio services"
2,10165489824,"jewelry, watch, clock, and silverware shops"
3,10187291046,"watch, clock, and jewelry repair shops"
4,10192359162,"music shops - musical instruments, pianos, and sheet music"


**We only have 25 distinct product descriptions**

In [124]:
display(merchant_details.describe())
display(set(merchant_details.prod_desc.values))

Unnamed: 0,merchant_abn
count,4026.0
mean,54461870000.0
std,25989390000.0
min,10023280000.0
25%,31648280000.0
50%,54329220000.0
75%,76627730000.0
max,99990540000.0


{'antique shops - sales, repairs, and restoration services',
 'art dealers and galleries',
 'artist supply and craft shops',
 'bicycle shops - sales and service',
 'books, periodicals, and newspapers',
 'cable, satellite, and other pay television and radio services',
 'computer programming , data processing, and integrated systems design services',
 'computers, computer peripheral equipment, and software',
 'digital goods: books, movies, music',
 'equipment, tool, furniture, and appliance rent al and leasing',
 'florists supplies, nursery stock, and flowers',
 'furniture, home furnishings and equipment shops, and manufacturers, except appliances',
 'gift, card, novelty, and souvenir shops',
 'health and beauty spas',
 'hobby, toy and game shops',
 'jewelry, watch, clock, and silverware shops',
 'lawn and garden supply outlets, including nurseries',
 'motor vehicle supplies and new parts',
 'music shops - musical instruments, pianos, and sheet music',
 'opticians, optical goods, and eye

## Build the Tally!

**Inner join the product description and clustered labels**

In [162]:
prod_label_merged = merchant_clusters.merge(merchant_details, on='merchant_abn', how='inner')\
    .drop(columns=['merchant_abn'])
display(prod_label_merged.head(3))

Unnamed: 0,label,prod_desc
0,2,"furniture, home furnishings and equipment shops, and manufacturers, except appliances"
1,2,"computers, computer peripheral equipment, and software"
2,0,"equipment, tool, furniture, and appliance rent al and leasing"


**Create an empty table**

In [174]:
my_tally = pd.DataFrame(index=set(merchant_details.prod_desc.unique()), columns=['cluster_' + str(cluster) for cluster in set(merchant_clusters.label.unique())]).fillna(0)
my_tally.index.name = 'product_description'
display(my_tally.head(3))

Unnamed: 0_level_0,cluster_0,cluster_1,cluster_2
product_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"florists supplies, nursery stock, and flowers",0,0,0
bicycle shops - sales and service,0,0,0
"antique shops - sales, repairs, and restoration services",0,0,0


**Fill the frequency in!**

In [175]:
for cluster in set(merchant_clusters.label.unique()):
    for prod_desc in prod_label_merged[prod_label_merged.label == cluster].prod_desc.values:
        my_tally.loc[prod_desc, 'cluster_'+str(cluster)] += 1
        
display(my_tally)

Unnamed: 0_level_0,cluster_0,cluster_1,cluster_2
product_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"florists supplies, nursery stock, and flowers",106,12,62
bicycle shops - sales and service,97,11,62
"antique shops - sales, repairs, and restoration services",85,7,21
tent and awning shops,74,43,61
"stationery, office supplies and printing and writing paper",109,3,49
"watch, clock, and jewelry repair shops",88,29,53
shoe shops,103,17,65
"lawn and garden supply outlets, including nurseries",85,14,54
"gift, card, novelty, and souvenir shops",65,47,70
"computer programming , data processing, and integrated systems design services",111,19,61


## Save to CSV

In [178]:
my_tally.to_csv('../data/curated/tally_merchant_cluster_prod_desc.csv')