# Combine take rate and imputed take rate into one file

Purpose:
- take cluster output as input and return full take rate for all merchants

In [2]:
import pandas as pd

In [3]:
# read in data
clusters = pd.read_csv('../data/curated/clusters/output/merchant_clusters.csv')
cluster_take_rate = pd.read_csv('../data/curated/clusters/output/take_rate.csv')
merchant_data = pd.read_parquet('../data/tables/tbl_merchants.parquet')

In [4]:
# extract the real take rates from companies that do not need imputation
tags = list(merchant_data['tags'])
take_rate = [float(tag[-6:-2]) for tag in tags]
real_take_rate = pd.DataFrame({"merchant_abn":merchant_data.index, "take_rate":take_rate})
real_take_rate

Unnamed: 0,merchant_abn,take_rate
0,10023283211,0.18
1,10142254217,4.22
2,10165489824,4.40
3,10187291046,3.29
4,10192359162,6.33
...,...,...
4021,99938978285,4.50
4022,99974311662,3.17
4023,99976658299,6.57
4024,99987905597,6.82


In [5]:
# get clusters and their imputed take rate
cluster_with_take_rate = clusters.merge(cluster_take_rate, on = ['label'], how = 'inner')
cluster_with_take_rate = cluster_with_take_rate.rename(columns={'take_rate':'imputed_take_rate'})

In [6]:
# merge imputed and real take rate (which also has merchant ABN)
merchant_take_rate_full = cluster_with_take_rate.merge(real_take_rate, on='merchant_abn', how='left')

In [7]:
merchant_take_rate_full

Unnamed: 0,merchant_abn,label,imputed_take_rate,take_rate
0,10023283211,2,4.380858,0.18
1,10342410215,2,4.380858,6.34
2,10714068705,2,4.380858,2.51
3,14148282104,2,4.380858,6.12
4,15115332331,2,4.380858,5.64
...,...,...,...,...
4375,82065156333,1,4.559353,5.88
4376,88393306198,1,4.559353,3.68
4377,96566672398,1,4.559353,2.05
4378,32234779638,1,4.559353,


In [8]:
# combine the two rates columns using dummy variable trick
merchant_take_rate_full['dummy'] = merchant_take_rate_full['take_rate'].apply(lambda x:1 if x>=0 else 0)
merchant_take_rate_full = merchant_take_rate_full.fillna(0)
merchant_take_rate_full['take_rate'] = merchant_take_rate_full['take_rate'] * merchant_take_rate_full['dummy'] + merchant_take_rate_full['imputed_take_rate']

In [9]:
# then once combined, drop the useless rows
merchant_take_rate_full = merchant_take_rate_full.drop(['dummy', 'imputed_take_rate'], axis=1)

In [10]:
merchant_take_rate_full

Unnamed: 0,merchant_abn,label,take_rate
0,10023283211,2,4.560858
1,10342410215,2,10.720858
2,10714068705,2,6.890858
3,14148282104,2,10.500858
4,15115332331,2,10.020858
...,...,...,...
4375,82065156333,1,10.439353
4376,88393306198,1,8.239353
4377,96566672398,1,6.609353
4378,32234779638,1,4.559353


In [11]:
# export file
merchant_take_rate_full.to_csv('../data/curated/clusters/output/merchant_take_rate_full.csv', index=False)