In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.options.display.max_colwidth = 200
sns.set(rc={'figure.figsize':(15,5)})

In [3]:
# Read the two curated tables: per-merchant aggregate features and per-merchant cluster labels.
merchant_agg, merchant_clusters = map(
    pd.read_csv,
    ("../data/curated/merchant_agg.csv", "../data/curated/merchant_clusters.csv"),
)

# Preview the first three rows of each table.
display(merchant_agg.head(3), merchant_clusters.head(3))

Unnamed: 0,merchant_abn,total_number_of_distinct_customers,monthly_average_number_of_orders,monthly_average_bnpl_revenue
0,10023283211,2619,154.333333,33301.586223
1,10342410215,729,40.888889,15624.420327
2,10346855916,7,1.4,2186.800497


Unnamed: 0,merchant_abn,label
0,10023283211,2
1,10342410215,2
2,10346855916,0


**Merge datasets**

In [4]:
# Inner-join labels with features on the merchant ABN (only merchants present in both
# tables survive), then drop the ABN because it is just the join key.
merged_feature_cluster = (
    merchant_clusters
    .merge(merchant_agg, how='inner', on='merchant_abn')
    .drop(columns='merchant_abn')
)
display(merged_feature_cluster.head(3))

Unnamed: 0,label,total_number_of_distinct_customers,monthly_average_number_of_orders,monthly_average_bnpl_revenue
0,2,2619,154.333333,33301.586223
1,2,729,40.888889,15624.420327
2,0,7,1.4,2186.800497


**Perform log transformation**

In [5]:
# Natural log of every column after the first one (the first column holds the cluster
# label), renamed with a `log_` prefix; the label is re-attached as a categorical `cluster`.
agg_byCluster = (
    np.log(merged_feature_cluster.iloc[:, 1:])
    .add_prefix('log_')
    .assign(cluster=pd.Categorical(merged_feature_cluster.label))
)

display(agg_byCluster.head(3))

# Summary statistics of each log feature, one column per cluster.
per_cluster_summary = agg_byCluster.groupby('cluster').describe()
display(per_cluster_summary.T)

Unnamed: 0,log_total_number_of_distinct_customers,log_monthly_average_number_of_orders,log_monthly_average_bnpl_revenue,cluster
0,7.870548,5.039115,10.41336,2
1,6.591674,3.710858,9.65659,2
2,1.94591,0.336472,7.690195,0


Unnamed: 0,cluster,0,1,2
log_total_number_of_distinct_customers,count,2480.0,435.0,1465.0
log_total_number_of_distinct_customers,mean,4.254012,9.126523,7.146709
log_total_number_of_distinct_customers,std,1.487147,0.52962,0.928573
log_total_number_of_distinct_customers,min,0.0,6.79794,0.693147
log_total_number_of_distinct_customers,25%,3.295837,8.799058,6.628041
log_total_number_of_distinct_customers,50%,4.454347,9.101529,7.223296
log_total_number_of_distinct_customers,75%,5.443498,9.47838,7.862112
log_total_number_of_distinct_customers,max,6.680855,10.089137,9.808352
log_monthly_average_number_of_orders,count,2480.0,435.0,1465.0
log_monthly_average_number_of_orders,mean,1.640434,6.578398,4.299531


## Plot the Marginal Distances

In [12]:
def _cluster_bounds(values_by_cluster, iqr_multiplier=1.5):
    '''Return the lower/upper bound of each cluster with IQR outliers trimmed.

    values_by_cluster: one feature grouped by cluster (a pandas SeriesGroupBy).
    iqr_multiplier: number of IQRs beyond the quartiles after which a value counts as an outlier.

    Returns a DataFrame indexed by cluster with the columns "lower" and "upper".
    '''
    stats = values_by_cluster.describe()
    iqr = stats["75%"] - stats["25%"]
    # clip the IQR fences to the observed range so a bound never lies outside the data
    lower = np.maximum(stats["25%"] - iqr_multiplier * iqr, stats["min"])
    upper = np.minimum(stats["75%"] + iqr_multiplier * iqr, stats["max"])
    return pd.DataFrame({"lower": lower, "upper": upper})


def _overlap_midpoint(lower_a, upper_a, lower_b, upper_b):
    '''Return the midpoint of the overlap of two intervals, or None when they do not overlap.'''
    # intersect the intervals symmetrically so the answer does not depend on which
    # cluster happens to hold the larger values; NaN bounds compare False and yield None
    overlap_start = np.maximum(lower_a, lower_b)
    overlap_end = np.minimum(upper_a, upper_b)
    if overlap_start < overlap_end:
        return (overlap_start + overlap_end) / 2
    return None


def marginal_error(x_variable, agg_byCluster=agg_byCluster, iqr_multiplier=1.5):
    '''this function creates plots of marginal distances between each cluster with respect to each feature

    x_variable: name of the feature column of `agg_byCluster` to put on the x axis.
    agg_byCluster: DataFrame holding the feature columns and a `cluster` column
        (the default is the frame that exists when this cell is executed).
    iqr_multiplier: IQR factor used to trim outliers from each cluster's range.

    Draws the points of every cluster, a dotted line at each cluster's trimmed lower and
    upper bound, and a red line in the middle of every overlap between two clusters.
    The figure is written to ../plots/<x_variable>.jpg and then closed; nothing is returned.
    '''
    fig, ax = plt.subplots()
    sns.scatterplot(data=agg_byCluster, hue='cluster', style='cluster', x=x_variable, y='cluster',
                    palette='deep', ax=ax)

    # compute boundaries for clusters, outliers are removed using iqr_multiplier * IQR
    bounds = _cluster_bounds(agg_byCluster.groupby('cluster', observed=False)[x_variable], iqr_multiplier)
    clusters = list(bounds.index)

    # visualizing the boundaries, one colour per cluster taken from the same 'deep' palette
    colours = sns.color_palette('deep', n_colors=len(clusters))
    for cluster, colour in zip(clusters, colours):
        ax.axvline(x=bounds.loc[cluster, "lower"], color=colour, linestyle=':')
        ax.axvline(x=bounds.loc[cluster, "upper"], color=colour, linestyle=':')

    # plot the marginal distances for every unordered pair of clusters
    for position, first in enumerate(clusters):
        for second in clusters[position + 1:]:
            midpoint = _overlap_midpoint(bounds.loc[first, "lower"], bounds.loc[first, "upper"],
                                         bounds.loc[second, "lower"], bounds.loc[second, "upper"])
            if midpoint is not None:
                ax.axvline(x=midpoint, color='r', label=f'marginal_error_({first}, {second})')

    ax.legend()

    # make sure the output directory exists, otherwise savefig raises FileNotFoundError
    Path('../plots').mkdir(parents=True, exist_ok=True)
    fig.savefig(f'../plots/{x_variable}.jpg')
    # close instead of clear so no empty figure is left behind for the notebook to render
    plt.close(fig)

<Figure size 1500x500 with 0 Axes>

In [None]:
# Every column except the last one (`cluster`) is a log feature; save one plot per feature.
log_feature_columns = list(agg_byCluster)[:-1]
for x_variable in log_feature_columns:
    marginal_error(x_variable)