In [8]:
import pandas as pd
# Let long text cells render up to 200 characters before pandas truncates them.
pd.options.display.max_colwidth = 200

import seaborn as sns
# Apply the seaborn theme and make every new figure 15 x 5 inches by default.
sns.set(rc={'figure.figsize':(15,5)})

import matplotlib.pyplot as plt
import numpy as np

In [9]:
# Merchant key plus the three aggregate features that are analysed below.
agg_columns = [
    'merchant_abn',
    'total_number_of_distinct_customers',
    'monthly_average_number_of_orders',
    'monthly_average_bnpl_revenue',
]
merchant_agg = pd.read_csv("../data/curated/clusters/input/agg_transaction_pred.csv").loc[:, agg_columns]

# Cluster assignment per merchant (columns: merchant_abn, label).
merchant_clusters = pd.read_csv("../data/curated/clusters/output/merchant_clusters.csv")

# Preview the first three rows of each table.
display(merchant_agg.head(3), merchant_clusters.head(3))

Unnamed: 0,merchant_abn,total_number_of_distinct_customers,monthly_average_number_of_orders,monthly_average_bnpl_revenue
0,24406529929,3307,198.555556,13346.131338
1,28767881738,3,1.0,4448.732784
2,56395390867,40,2.666667,2543.462871


Unnamed: 0,merchant_abn,label
0,10023283211,2
1,10342410215,2
2,10346855916,0


**Merge datasets**

In [10]:
# Attach the aggregate features to each merchant's cluster label. The inner
# join keeps only merchants present in both tables; the join key itself is
# not needed afterwards, so it is dropped.
merged_feature_cluster = pd.merge(
    merchant_clusters,
    merchant_agg,
    how='inner',
    on='merchant_abn',
).drop(columns='merchant_abn')
display(merged_feature_cluster.head(3))

Unnamed: 0,label,total_number_of_distinct_customers,monthly_average_number_of_orders,monthly_average_bnpl_revenue
0,2,3307,198.555556,13346.131338
1,0,3,1.0,4448.732784
2,0,40,2.666667,2543.462871


**Perform log transformation**

In [11]:
# Natural-log transform every column after the first (the first column is the
# cluster label), prefix the transformed columns with 'log_', and re-attach
# the label as a categorical 'cluster' column.
log_features = np.log(merged_feature_cluster.iloc[:, 1:])
agg_byCluster = log_features.add_prefix('log_').assign(
    cluster=pd.Categorical(merged_feature_cluster['label'])
)

display(agg_byCluster.head(3))
# Per-cluster summary statistics, transposed so clusters become the columns.
display(agg_byCluster.groupby('cluster').describe().T)

Unnamed: 0,log_total_number_of_distinct_customers,log_monthly_average_number_of_orders,log_monthly_average_bnpl_revenue,cluster
0,8.103797,5.291069,9.498982,2
1,1.098612,0.0,8.400375,0
2,3.688879,0.980829,7.841282,0


Unnamed: 0,cluster,0,1,2
log_total_number_of_distinct_customers,count,220.0,2.0,159.0
log_total_number_of_distinct_customers,mean,2.738297,10.005054,6.552171
log_total_number_of_distinct_customers,std,1.239096,0.097311,1.196086
log_total_number_of_distinct_customers,min,0.0,9.936245,4.812184
log_total_number_of_distinct_customers,25%,1.94591,9.97065,5.703777
log_total_number_of_distinct_customers,50%,2.890372,10.005054,6.194405
log_total_number_of_distinct_customers,75%,3.669891,10.039459,7.283634
log_total_number_of_distinct_customers,max,4.787492,10.073863,9.808352
log_monthly_average_number_of_orders,count,220.0,2.0,159.0
log_monthly_average_number_of_orders,mean,0.594833,8.241982,3.704778


## Plot the Marginal Distances

In [12]:
def marginal_error(x_variable, agg_byCluster=agg_byCluster, iqr_multiplier=1.5):
    '''Plot one feature against cluster membership, mark cluster ranges and overlaps, and save it.

    For every cluster an outlier-trimmed range of ``x_variable`` is computed:
    the quartile fences ``Q1 - k*IQR`` / ``Q3 + k*IQR`` clamped to the observed
    min / max. Each range is drawn as a pair of dotted vertical lines in the
    cluster's own colour. For every pair of clusters whose ranges overlap, a
    red vertical line is drawn at the midpoint of the overlapping region.

    Parameters
    ----------
    x_variable : str
        Name of the feature column in ``agg_byCluster`` to plot on the x axis.
    agg_byCluster : pandas.DataFrame
        Frame holding ``x_variable`` and a ``cluster`` column. Note that the
        default is bound once, when this function is defined.
    iqr_multiplier : float, default 1.5
        The ``k`` factor applied to the inter-quartile range when trimming.

    Returns
    -------
    None
        The figure is written to ``../plots/<x_variable>.jpg`` and then closed.
    '''
    # One row of summary statistics per cluster. observed=False keeps every
    # category of a categorical `cluster` column so the colour assignment
    # below lines up with the hue levels seaborn derives from the same column.
    stats = agg_byCluster.groupby('cluster', observed=False)[x_variable].describe()

    # Outlier-trimmed range per cluster: quartile fences clamped to min / max.
    whisker = iqr_multiplier * (stats['75%'] - stats['25%'])
    lower = np.maximum(stats['25%'] - whisker, stats['min'])
    upper = np.minimum(stats['75%'] + whisker, stats['max'])

    # Derive clusters and colours from the data instead of hard-coding three
    # ids; the same mapping is given to the scatter plot so the boundary lines
    # always match the colour of the points they belong to.
    clusters = list(stats.index)
    palette = dict(zip(clusters, sns.color_palette('deep', n_colors=len(clusters))))

    # Explicit figure / axes so nothing leaks into pyplot's global state.
    fig, ax = plt.subplots()
    sns.scatterplot(data=agg_byCluster, hue='cluster', style='cluster',
                    x=x_variable, y='cluster', palette=palette, ax=ax)

    # Clusters without usable bounds (e.g. an empty category) are skipped.
    bounds = {
        cluster: (low, high)
        for cluster, low, high in zip(clusters, lower, upper)
        if not (np.isnan(low) or np.isnan(high))
    }

    # visualizing the boundaries
    for cluster, (low, high) in bounds.items():
        ax.axvline(x=low, color=palette[cluster], linestyle=':')
        ax.axvline(x=high, color=palette[cluster], linestyle=':')

    # plot the marginal distances: test every pair symmetrically, so the
    # result does not depend on cluster ids happening to be sorted by value.
    usable = list(bounds)
    for position, first in enumerate(usable):
        for second in usable[position + 1:]:
            overlap_start = max(bounds[first][0], bounds[second][0])
            overlap_end = min(bounds[first][1], bounds[second][1])
            if overlap_start < overlap_end:
                ax.axvline(x=(overlap_start + overlap_end) / 2, color='r',
                           label=f'marginal_error_({first}, {second})')

    ax.legend()
    fig.savefig(f'../plots/{x_variable}.jpg')
    # Close rather than clear, so no empty figure is left open per call.
    plt.close(fig)

In [13]:
# Every column except the last one ('cluster') is a log-transformed feature;
# save one marginal-distance plot per feature.
log_feature_columns = agg_byCluster.columns[:-1]
for x_variable in log_feature_columns:
    marginal_error(x_variable)

<Figure size 1080x360 with 0 Axes>