In [89]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import copy

In [90]:
# Load the wholesale-customers CSV (path is relative to the notebook's working
# directory) and preview the first rows to check the columns were parsed.
customerdata_df = pd.read_csv("Wholesale customers data.csv")
customerdata_df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [91]:
# Work on a plain NumPy array from here on; the shape is displayed as a quick
# check of the (rows, columns) count.
customerdata_arr = customerdata_df.to_numpy() 
customerdata_arr.shape

(440, 8)

In [92]:
# Optional sanity check (disabled): value counts of column 0 (Channel) and column 1 (Region).
#np.unique(customerdata_arr[:, 0], return_counts= True)
#np.unique(customerdata_arr[:, 1], return_counts= True)

In [93]:
# Drop the first two columns (Channel, Region) and keep the remaining columns
# (Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen) as clustering features.
customer_spendingdata = customerdata_arr[:, 2:]

## Hierarchical K-Means

In [94]:
def split_data(data, K, thresh, min_data_per_cluster):
    """Try to split ``data`` into two clusters with one K-Means fit.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations belonging to the node that is being split.
    K : int
        Number of clusters requested from K-Means. Must be 2: exactly two
        clusters are returned, so any other value would either drop the
        samples assigned to clusters 2..K-1 or produce an empty second cluster.
    thresh : float
        Minimum Euclidean distance between the two cluster means for the
        split to be accepted.
    min_data_per_cluster : int
        Nodes holding fewer samples than this are not split.

    Returns
    -------
    tuple of (ndarray, ndarray) or None
        ``(c_0, c_1)``: the rows assigned to cluster 0 and cluster 1. ``None``
        when the node must stay a leaf: too few samples, one of the clusters
        came back empty, or the cluster means are closer than ``thresh``.

    Raises
    ------
    ValueError
        If ``K`` is not 2.
    """
    if K != 2:
        raise ValueError(f"split_data performs a binary split; K must be 2, got {K}")

    data = np.asarray(data)
    # K-Means cannot form K clusters from fewer than K samples, so treat that
    # the same way as a node that is below the minimum size.
    if len(data) < min_data_per_cluster or len(data) < K:
        return None

    km = KMeans(n_clusters=K, init='random', n_init=10, max_iter=30, tol=1e-02, random_state=0)
    y_km = km.fit_predict(data)

    # Boolean masks select each cluster's rows in their original order.
    c_0 = data[y_km == 0]
    c_1 = data[y_km == 1]

    # An empty cluster (e.g. all rows identical) has no mean; taking one would
    # give NaN, and a NaN distance would slip past the threshold test below.
    if len(c_0) == 0 or len(c_1) == 0:
        return None

    diff = np.linalg.norm(c_0.mean(axis=0) - c_1.mean(axis=0))
    if diff < thresh:
        return None  # Stop splitting the node if mean difference is less than threshold
    return c_0, c_1

In [95]:
def HierarchicalKMeans(data, K=2, max_depth=5, min_data_per_cluster=10, cluster_dist_thresh = 3000):
    """Build a binary cluster tree by repeatedly bisecting leaves with K-Means.

    Nodes are split breadth first. Every leaf is identified by a hash code
    made of '0'/'1' characters describing the path from the root, e.g. "010"
    is root -> child 0 -> child 1 -> child 0.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to segment.
    K : int, default 2
        Forwarded to ``split_data``. Only the first two clusters of each split
        are used as children ("0" and "1"), so 2 is the meaningful value.
    max_depth : int, default 5
        With the root counted as depth 1, the root split is followed by one
        splitting pass for every depth from 2 to ``max_depth`` inclusive, so
        hash codes have at most ``max(max_depth, 1)`` characters.
    min_data_per_cluster : int, default 10
        Forwarded to ``split_data``; smaller nodes are left as leaves.
    cluster_dist_thresh : float, default 3000
        Forwarded to ``split_data``; a split is rejected when the two cluster
        means are closer than this.

    Returns
    -------
    dict
        Maps each leaf's hash code to the array of rows in that leaf. If the
        root itself cannot be split, the result is ``{"": data}`` (the empty
        hash code denotes the root).

    Notes
    -----
    Prints the available hash codes after every splitting pass.
    """
    root_split = split_data(data, K, cluster_dist_thresh, min_data_per_cluster)
    if root_split is None:
        # Nothing to segment: return the whole data set as a single leaf
        # instead of failing while unpacking None.
        return {"": data}
    label_dict = {"0": root_split[0], "1": root_split[1]}

    # Leaves that split_data has already declined to split. The call would be
    # repeated with identical arguments on later passes (split_data fixes the
    # K-Means random_state), so the answer cannot change; skip the refit.
    settled = set()

    current_depth = 2  # Consider depth at root node to be 1

    while current_depth <= max_depth:  # Breadth first splitting of nodes
        # Iterate over a snapshot because label_dict is modified inside the loop.
        for label, node_data in list(label_dict.items()):
            if label in settled:
                continue
            children = split_data(node_data, K, cluster_dist_thresh, min_data_per_cluster)
            if children is None:
                settled.add(label)
                continue
            # Replace the parent by its two children, extending the hash code.
            del label_dict[label]
            label_dict[label + "0"] = children[0]
            label_dict[label + "1"] = children[1]
        print(f"At iteration {current_depth-1}, available hash codes = {label_dict.keys()}")
        current_depth += 1
    return label_dict

In [96]:
# Segment the spending features using the function's default parameters;
# the result maps each leaf hash code to the rows in that segment.
segmented_data_dict = HierarchicalKMeans(customer_spendingdata)

At iteration 1, available hash codes = dict_keys(['00', '01', '10', '11'])
At iteration 2, available hash codes = dict_keys(['11', '000', '001', '010', '011', '100', '101'])
At iteration 3, available hash codes = dict_keys(['11', '010', '101', '0000', '0001', '0010', '0011', '0110', '0111', '1000', '1001'])
At iteration 4, available hash codes = dict_keys(['11', '010', '101', '1001', '00000', '00001', '00010', '00011', '00100', '00101', '00110', '00111', '01100', '01101', '01110', '01111', '10000', '10001'])


In [97]:
def convert_seg_data_to_df(data_dict):
    """Stack the clusters of ``data_dict`` into one labelled DataFrame.

    Parameters
    ----------
    data_dict : dict
        Maps a cluster label (hash code) to a 2-D array-like of rows.

    Returns
    -------
    pandas.DataFrame
        All rows of all clusters, in dictionary order, with a fresh 0..n-1
        index. Feature columns keep their positional names (0, 1, ...); the
        cluster label is stored in the column right after the last feature
        column (column 6 for six-feature data). An empty DataFrame is
        returned when there are no rows at all.
    """
    frames = []
    for label, rows in data_dict.items():
        frame = pd.DataFrame(rows)
        if frame.empty:
            # Contributes no rows; skipping it also keeps an empty cluster
            # from influencing the column dtypes of the concatenated result.
            continue
        # Put the label after the features instead of at a fixed position, so
        # data with more than six features does not lose its seventh column.
        frame[frame.shape[1]] = label
        frames.append(frame)

    if not frames:
        return pd.DataFrame()  # pd.concat raises on an empty list
    return pd.concat(frames, ignore_index=True)

In [98]:
# Flatten the per-segment arrays into one DataFrame (the last column holds the
# segment hash code) and display it.
customer_segmented_data_df= convert_seg_data_to_df(segmented_data_dict)
customer_segmented_data_df

Unnamed: 0,0,1,2,3,4,5,6
0,26373,36423,22019,5154,4337,16523,11
1,44466,54259,55571,7782,24171,6465,11
2,35942,38369,59598,3254,26701,2017,11
3,16117,46197,92780,1026,40827,2944,11
4,22925,73498,32114,987,20070,903,11
...,...,...,...,...,...,...,...
435,40254,640,3600,1042,436,18,10001
436,42786,286,471,1388,32,22,10001
437,39679,3944,4955,1364,523,2235,10001
438,38793,3154,2648,1034,96,1242,10001
