In [1]:
import pandas as pd
from src.clustering_help_functions import create_clustering_object, define_type_of_editing, format_clustering_modification_dictionary
from src.reading_data import read_json_as_dict
from ml_clustering.clustering import ClusterEditor,ClusteringUtils,Clustering, AutoClustering

In [2]:
# Show the kernel's working directory; the relative paths used below are resolved against it.
# NOTE(review): the saved output exposes an absolute local path - consider clearing it before sharing.
import os
os.getcwd()

'/Users/kleomeniskogias/DataspellProjects/ML_Delivery_POCs'

In [3]:
# Configurations
# Both paths are relative, so they resolve against the kernel's working directory.
data_file = "data/sample_data.csv"  # CSV with the clustered records, loaded via pd.read_csv
clustering_modifications_file = "config/modifications.json"  # JSON describing the cluster edits to apply

In [4]:
# Read inputs
# index_col=False tells pandas not to use the first CSV column as the index.
input_df = pd.read_csv(data_file, index_col=False)
# Quick sanity check of the loaded frame: (rows, columns)
print(input_df.shape)
# Modification instructions; iterated one by one in the modification loop further down.
clustering_modifications = read_json_as_dict(clustering_modifications_file)

(1000, 27)


In [5]:
# Check what the distribution of the clusters looks like.
# The index and the counts are named explicitly instead of renaming whatever columns
# reset_index() produces by default: those default names differ between pandas 1.x
# ('index' / '<column name>') and pandas 2.x ('<column name>' / 'count'), and on 2.x the
# previous rename left no 'subcluster_number' column to sort by.
(input_df['global_subcluster_number'].value_counts()
 .rename_axis('subcluster_number')
 .reset_index(name='Count')
 .sort_values('subcluster_number'))

Unnamed: 0,subcluster_number,Count
16,0,17
12,1,40
18,4,13
19,5,11
14,6,19
2,7,86
4,8,65
7,9,60
5,11,64
6,12,62


In [6]:
# Create clustering object
# Wrap the input frame in the project's clustering object. The extra arguments look like
# the cluster-label column, a list of feature columns (["needs"]) and the record id
# column - TODO confirm against create_clustering_object's signature.
clustering = create_clustering_object(input_df, 'global_subcluster_number', ["needs"], 'submission_id')
# NOTE(review): the result of this call is discarded and, not being the cell's last
# expression, it is not displayed either - confirm whether the call is needed here.
clustering.cluster_counts(percentage=True)
# Editor used by the modification loop below to select and reassign records.
editor=ClusterEditor(clustering)

In [7]:
# Build the {cluster number: cluster name} lookup from the input frame.
# Rows are ordered by cluster number first; when a number occurs on several rows,
# the name from the last of those rows is the one kept.
clusters_df = input_df[
    ['global_subcluster_number', 'global_subcluster_name']
].sort_values('global_subcluster_number')
cluster_names_dict = {
    number: name
    for number, name in zip(
        clusters_df['global_subcluster_number'],
        clusters_df['global_subcluster_name'],
    )
}

The cell below applies four example modifications, all defined in the clustering modifications configuration file:
Example 1: Modifications using multiple fields
Example 2: Move to Noise
Example 3: Rename Cluster
Example 4: Create a new cluster
Note: the loop output numbers the modifications starting from 0.

In [8]:
# Run clustering modifications
# Each configured modification is classified, converted into a selection dictionary and,
# when it names a source cluster, staged through the editor and applied straight away.
for step, modification in enumerate(clustering_modifications):
    print(f"--------------- Modification {step} --------------------")

    # Type of editing: returns the (possibly adjusted) modification and the updated names mapping
    modification, cluster_names_dict = define_type_of_editing(
        modification, cluster_names_dict, clustering
    )

    # Modify the format of the dictionary so it can be passed as `selections`
    selections = format_clustering_modification_dictionary(modification)

    # Nothing to select or reassign when no source cluster is given
    if modification['from_cluster'] is None:
        continue

    # Apply modification: stage the matching records, report the summary, then commit
    staged_summary = editor.select(
        selections=selections,
        from_cluster=modification['from_cluster'],
        to_cluster=modification['to_cluster'],
    )
    print(staged_summary)
    editor.reassign()

--------------- Modification 0 --------------------
Filter has been applied to this modification
{'total': 57, 'staged_for_reassignment': 26}
--------------- Modification 1 --------------------
No filter has been applied to this modification...
Empty dictionary
{'total': 13, 'staged_for_reassignment': 13}
--------------- Modification 2 --------------------
Creating new cluster...
Filter has been applied to this modification
{'total': 64, 'staged_for_reassignment': 37}
--------------- Modification 3 --------------------
No filter has been applied to this modification...
Empty dictionary
{'total': 11, 'staged_for_reassignment': 11}


In [9]:
# Cluster shares after the modification loop has run (the output lists [cluster, fraction] pairs).
clustering.cluster_counts(percentage=True)

[[0, 0.03],
 [1, 0.04],
 [2, 0.0],
 [3, 0.0],
 [4, 0.0],
 [5, 0.011],
 [6, 0.019],
 [7, 0.086],
 [8, 0.065],
 [9, 0.06],
 [10, 0.0],
 [11, 0.027],
 [12, 0.062],
 [13, 0.009],
 [14, 0.059],
 [15, 0.009],
 [16, 0.009],
 [17, 0.073],
 [18, 0.008],
 [19, 0.0],
 [20, 0.087],
 [21, 0.093],
 [22, 0.0],
 [23, 0.0],
 [24, 0.0],
 [25, 0.044],
 [26, 0.018],
 [27, 0.023],
 [28, 0.0],
 [29, 0.0],
 [30, 0.031],
 [31, 0.0],
 [32, 0.0],
 [33, 0.0],
 [34, 0.0],
 [35, 0.0],
 [36, 0.0],
 [37, 0.085],
 [38, 0.002],
 [39, 0.013],
 [40, 0.037]]

In [10]:
# Inspect the name mapping after the modification loop; define_type_of_editing returns an
# updated mapping on every iteration, so renamed / newly created clusters show up here.
cluster_names_dict

{0: 'NOISE',
 1: 'Reward',
 4: 'Jolly Time',
 5: 'Renamed Cluster 5',
 6: 'Enhance',
 7: 'Perk Up',
 8: 'Daily Enabler',
 9: 'Easy Fuel',
 11: 'Hydrate',
 12: 'Nutrition Rich',
 13: 'Supplement',
 14: 'Enhance',
 15: 'Replenish',
 16: 'Revive',
 17: 'Thirst Quench',
 18: 'Cool Down',
 20: 'Relish',
 21: 'Enjoy',
 25: 'Comfort',
 26: 'Explore',
 27: 'Happy Boost',
 30: 'Functional Health',
 37: 'Sure & Easy',
 38: 'Sweet delight',
 39: 'Revive',
 40: 'Created new cluster'}