In [2]:
import numpy as np
import random
import json
from os import path as osp
import os
import yaml

In [3]:
import sys

# Make the parent of the notebook's working directory importable so that the
# project-local `src` package resolves.
root = osp.abspath(osp.join(os.getcwd(), os.pardir))
# Guard the append: re-running this cell must not pile up duplicate entries
# in sys.path.
if root not in sys.path:
    sys.path.append(root)

# NOTE(review): the star import is presumably where get_data, prep_data,
# get_cluster, HierCluster and hyper_opt (used in later cells) come from;
# consider importing them explicitly once that is confirmed.
from src.hierarchicalcluster import *
from src.hitl import HITL

# Definitions

1. config parameters:
    - max_K - maximum number of events per sequence
    - remove_ids - list of event ids to ignore
    - event_weights - list of per-event weights; uniform by default
--------------
2. HITL interface: receives commands from the user, modifies the cfg and the clustering result, and saves the new cfg and result on exit.
    - result correction
      - merge two clusters
      - move sample from one cluster to another
      - add new cluster
    - cfg param setting
      - add remove_id
      - remove id from remove_id list
      - set max_K value
      - ~~set event weights~~
    - others
      - help
      - quit
      - check clusters, samples, print all
  
--------------
3. Overall workflow:
   1. training:
      1. load unlabeled data and use the default settings to learn the model parameters and produce an initial result;
      2. interact with the user to correct the result (i.e. provide labels) and modify the settings;
      3. update the hyper-parameters to fit the labels;
      4. repeat steps 2-3 for further correction.
   2. prediction: receive unlabeled data and predict its cluster id.

In [5]:
# Load the run configuration from `setting.yml` in the working directory.
# The default content of that file (it can be regenerated with
# yaml.dump(cfg, fp, default_flow_style=False)) is:
#     max_K: 10000
#     remove_ids: []
#     event_weights: []
path_setting = osp.join(os.getcwd(), 'setting.yml')
with open(path_setting) as cfg_file:
    cfg = yaml.safe_load(cfg_file)
cfg

{'event_weights': [], 'max_K': 10000, 'remove_ids': []}

In [6]:
# Read the unlabeled samples and run them through prep_data using the
# remove_ids / max_K values from the current config.
data_ul_path = osp.join(os.getcwd(), 'data_ul.json')
xs, ys = get_data(path=data_ul_path)
rev_xs = prep_data(xs, cfg['remove_ids'], cfg['max_K'])
clusters_true = get_cluster(ys)

# Clusters derived from the labels, followed by the raw label list.
print(f"True clusters: {clusters_true}", ys, sep="\n")

# Show sample 1 before and after preprocessing for a quick visual check.
print(f"Original data: {xs[1]}")
print(f"Processed data: {rev_xs[1]}")

True clusters: [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Original data: ['9', '10', '4', '11', '5', '11', '6', '7', '11', '11']
Processed data: ['11', '11', '7', '6', '11', '5', '11', '4', '10', '9']


In [7]:
# Hand-picked parameters for the first, untuned clustering pass.
c, o = 1, 1
d_threshold = 0.3

hc = HierCluster(c=c, o=o, distance_threshold=d_threshold)
clusters_pred = hc.run(rev_xs)

print(clusters_pred)

[[0, 2, 3], [1, 7, 4], [5, 6], [8, 9]]


# Use the interactive CLI to provide feedback to the model

Use the following commands to obtain the true labeled data:
- moves 1 1 0
- y
- moves 7 1 3
- n
- moves 4 1 2
- y
- addrs 11
- q

In [8]:
# Output locations for the HITL session: the updated config and the corrected
# (labeled) clustering result.
save_cfg_path = osp.join(os.getcwd(), 'setting_new.yml')
# Fix: the original chained assignment (`save_res_path = path_setting = ...`)
# also rebound `path_setting` -- the location of setting.yml -- to the result
# file, so re-running the config-loading cell would have read data_l.json.
save_res_path = osp.join(os.getcwd(), 'data_l.json')
dm = HITL(cfg, clusters_pred, xs, save_cfg_path=save_cfg_path, save_res_path=save_res_path)
# Starts the interactive command loop; requires user input.
dm.run()

******************** Human Inspection on HC_result Started ********************
New clusters: [[0, 2, 3], [1, 7, 4], [5, 6], [8, 9], []]

******************** Human Inspection on HC_result Ended ********************


# Update the model prediction with the labels and the new cfg via hyper-parameter optimization

In [8]:
# Reload the configuration from `setting_new.yml` (the file the first HITL
# session was asked to write) and display it.
cfg_path = osp.join(os.getcwd(), 'setting_new.yml')
with open(cfg_path) as new_cfg_file:
    cfg = yaml.safe_load(new_cfg_file)
cfg

{'event_weights': [], 'max_K': 10000, 'remove_ids': ['11']}

In [9]:
# Read the labeled samples and preprocess them with the updated config.
# NOTE(review): this reads data_true.json, whereas the HITL session above was
# pointed at data_l.json for its result -- confirm which file is intended.
labeled_path = osp.join(os.getcwd(), 'data_true.json')
xs, ys = get_data(path=labeled_path)
rev_xs = prep_data(xs, cfg['remove_ids'], cfg['max_K'])
clusters_true = get_cluster(ys)

# Clusters derived from the labels, then the raw label list.
print(clusters_true, ys, sep="\n")

# Sample 1 before and after preprocessing.
print(xs[1], rev_xs[1], sep="\n")

[[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
[0, 0, 0, 0, 1, 1, 1, 2, 2, 2]
['9', '10', '4', '11', '5', '11', '6', '7', '11', '11']
['7', '6', '5', '4', '10', '9']


In [10]:
# Same c / o as the first pass, but with a tighter distance threshold, applied
# to the data preprocessed under the updated config.
c, o = 1, 1
d_threshold = 0.1

hc = HierCluster(c=c, o=o, distance_threshold=d_threshold)
clusters_pred = hc.run(rev_xs)

print(clusters_pred)

[[0, 2, 3, 1], [4, 6, 5], [7], [8], [9]]


In [11]:
# Run hyper-parameter optimization against the labeled clusters, then show the
# selected parameters followed by the clustering they produce.
best_param, best_pred = hyper_opt(rev_xs, clusters_true)
print(best_param, best_pred, sep="\n")

{'c': 0.9, 'o': 0.9, 'd': 0.7000000000000001}
[[0, 2, 3, 1], [4, 6, 5], [7, 9, 8]]


In [12]:
# Inspect the tuned clustering interactively. Both save paths are empty
# strings here -- presumably so that nothing is written on exit; confirm
# against the HITL implementation.
dm = HITL(cfg, best_pred, xs, save_res_path="", save_cfg_path="")
dm.run()

******************** Human Inspection on HC_result Started ********************
In cluster id 1:
The sample 4's content: ['1', '2', '3', '4', '1', '2', '11', '11', '3', '11']
The sample 6's content: ['2', '3', '4', '1', '2', '3']
The sample 5's content: ['1', '2', '3', '4', '2', '3']
******************** Human Inspection on HC_result Ended ********************
