### Boilerplate

In [19]:
import tqdm, numpy as np
import matplotlib.pyplot as plt
import utils
import umap
import hdbscan
import matplotlib.pyplot as plt
import logging
from bson.objectid import ObjectId
import gc
import tasks
from sklearn.preprocessing import StandardScaler
import numba
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, cosine_distances
import umap.plot
from sklearn.cluster import KMeans
import time

# for jupyter notebook widgets
import ipywidgets as widgets
from ipywidgets import HBox, VBox
from IPython.display import display
from ipywidgets import interact, interactive

In [2]:
# connect to database
db = utils.connect()
db

Database(MongoClient(host=['20.220.215.35:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', authmechanism='SCRAM-SHA-256', connecttimeoutms=50000, serverselectiontimeoutms=50000, directconnection=True, replicaset='rs0'), 'aita')

### Setup Human Clusters

In [3]:
# hardcoded group id strings
group_id_strings = ['63901a89e189962b660959cf', '63901a92931eeac91c9924a1', '63901a96e189962b660959d3']

# convert to objectId's
group_ids = [ObjectId(str(id)) for id in group_id_strings]

# retrieve groups from database
groups = list(db.groups.find({"_id":{"$in" : group_ids}}))
print("Retrieved " + str(len(groups)) + " groups from database")

Retrieved 3 groups from database


In [4]:
groups[0]['history'][0]['included_documents']

['hetv62', 'lwd55z', 'dhbdpv', 'eyj0sv']

In [5]:
# projection here to only include the fields we want
projection = {'id': 1, 'textVector': 1}
projection

{'id': 1, 'textVector': 1}

### Create Training Set

#### Save & Create Training Data

##### Using First Group's Teleoscope Ordering

Change Raw cells below to Code if you need to reload document ids / vectors

##### Using All Docs

In [6]:
def cacheClusteringData(db):
    """
    Check to see if distance matrix and list of document ids is cached in ~/embeddings
    
    input:
        db: mongoDB connection
    output:
        dm: distance matrix
        ids: list of document ids
    """
    from pathlib import Path
    npzpath = Path('/clustering.npz').expanduser()
    
    if npzpath.exists():
        print("Documents have been cached, retrieving now.")
        loaded = np.load(npzpath.as_posix(), allow_pickle=False)
        dm = loaded['dist_matrix']
        ids = loaded['doc_ids'].tolist()
    
    else:
        print("Documents are not cached, building cache now.")
        # db = utils.connect()
        allDocuments = utils.getAllDocuments(db, projection={'id':1, 'textVector':1, '_id':0}, batching=True, batchSize=10000)
        ids = [x['id'] for x in allDocuments]
        print(f'There are {len(ids)} ids in documents.')

        vecs = np.array([x['textVector'] for x in allDocuments])
        dm = euclidean_distances(vecs)
        print(f'The distance matrix has shape: {dm.shape}')

        np.savez(npzpath.as_posix(), dist_matrix=dm, doc_ids=ids)
    
    return dm, ids

##### Using average teleoscope

In [7]:
loaded = np.load('all_order_docs.npz')
document_ids = loaded['doc_ids'].tolist()
document_vectors = loaded['doc_vecs']
len(document_ids)

347807

In [8]:
document_vectors.shape

(347807, 512)

In [9]:
teleo_vecs = []
for group in groups:

    teleoscope_oid = group["teleoscope"]
    teleoscope = db.teleoscopes.find_one({"_id": ObjectId(str(teleoscope_oid))})
    teleo_vecs.append(teleoscope["history"][0]["stateVector"])

teleo_vecs = np.array(teleo_vecs)

In [10]:
teleo_vecs.shape

(3, 512)

In [11]:
avg_vec = np.average(teleo_vecs, axis=0)
avg_vec.shape

(512,)

In [12]:
vecs = utils.calculateSimilarity(document_vectors, avg_vec)
ids = utils.rankDocumentsBySimilarity(document_ids, vecs)
ids[0]

('ia4w5v', 0.696844353778599)

#### Load Training Data

##### Using First Group's Teleoscope Ordering

In [6]:
loaded = np.load('teleo_order_docs.npz')
document_ids = loaded['doc_ids'].tolist()
document_vectors = loaded['doc_vecs']
len(document_ids)

10000

##### Using Entire Set

#### Append documents in human clusters

In [7]:
i = 0
group_doc_indices = {}
for group in groups:
    
    # grab latest history item for each group
    group_document_ids = group["history"][0]["included_documents"]
    
    indices = []
    
    for id in group_document_ids:
        
        try:
            document_ids.index(id)
        
        except:
            document = db.documents.find_one({"id": id}, projection=projection)
            document_ids.append(id)
            vector = np.array(document["textVector"]).reshape((1, 512))
            document_vectors = np.append(document_vectors, vector, axis=0)
            
        finally:
            indices.append(document_ids.index(id))
    
    group_doc_indices[group["history"][0]["label"]] = indices
    
    print(f'\nAdding group {i}')
    print("Document ids has the shape: ", len(document_ids))
    print("Document vectors has the shape: ", document_vectors.shape)

            

    i += 1

print(group_doc_indices)


Adding group 0
Document ids has the shape:  10000
Document vectors has the shape:  (10000, 512)

Adding group 1
Document ids has the shape:  10005
Document vectors has the shape:  (10005, 512)

Adding group 2
Document ids has the shape:  10010
Document vectors has the shape:  (10010, 512)
{'wifi': [5630, 7789, 2801, 3965], 'password': [10000, 10001, 10002, 10003, 6135, 9393, 10004], 'security': [10005, 10006, 10007, 10008, 10009]}


### Clustering

#### UMAP

##### Create Distance Matrix

In [8]:
# using sklean euclidean distances
dm = euclidean_distances(document_vectors)
dm.shape

(10010, 10010)

##### Map 0 Distances

In [9]:
group_doc_indices

{'wifi': [5630, 7789, 2801, 3965],
 'password': [10000, 10001, 10002, 10003, 6135, 9393, 10004],
 'security': [10005, 10006, 10007, 10008, 10009]}

In [10]:
for group in group_doc_indices:
    
    indices = group_doc_indices[group]
    size = range(len(indices))

    for _i in size:
        i = indices[_i]

        for _j in size:
            j = indices[_j]
            dm[i, j] = 0 

In [11]:
# sanity check to make sure two docs in the same human cluster are distance 0
i = group_doc_indices['password'][0]
j = group_doc_indices['password'][3]
dm[i,j] == 0

True

##### Reduction

In [12]:
umap_embeddings = umap.UMAP(
    verbose = True,         # for logging
    metric = "precomputed", # use distance matrix
    n_components = 30,      # reduce to n_components dimensions (2:100)
    # n_neighbors = 10,     # local (small n ~2) vs. global (large n ~100) structure 
    min_dist = 0.0,         # minimum distance apart that points are allowed (0.0:0.99)
).fit_transform(dm)

  warn("using precomputed metric; inverse_transform will be unavailable")


UMAP(metric='precomputed', min_dist=0.0, n_components=30, verbose=True)
Tue Feb 28 11:11:50 2023 Construct fuzzy simplicial set
Tue Feb 28 11:11:50 2023 Finding Nearest Neighbors
Tue Feb 28 11:11:52 2023 Finished Nearest Neighbor Search
Tue Feb 28 11:11:55 2023 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Tue Feb 28 11:11:59 2023 Finished embedding


In [13]:
umap_embeddings.shape

(10010, 30)

#### HDBSCAN

In [14]:
hdbscan_labels = hdbscan.HDBSCAN(
    min_cluster_size = 10,              # n-neighbors needed to be considered a cluster (0:50 df=5)
    # min_samples = 5,                  # how conservative clustering will be, larger is more conservative (more outliers) (df=None)
    cluster_selection_epsilon = 0.2,    # have large clusters in dense regions while leaving smaller clusters small
                                        # merge clusters if inter cluster distance is less than thres (df=0)
).fit_predict(umap_embeddings)

print(f'Num Clusters = {max(hdbscan_labels)+1} + outliers')

Num Clusters = 44 + outliers


In [15]:
# examine matchings between human labelled clusters and machine labelled clusters
for group in group_doc_indices:
    print(f'\nLabels for group = {group}\n')
    for index in group_doc_indices[group]:
        print(document_ids[index], hdbscan_labels[index])


Labels for group = wifi

hetv62 40
lwd55z 40
dhbdpv 40
eyj0sv 40

Labels for group = password

g3y7dc 40
j8nzf5 40
fs0vuw 40
q9zlgr 40
ia4w5v 40
ruuxs1 40
hw16a9 40

Labels for group = security

mnqbp9 40
spk73c 40
qqwzth 40
dfon3v 40
bqafew 40


### Results
Are human clusters maintained?

In [16]:
given_labels = {}

for group in group_doc_indices:
    
    labels = hdbscan_labels[group_doc_indices[group]] 
    correct_label = max(labels)
    
    if -1 in labels:
        for i in range(len(labels)):
            if labels[i] == -1:
                index = group_doc_indices[group][i]
                hdbscan_labels[index] = correct_label
    
    given_labels[group] = correct_label
               
given_labels

{'wifi': 40, 'password': 40, 'security': 40}

In [18]:
leo = ObjectId('63868b5fb3cde877de34c27d') 
paul = ObjectId('637ee569d1259b1565f7e97e') 
userid = leo
if db.clusters.count_documents({"history.user": userid}, limit=1):
    print(f'Clusters for user exists. Delete all.')
    db.clusters.delete_many({"history.user": userid})
print(f'No clusters for user. Ready to populate.')

Clusters for user exists. Delete all.
No clusters for user. Ready to populate.


In [17]:
def get_label(hdbscan_label, given_labels):
    """
    if -1:              label = 'outliers'   color = #700c1d
    if human cluster:   label = human label  color = human color? or #15540d
    if machine cluster: label = topic guess  color = #737373

    """
    check = more = False
    
    if hdbscan_label == -1:
        return 'outliers', '#700c1d'

    for _name in given_labels:

        label = given_labels[_name]
        
        if (hdbscan_label == label):
            if more:
                name += " & " + _name
            else:
                name = _name
                more = check = True
    
    if check:
        return name, '#15540d'

    return 'machine', '#737373'

In [176]:
import spacy

nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

In [218]:
def get_topic(label_ids):
    
    docs = [] 
    
    label_ids = label_ids.tolist()
    cursor = db.documents.find({"id":{"$in": label_ids}})

    for document in tqdm.tqdm(cursor):
        docs.append(document["text"])
        
    docs_pp = [preprocess(text) for text in nlp.pipe(docs)]

    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(docs_pp)

    from sklearn.decomposition import LatentDirichletAllocation

    lda = LatentDirichletAllocation(
        n_components=1, learning_method="batch", max_iter=10
    )
    
    document_topics = lda.fit_transform(X)
    sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
    feature_names = np.array(vec.get_feature_names_out())
    
    return feature_names[sorting[0][0]] + " " + feature_names[sorting[0][1]]

In [219]:
# code by Dr. Varada Kolhatkar adapted from cpsc330
def preprocess(
    doc,
    min_token_len=2,
    irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP", "SPACE"],
):
    """
    Given text, min_token_len, and irrelevant_pos carry out preprocessing of the text
    and return a preprocessed string.

    Parameters
    -------------
    doc : (spaCy doc object)
        the spacy doc object of the text
    min_token_len : (int)
        min_token_length required
    irrelevant_pos : (list)
        a list of irrelevant pos tags

    Returns
    -------------
    (str) the preprocessed text
    """

    clean_text = []

    for token in doc:
        if (
            token.is_stop == False  # Check if it's not a stopword
            and len(token) > min_token_len  # Check if the word meets minimum threshold
            and token.pos_ not in irrelevant_pos
        ):  # Check if the POS is in the acceptable POS tags
            lemma = token.lemma_  # Take the lemma of the word
            clean_text.append(lemma.lower())
    return " ".join(clean_text)

In [220]:
clusters = {}

for hdbscan_label in set(hdbscan_labels):
        
        # array of indices of documents with current hdbscan label
        document_indices_array = np.where(hdbscan_labels == hdbscan_label)[0]
        
        # all document_ids as array
        ids = np.array(document_ids)
        
        # array of ids of documents with current hdbscan label 
        label_ids = ids[document_indices_array]

        # create list of document ids that are in current hdbscan label
        documents = label_ids.tolist()
        
        # create appropriate label for current hdbscan label
        _label, _color = get_label(hdbscan_label, given_labels)
        
        # learn a topic label for machine clusters
        if _label == 'machine':
            limit = min(20, len(label_ids))
            _label = get_topic(label_ids[:limit])
        
        # add label and respective document ids to clusters dictionary
        clusters[_label] = documents

20it [00:00, 61.19it/s]
20it [00:00, 75.82it/s]
12it [00:00, 73.32it/s]
20it [00:00, 81.63it/s]
20it [00:00, 77.68it/s]
20it [00:00, 77.48it/s]
19it [00:00, 78.40it/s]
14it [00:00, 89.19it/s]
20it [00:00, 82.12it/s]
20it [00:00, 65.54it/s]
20it [00:00, 74.39it/s]
20it [00:00, 81.64it/s]
18it [00:00, 73.26it/s]
17it [00:00, 70.39it/s]
20it [00:00, 78.01it/s]
17it [00:00, 70.71it/s]
16it [00:00, 69.70it/s]
20it [00:00, 57.49it/s]
20it [00:00, 51.90it/s]
20it [00:00, 73.17it/s]
12it [00:00, 73.06it/s]
14it [00:00, 81.32it/s]
20it [00:00, 81.43it/s]
14it [00:00, 75.98it/s]
20it [00:00, 77.89it/s]
20it [00:00, 85.21it/s]
13it [00:00, 58.49it/s]
20it [00:00, 78.64it/s]
20it [00:00, 96.75it/s]
13it [00:00, 78.86it/s]
20it [00:00, 87.25it/s]
20it [00:00, 98.50it/s]
20it [00:00, 108.96it/s]
20it [00:00, 79.00it/s]
20it [00:00, 104.61it/s]
20it [00:00, 38.81it/s]
20it [00:00, 118.08it/s]
18it [00:00, 103.12it/s]
20it [00:00, 114.14it/s]
10it [00:00, 106.17it/s]
20it [00:00, 104.36it/s]
20it [00:

In [221]:
print(clusters.keys())

dict_keys(['car parking', 'dog time', 'gym workout', 'smell smoke', 'smell room', 'work coworker', 'speed car', 'bus stop', 'drive work', 'adhd tell', 'cat roommate', 'door lock', 'work day', 'stream game', 'room roommate', 'watch want', 'brother tell', 'roommate room', 'food roommate', 'roommate dish', 'game play', 'friend tell', 'play game', 'play ball', 'class work', 'friend play', 'pay tell', 'time food', 'computer laptop', 'work time', 'tell brother', 'room brother', 'job time', 'roommate friend', 'music time', 'work music', 'room music', 'friend roommate', 'sleep night', 'room noise', 'sleep room', 'wifi & password & security', 'outliers'])


In [222]:
len(clusters)

43

### Visualize

In [None]:
twod_umap = umap.UMAP(
    verbose = True,         # for logging
    metric = "precomputed", # use distance matrix
    n_components = 2,      # reduce to n_components dimensions (2:100)
    # n_neighbors = 10,     # local (small n ~2) vs. global (large n ~100) structure 
    min_dist = 0.0,         # minimum distance apart that points are allowed (0.0:0.99)
).fit_transform(dm)

In [None]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,            # num of neighbouring points needed to be considered a cluster
    min_samples=None,               # how conservative clustering will be. larger is more conservative.
    cluster_selection_epsilon=0.2,   # what it means for points to be “close”
).fit(twod_umap)

# hdbscan_labels = hdbscan.HDBSCAN(
#     min_cluster_size=10,            # num of neighbouring points needed to be considered a cluster
#     min_samples=None,               # how conservative clustering will be. larger is more conservative.
#     cluster_selection_epsilon=0.2,   # what it means for points to be “close”
# ).fit_predict(twod_umap)

print(f'Num Clusters = {max(hdbscan_labels)+1} + outliers')

In [None]:
clusterer.single_linkage_tree_.plot()

In [None]:
umap.plot.points(mapper, labels=hdbscan_labels)