### Boilerplate

In [71]:
import tqdm, numpy as np
import matplotlib.pyplot as plt
import utils
import umap
import hdbscan
import matplotlib.pyplot as plt
import logging
from bson.objectid import ObjectId
import gc
import tasks
from sklearn.preprocessing import StandardScaler
import numba
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, cosine_distances
import umap.plot
from sklearn.cluster import KMeans
import time
import gridfs
from pathlib import Path
import pickle


# for jupyter notebook widgets
import ipywidgets as widgets
from ipywidgets import HBox, VBox
from IPython.display import display
from ipywidgets import interact, interactive

In [2]:
# connect to database
db = utils.connect()
db

Database(MongoClient(host=['20.220.215.35:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', authmechanism='SCRAM-SHA-256', connecttimeoutms=50000, serverselectiontimeoutms=50000, directconnection=True, replicaset='rs0'), 'aita')

### Setup Human Clusters

In [3]:
# hardcoded group id strings
group_id_strings = ['63901a89e189962b660959cf', '63901a92931eeac91c9924a1', '63901a96e189962b660959d3']

# convert to objectId's
group_ids = [ObjectId(str(id)) for id in group_id_strings]

# retrieve groups from database
groups = list(db.groups.find({"_id":{"$in" : group_ids}}))
print("Retrieved " + str(len(groups)) + " groups from database")

Retrieved 3 groups from database


In [4]:
groups[0]['history'][0]['included_documents']

['hetv62', 'lwd55z', 'dhbdpv', 'eyj0sv']

In [5]:
# projection here to only include the fields we want
projection = {'id': 1, 'textVector': 1}
projection

{'id': 1, 'textVector': 1}

### Create Training Set

#### Save & Create Training Data

##### Using First Group's Teleoscope Ordering

Change Raw cells below to Code if you need to reload document ids / vectors

##### Using All Docs

##### Using average teleoscope

In [78]:
# this would actually be from ~/embeddings as above
loaded = np.load('all_order_docs.npz')
all_doc_ids = loaded['doc_ids'].tolist()
all_doc_vecs = loaded['doc_vecs']
len(all_doc_ids)

10000

In [7]:
limit = min(10000, len(all_doc_ids))

In [11]:
teleo_vecs = []
for group in groups:

    teleoscope_oid = group["teleoscope"]
    teleoscope = db.teleoscopes.find_one({"_id": ObjectId(str(teleoscope_oid))})
    teleo_vecs.append(teleoscope["history"][0]["stateVector"])

teleo_vecs = np.array(teleo_vecs)

In [12]:
teleo_vecs.shape

(3, 512)

In [13]:
avg_vec = np.average(teleo_vecs, axis=0)
avg_vec.shape

(512,)

In [14]:
scores = utils.calculateSimilarity(all_doc_vecs, avg_vec)
scores[:5]

array([0.23058389, 0.31979471, 0.30451099, 0.30814943, 0.29490711])

In [80]:
ids = utils.rankDocumentsBySimilarity(all_doc_ids, scores)[:limit]
document_ids = [i for i, j in ids]
document_ids[:5]

['ia4w5v', 'flo65r', 'sgt76q', 'fzuf7q', 'bqafew']

In [82]:
indices = [all_doc_ids.index(i) for i in document_ids]
indices[:5]

[210153, 156688, 330158, 162159, 43056]

In [83]:
document_vectors = np.array([all_doc_vecs[i] for i in indices])
document_vectors.shape

(10000, 512)

#### Load Training Data

##### Using First Group's Teleoscope Ordering

##### Using Entire Set

#### Append documents in human clusters

In [84]:
i = 0
group_doc_indices = {}
for group in groups:
    
    # grab latest history item for each group
    group_document_ids = group["history"][0]["included_documents"]
    
    indices = []
    
    for id in group_document_ids:
        
        try:
            document_ids.index(id)
        
        except:
            document = db.documents.find_one({"id": id}, projection=projection)
            document_ids.append(id)
            vector = np.array(document["textVector"]).reshape((1, 512))
            document_vectors = np.append(document_vectors, vector, axis=0)
            
        finally:
            indices.append(document_ids.index(id))
    
    group_doc_indices[group["history"][0]["label"]] = indices
    
    print(f'\nAdding group {i}')
    print("Document ids has the shape: ", len(document_ids))
    print("Document vectors has the shape: ", document_vectors.shape)

            

    i += 1

print(group_doc_indices)


Adding group 0
Document ids has the shape:  10000
Document vectors has the shape:  (10000, 512)

Adding group 1
Document ids has the shape:  10001
Document vectors has the shape:  (10001, 512)

Adding group 2
Document ids has the shape:  10002
Document vectors has the shape:  (10002, 512)
{'wifi': [7, 554, 243, 28], 'password': [161, 311, 10000, 474, 0, 73, 89], 'security': [932, 51, 578, 10001, 4]}


### Clustering

#### UMAP

##### Create Distance Matrix

In [85]:
# using sklean euclidean distances
dm = euclidean_distances(document_vectors)
dm.shape

(10002, 10002)

##### Map 0 Distances

In [86]:
group_doc_indices

{'wifi': [7, 554, 243, 28],
 'password': [161, 311, 10000, 474, 0, 73, 89],
 'security': [932, 51, 578, 10001, 4]}

In [87]:
for group in group_doc_indices:
    
    indices = group_doc_indices[group]
    size = range(len(indices))

    for _i in size:
        i = indices[_i]

        for _j in size:
            j = indices[_j]
            dm[i, j] = 0 

In [88]:
# sanity check to make sure two docs in the same human cluster are distance 0
i = group_doc_indices['password'][0]
j = group_doc_indices['password'][3]
dm[i,j] == 0

True

##### Reduction

In [94]:
umap_embeddings = umap.UMAP(
    verbose = True,         # for logging
    metric = "precomputed", # use distance matrix
    n_components = 30,      # reduce to n_components dimensions (2:100)
    # n_neighbors = 10,     # local (small n ~2) vs. global (large n ~100) structure 
    min_dist = 0.0,         # minimum distance apart that points are allowed (0.0:0.99)
).fit_transform(dm)

  warn("using precomputed metric; inverse_transform will be unavailable")


UMAP(metric='precomputed', min_dist=0.0, n_components=30, verbose=True)
Tue Feb 28 23:23:49 2023 Construct fuzzy simplicial set
Tue Feb 28 23:23:49 2023 Finding Nearest Neighbors
Tue Feb 28 23:23:50 2023 Finished Nearest Neighbor Search
Tue Feb 28 23:23:50 2023 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Tue Feb 28 23:23:54 2023 Finished embedding


In [95]:
umap_embeddings.shape

(10002, 30)

#### HDBSCAN

In [96]:
hdbscan_labels = hdbscan.HDBSCAN(
    min_cluster_size = 10,              # n-neighbors needed to be considered a cluster (0:50 df=5)
    # min_samples = 5,                  # how conservative clustering will be, larger is more conservative (more outliers) (df=None)
    cluster_selection_epsilon = 0.2,    # have large clusters in dense regions while leaving smaller clusters small
                                        # merge clusters if inter cluster distance is less than thres (df=0)
).fit_predict(umap_embeddings)

print(f'Num Clusters = {max(hdbscan_labels)+1} + outliers')

Num Clusters = 52 + outliers


In [97]:
# examine matchings between human labelled clusters and machine labelled clusters
for group in group_doc_indices:
    print(f'\nLabels for group = {group}\n')
    for index in group_doc_indices[group]:
        print(document_ids[index], hdbscan_labels[index])


Labels for group = wifi

hetv62 38
lwd55z 38
dhbdpv 38
eyj0sv 38

Labels for group = password

g3y7dc 36
j8nzf5 36
fs0vuw 36
q9zlgr 36
ia4w5v 36
ruuxs1 36
hw16a9 36

Labels for group = security

mnqbp9 -1
spk73c -1
qqwzth -1
dfon3v -1
bqafew -1


### Results
Are human clusters maintained?

In [98]:
given_labels = {}

for group in group_doc_indices:
    
    labels = hdbscan_labels[group_doc_indices[group]] 
    correct_label = max(labels)
    
    if -1 in labels:
        for i in range(len(labels)):
            if labels[i] == -1:
                index = group_doc_indices[group][i]
                hdbscan_labels[index] = correct_label
    
    given_labels[group] = correct_label
               
given_labels

{'wifi': 38, 'password': 36, 'security': -1}

In [79]:
leo = ObjectId('63868b5fb3cde877de34c27d') 
userid = leo

cursor = db.clusters.find(
    { "history.user" : userid},
    projection = {'_id': 1, 'teleoscope': 1},
)

In [87]:
cluster_id = []
cluster_teleo_id = []

for cluster in tqdm.tqdm(cursor):
    cluster_id.append(cluster["_id"])
    cluster_teleo_id.append(cluster["teleoscope"])

45it [00:00, 184320.00it/s]


In [89]:
teleo_oid = ObjectId('63fe59c12c22e5451a66548b') # telescope 411
teleo = db.teleoscopes.find_one({"_id": teleo_oid})
teleo_file = teleo["history"][0]['ranked_document_ids']
teleo_file

ObjectId('63fe59c54437e8bdaf6655b6')

In [105]:
orderings = 'AVG'

match orderings:
    case "AVG":
        print('avg')
    case "FIRST":
        print('first')
    case "ALL":
        print('no')

avg


In [96]:
def clean_mongodb(db, userid):
    """
    Check to see if user has already built clusters.
    If so, need to delete clusters and associated teleoscope items

    Parameters
    -------------
    db : 
        mongoDB connection
    userid:
        represents ObjectId as str
    """
    namespace = "teleoscopes" # teleoscopes.chunks, teleoscopes.files
    fs = gridfs.GridFS(db, namespace)

    if db.clusters.count_documents(
        { "history.user": ObjectId(str(userid))}, 
        limit=1,
    ):
        
        logging.info(f'Clusters for user exists. Delete all.')

        cursor = db.clusters.find(
            { "history.user" : ObjectId(str(userid))},
            projection = {'_id': 1, 'teleoscope': 1},
        )    

        for cluster in tqdm.tqdm(cursor):

            # cluster teleoscope
            teleo_oid = cluster["teleoscope"]
            teleo = db.teleoscopes.find_one({"_id": teleo_oid})

            # associated teleoscope.files
            teleo_file = teleo["history"][0]['ranked_document_ids']

            # delete telescopes.chuncks and teleoscopes.files
            fs.delete(teleo_file)

            # delete teleoscope 
            db.teleoscopes.delete_one({"_id": teleo_oid})

            # delete cluster
            db.clusters.delete_one({"_id": cluster["_id"]})
    
    
    logging.info(f'No clusters for user. Ready to populate.')
    pass

In [97]:
leo = '63868b5fb3cde877de34c27d'
userid = leo
clean_mongodb(db, userid)

In [100]:
def get_label(hdbscan_label, given_labels):
    """
    if -1:              label = 'outliers'   color = #700c1d
    if human cluster:   label = human label  color = human color? or #15540d
    if machine cluster: label = topic guess  color = #737373

    """
    check = more = False
    
    if hdbscan_label == -1:
        return 'outliers', '#700c1d'

    for _name in given_labels:

        label = given_labels[_name]
        
        if (hdbscan_label == label):
            if more:
                name += " & " + _name
            else:
                name = _name
                more = check = True
    
    if check:
        return name, '#15540d'

    return 'machine', '#737373'

In [101]:
import spacy

nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

In [102]:
def get_topic(label_ids):
    
    docs = [] 
    
    label_ids = label_ids.tolist()
    cursor = db.documents.find({"id":{"$in": label_ids}})

    for document in tqdm.tqdm(cursor):
        docs.append(document["text"])
        
    docs_pp = [preprocess(text) for text in nlp.pipe(docs)]

    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(docs_pp)

    from sklearn.decomposition import LatentDirichletAllocation

    lda = LatentDirichletAllocation(
        n_components=1, learning_method="batch", max_iter=10
    )
    
    document_topics = lda.fit_transform(X)
    sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
    feature_names = np.array(vec.get_feature_names_out())
    
    return feature_names[sorting[0][0]] + " " + feature_names[sorting[0][1]]

In [103]:
# code by Dr. Varada Kolhatkar adapted from cpsc330
def preprocess(
    doc,
    min_token_len=2,
    irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP", "SPACE"],
):
    """
    Given text, min_token_len, and irrelevant_pos carry out preprocessing of the text
    and return a preprocessed string.

    Parameters
    -------------
    doc : (spaCy doc object)
        the spacy doc object of the text
    min_token_len : (int)
        min_token_length required
    irrelevant_pos : (list)
        a list of irrelevant pos tags

    Returns
    -------------
    (str) the preprocessed text
    """

    clean_text = []

    for token in doc:
        if (
            token.is_stop == False  # Check if it's not a stopword
            and len(token) > min_token_len  # Check if the word meets minimum threshold
            and token.pos_ not in irrelevant_pos
        ):  # Check if the POS is in the acceptable POS tags
            lemma = token.lemma_  # Take the lemma of the word
            clean_text.append(lemma.lower())
    return " ".join(clean_text)

In [104]:
clusters = {}

for hdbscan_label in set(hdbscan_labels):
        
        # array of indices of documents with current hdbscan label
        document_indices_array = np.where(hdbscan_labels == hdbscan_label)[0]
        
        # all document_ids as array
        ids = np.array(document_ids)
        
        # array of ids of documents with current hdbscan label 
        label_ids = ids[document_indices_array]

        # create list of document ids that are in current hdbscan label
        documents = label_ids.tolist()
        
        # create appropriate label for current hdbscan label
        _label, _color = get_label(hdbscan_label, given_labels)
        
        # learn a topic label for machine clusters
        if _label == 'machine':
            limit = min(20, len(label_ids))
            _label = get_topic(label_ids[:limit])
        
        # add label and respective document ids to clusters dictionary
        clusters[_label] = documents

0it [00:54, ?it/s]


KeyboardInterrupt: 

In [None]:
print(clusters.keys())

In [222]:
len(clusters)

43

### Visualize

In [None]:
twod_umap = umap.UMAP(
    verbose = True,         # for logging
    metric = "precomputed", # use distance matrix
    n_components = 2,      # reduce to n_components dimensions (2:100)
    # n_neighbors = 10,     # local (small n ~2) vs. global (large n ~100) structure 
    min_dist = 0.0,         # minimum distance apart that points are allowed (0.0:0.99)
).fit_transform(dm)

In [None]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,            # num of neighbouring points needed to be considered a cluster
    min_samples=None,               # how conservative clustering will be. larger is more conservative.
    cluster_selection_epsilon=0.2,   # what it means for points to be “close”
).fit(twod_umap)

# hdbscan_labels = hdbscan.HDBSCAN(
#     min_cluster_size=10,            # num of neighbouring points needed to be considered a cluster
#     min_samples=None,               # how conservative clustering will be. larger is more conservative.
#     cluster_selection_epsilon=0.2,   # what it means for points to be “close”
# ).fit_predict(twod_umap)

print(f'Num Clusters = {max(hdbscan_labels)+1} + outliers')

In [None]:
clusterer.single_linkage_tree_.plot()

In [None]:
umap.plot.points(mapper, labels=hdbscan_labels)