### Boilerplate

In [126]:
import tqdm, numpy as np
import matplotlib.pyplot as plt
import utils
import umap
import hdbscan
import matplotlib.pyplot as plt
import logging
from bson.objectid import ObjectId
import gc
import tasks
from sklearn.preprocessing import StandardScaler
import numba
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, cosine_distances

In [2]:
# for jupyter notebook widgets
import ipywidgets as widgets
from ipywidgets import HBox, VBox
from IPython.display import display
%matplotlib inline

In [3]:
# connect to database
db = utils.connect()
db

Database(MongoClient(host=['20.220.215.35:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', authmechanism='SCRAM-SHA-256', connecttimeoutms=50000, serverselectiontimeoutms=50000, directconnection=True, replicaset='rs0'), 'aita')

### Setup Human Clusters

In [4]:
# hardcoded group id strings
group_id_strings = ['63901a89e189962b660959cf', '63901a92931eeac91c9924a1', '63901a96e189962b660959d3']

# convert to objectId's
group_ids = [ObjectId(str(id)) for id in group_id_strings]

# retrieve groups from database
groups = list(db.groups.find({"_id":{"$in" : group_ids}}))
print("Retrieved " + str(len(groups)) + " groups from database")

Retrieved 3 groups from database


In [5]:
groups[0]['history'][0]['included_documents']

['hetv62', 'lwd55z', 'dhbdpv', 'eyj0sv']

In [6]:
# projection here to only include the fields we want
projection = {'id': 1, 'textVector': 1}
projection

{'id': 1, 'textVector': 1}

### Create Training Set

Options to define training set:
1. Use the first groups teleoscope ordering
2. Use all documents
3. Create a new teleo vec from all documents in human clusters

Embeddings
- save training set upon creation (check if created or not)
    - we need to save both ids and vectors for recall at the end

#### Save & Create Training Data

##### Using First Group's Teleoscope Ordering

Change Raw cells below to Code if you need to reload document ids / vectors

#### Load Training Data

##### Using First Group's Teleoscope Ordering

In [144]:
loaded = np.load('teleo_order_docs.npz')
document_ids = loaded['doc_ids'].tolist()
document_vectors = loaded['doc_vecs']

#### Append documents in human clusters

In [194]:
i = 0
for group in groups:
    
    # grab latest history item for each group
    group_document_ids = group["history"][0]["included_documents"]
    
    indices = []
    
    for id in group_document_ids:
        
        try:
            index = document_ids.index(id)
            indices.append(index)
        
        except:
            document = db.documents.find_one({"id": id}, projection=projection)
            document_ids.append(id)
            vector = np.array(document["textVector"]).reshape((1, 512))
            document_vectors = np.append(document_vectors, vector, axis=0)
            
            index = document_ids.index(id)
            indices.append(index)
    i += 1
    
    print(f'\nAdding group {i}')
    print("Document ids has the shape: ", len(document_ids))
    print("Document vectors has the shape: ", document_vectors.shape)


Adding group 1
Document ids has the shape:  10010
Document vectors has the shape:  (10010, 512)

Adding group 2
Document ids has the shape:  10010
Document vectors has the shape:  (10010, 512)

Adding group 3
Document ids has the shape:  10010
Document vectors has the shape:  (10010, 512)


### Clustering

#### UMAP

Notes
- DR by default hyperparameters
- low_memory uses less memory but longer compute time
- verbose just logs info

TODO
- define custom metric
- what args are passed to custom metric
- how do we check if args are a subset of group?
- can we use conditional or do we need matrix

ISSUES
- general 
    - how to check if i and j are in the same group
    - i and j are vectors
    - is it possible to pass i and j as indices instead of vectors?
- conditional metric
    - groups contain ids
    - maybe search for indices of both id and vec then compare? seems expensive
    - maybe translate vec to id (or visversa) then compare? seems expensive
    - add a new dimension that is the id of vector for easy lookup
- matrix metric
    - use metric='precomputed'
    - expensive as fuck to build matrix
    - how to reduce cost?

##### Custom Metric

##### Organize Document IDs & Indices in Human Clusters

In [147]:
# use dict or ndarray?
group_doc_indices = {}
group_doc_ids = {}

for group in range(len(groups)):
    
    curr_id = groups[group]['history'][0]['included_documents']
    
    indices = []
    ids = []
    
    for i in curr_id:
        indices.append(document_ids.index(i))
        ids.append(i)
    
    group_doc_indices[group] = indices
    group_doc_ids[group] = ids
    
print(group_doc_indices)
print(group_doc_ids)

{0: [5630, 7789, 2801, 3965], 1: [10000, 10001, 10002, 10003, 6135, 9393, 10004], 2: [10005, 10006, 10007, 10008, 10009]}
{0: ['hetv62', 'lwd55z', 'dhbdpv', 'eyj0sv'], 1: ['g3y7dc', 'j8nzf5', 'fs0vuw', 'q9zlgr', 'ia4w5v', 'ruuxs1', 'hw16a9'], 2: ['mnqbp9', 'spk73c', 'qqwzth', 'dfon3v', 'bqafew']}


##### Create Distance Matrix

In [165]:
# using sklean euclidean distances
dm = euclidean_distances(document_vectors)

##### Update Distances for Documents within the same human cluster


In [168]:
for group in range(len(groups)):
    
    docs = groups[group]['history'][0]['included_documents']

    for i in range(len(docs)):
        
        index_i = document_ids.index(docs[i])
        
        for j in range(len(docs)):
            
            if (i != j):
            
                index_j = document_ids.index(docs[j])
                dm[index_i, index_j] = 0 # 0 if euclidean
                dm[index_j, index_i] = 0

In [182]:
# sanity check to make sure two docs in the same human cluster are distance 0
i = group_doc_indices[0][0]
j = group_doc_indices[0][1]
dm[i,j] == 0

True

##### Reduction

In [170]:
fitter = umap.UMAP(metric='precomputed', verbose=True, low_memory=True, n_components=5).fit(dm)
embedding = fitter.embedding_
umap_embeddings = fitter.transform(dm)

  warn("using precomputed metric; inverse_transform will be unavailable")


UMAP(metric='precomputed', n_components=5, verbose=True)
Sat Jan 21 21:43:10 2023 Construct fuzzy simplicial set
Sat Jan 21 21:43:11 2023 Finding Nearest Neighbors
Sat Jan 21 21:43:11 2023 Finished Nearest Neighbor Search
Sat Jan 21 21:43:12 2023 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Sat Jan 21 21:43:15 2023 Finished embedding


In [180]:
reduced_i = embedding[5630]
reduced_j = embedding[7789]

cos = cosine_similarity([red_i],[red_j])[0][0]
euc = euclidean_distances([red_i],[red_j])[0][0]
print(f'Checking to see if similar distances are maintained after reduction\n')
print(f'Cosine Similarity: {cos}')
print(f'Euclidean Distance: {euc}')

Checking to see if similar distances are maintained after reduction

Cosine Similarity: 0.9999080300331116
Euclidean Distance: 0.1914035677909851


#### HDBSCAN

Notes
- Clustering by default hyperparameters
- Resultant labels are in the same ordering as data

TODO
- User parameterize hyperparams
- Use custom metric hyperparam here too?

In [173]:
clusterer = hdbscan.HDBSCAN()
hdbscan_labels = clusterer.fit_predict(umap_embeddings)

In [174]:
label_array = np.array(hdbscan_labels)
clusters = {}

# iterate over given labels
for hdbscan_label in set(hdbscan_labels):
        
        # find indices of documents for a current label
        document_indices_scalar = np.where(label_array == hdbscan_label)[0]
        document_indices = [int(i) for i in document_indices_scalar]
        
        # create list of document ids that are in current label
        documents = []
        
        for i in document_indices:
            documents.append(document_ids[i])
        
        # add label and respective document ids to clusters dictionary
        clusters[hdbscan_label] = documents

#### Results
Are human clusters maintained?

In [175]:
clusters.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, -1])

In [176]:
# examine matchings between human labelled clusters and machine labelled clusters
for group in group_doc_indices:
    print(f'\nLabels for group = {group}\n')
    for index in group_doc_indices[group]:
        print(document_ids[index], hdbscan_labels[index])


Labels for group = 0

hetv62 158
lwd55z 158
dhbdpv 158
eyj0sv 158

Labels for group = 1

g3y7dc 153
j8nzf5 153
fs0vuw 153
q9zlgr 153
ia4w5v 153
ruuxs1 153
hw16a9 153

Labels for group = 2

mnqbp9 150
spk73c 150
qqwzth 150
dfon3v 150
bqafew 150


### Toy Data Experiments

In [98]:
penguins = pd.read_csv("https://raw.githubusercontent.com/allisonhorst/palmerpenguins/c19a904462482430170bfe2c718775ddb7dbb885/inst/extdata/penguins.csv")
penguins = penguins.dropna()
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [99]:
# remove categorical features
penguin_data = penguins[
    [
        "bill_length_mm",
        "bill_depth_mm",
        "flipper_length_mm",
        "body_mass_g",
    ]
].values

# scaling
scaled_penguin_data = StandardScaler().fit_transform(penguin_data)

scaled_penguin_data

array([[-0.89604189,  0.7807321 , -1.42675157, -0.56847478],
       [-0.82278787,  0.11958397, -1.06947358, -0.50628618],
       [-0.67627982,  0.42472926, -0.42637319, -1.1903608 ],
       ...,
       [ 1.02687621,  0.52644436, -0.56928439, -0.53738048],
       [ 1.24663828,  0.93330475,  0.64546078, -0.13315457],
       [ 1.13675725,  0.7807321 , -0.2120064 , -0.53738048]])

In [100]:
group1 = scaled_penguin_data[0:2]
human_cluster

array([[-0.89604189,  0.7807321 , -1.42675157, -0.56847478],
       [-0.82278787,  0.11958397, -1.06947358, -0.50628618]])

In [25]:
type(scaled_penguin_data)
type(human_cluster)

numpy.ndarray

In [26]:
a = scaled_penguin_data[0]
b = scaled_penguin_data[1]
c = scaled_penguin_data[5]

type(a)

numpy.ndarray

In [None]:
# if the reduction works, the distance between index 0 and 1 should be 0 
a = embedding[0]
b = embedding[1]
np.dot(a,b)/(norm(a)*norm(b))