### Boilerplate

In [104]:
import tqdm, numpy as np
import matplotlib.pyplot as plt
import utils
import umap
import hdbscan
import matplotlib.pyplot as plt
import logging
from bson.objectid import ObjectId
import gc
import tasks
from sklearn.preprocessing import StandardScaler
import numba
from scipy.spatial import distance
import pandas as pd

In [105]:
# for jupyter notebook widgets
import ipywidgets as widgets
from ipywidgets import HBox, VBox
from IPython.display import display
%matplotlib inline

In [106]:
# connect to database
db = utils.connect()
db

Database(MongoClient(host=['20.220.215.35:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', authmechanism='SCRAM-SHA-256', connecttimeoutms=50000, serverselectiontimeoutms=50000, directconnection=True, replicaset='rs0'), 'aita')

### Setup Human Clusters

In [107]:
# hardcoded group id strings
group_id_strings = ['63901a89e189962b660959cf', '63901a92931eeac91c9924a1', '63901a96e189962b660959d3']

# convert to objectId's
group_ids = [ObjectId(str(id)) for id in group_id_strings]

# retrieve groups from database
groups = list(db.groups.find({"_id":{"$in" : group_ids}}))
print("Retrieved " + str(len(groups)) + " groups from database")

Retrieved 3 groups from database


In [112]:
groups[0]['history'][0]['included_documents']

['hetv62', 'lwd55z', 'dhbdpv', 'eyj0sv']

### Create Training Set

Preprocessing

Options to define training set:
1. Use the first groups teleoscope ordering
2. Use all documents
3. Create a new teleo vec from all documents in human clusters

Embeddings
- save training set upon creation (check if created or not)
    - we need to save both ids and vectors for recall at the end

#### Using First Group's Teleoscope Ordering

In [12]:
# default to ordering documents relative to first group's teleoscope
teleoscope_oid = groups[0]["teleoscope"]
teleoscope_oid

ObjectId('63901a89e189962b660959ce')

In [13]:
teleoscope = db.teleoscopes.find_one({"_id": ObjectId(str(teleoscope_oid))})
#teleoscope

In [14]:
# projection here to only include the fields we want
projection = {'id': 1, 'textVector': 1}
projection

{'id': 1, 'textVector': 1}

In [19]:
# saved as ordered_documents.npz
# change to code cell if load is needed

# get Teleoscope from GridFS
all_ordered_documents = utils.gridfsDownload(db, "teleoscopes", ObjectId(str(teleoscope["history"][0]["ranked_document_ids"])))

# np.savez_compressed('all_ordered_documents', ord_docs=all_ordered_documents)
# all_ordered_documents = np.load('all_ordered_documents.npz')['ord_docs']

len(all_ordered_documents)

347807

In [22]:
# grab only subset of the ordered documents
limit = 10000
# TODO: does this line generate an out of bounds access?
ordered_documents = all_ordered_documents[0:limit]
limit = min(limit, len(ordered_documents))
limit

# cursor is a generator which means it yields a new doc one at a time
cursor = db.documents.find(
    # query
    {"id":{"$in": [document[0] for document in ordered_documents]}},
    projection=projection,
    # batch size means number of documents at a time taken from MDB, no impact on iteration
    batch_size=500
)
document_ids = []
document_vectors = []

# for large datasets, this will take a while. Would be better to find out whether the UMAP fns can 
# accept generators for lazy calculation
for document in tqdm.tqdm(cursor, total=limit):
    document_ids.append(document["id"])
    document_vectors.append(document["textVector"])

print("There are " + str(len(document_ids)) + " document ids.")
print("There are " + str(len(document_vectors)) + " document vectors.")

np.savez_compressed('teleo_order_docs', doc_ids=document_ids, doc_vecs=document_vectors)


100%|███████████████████████████████████| 10000/10000 [00:08<00:00, 1180.41it/s]


There are 10000 document ids.
There are 10000 document vectors.


### Load Training Data

#### Using First Group's Teleoscope Ordering

In [34]:
loaded = np.load('teleo_order_docs.npz')
document_ids = loaded['doc_ids'].tolist()
document_vectors = loaded['doc_vecs']

#### Training Data

In [39]:
data = document_vectors
data.shape

(10000, 512)

#### Add human cluster docs to data 

In [147]:
for group in groups:

    # grab latest history item for each group
    group_document_ids = group["history"][0]["included_documents"]
    
    indices = []
    
    for id in group_document_ids:
        
        try:
            index = document_ids.index(id)
            indices.append(index)
        
        except:
            document = db.documents.find_one({"id": id}, projection=projection)
            document_ids.append(id)
            vector = np.array(document["textVector"]).reshape((1, 512))
            data = np.append(data, vector, axis=0)
            
            index = document_ids.index(id)
            indices.append(index)
            
    print("Document ids has the shape: ", len(document_ids))
    print("The data has shape: ", data.shape)
    
    print(indices)
        

Document ids has the shape:  10000
The data has shape:  (10000, 512)
[5630, 7789, 2801, 3965]
Document ids has the shape:  10005
The data has shape:  (10005, 512)
[10000, 10001, 10002, 10003, 6135, 9393, 10004]
Document ids has the shape:  10010
The data has shape:  (10010, 512)
[10005, 10006, 10007, 10008, 10009]


### Clustering

#### UMAP

Notes
- DR by default hyperparameters
- low_memory uses less memory but longer compute time
- verbose just logs info

TODO
- define custom metric
- what args are passed to custom metric
- how do we check if args are a subset of group?
- can we use conditional or do we need matrix

ISSUES
- general 
    - how to check if i and j are in the same group
    - i and j are vectors
    - is it possible to pass i and j as indices instead of vectors?
- conditional metric
    - groups contain ids
    - maybe search for indices of both id and vec then compare? seems expensive
    - maybe translate vec to id (or visversa) then compare? seems expensive
- matrix metric
    - need to create
    - how lookup in metric?
        - need to get index of i and j vectors

##### Custom Metric

In [149]:
# use dict or ndarray?
group_doc_ids = {}

for group in range(len(groups)):
    curr_id = groups[group]['history'][0]['included_documents']
    ids = []
    for i in curr_id:
        ids.append(document_ids.index(i))
    
    group_doc_ids[group] = ids
    
group_doc_ids

{0: [5630, 7789, 2801, 3965],
 1: [10000, 10001, 10002, 10003, 6135, 9393, 10004],
 2: [10005, 10006, 10007, 10008, 10009]}

##### Create Distance Matrix

In [167]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
size = len(document_vectors)
dist_mat = np.zeros((size,size))

for diag in range(size):
    for row in range(size-diag):  
        
        col = row + diag
        
        if col != row:
            
            vec_i = document_vectors[i]
            vec_j = document_vectors[j]
            dist = cosine_similarity([vec_i],[vec_j])[0][0]
            dist_mat[i,j] = dist
            dist_mat[j,i] = dist

dist_mat

In [234]:
size = 10
M = np.zeros((size,size))
M

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
size = len(M)
for diag in range(size):
    for row in range(size-diag):  
        
        col = row + diag
        
        if col != row:
            # M[col,row] = 1
            M[row,col] = 1
        

In [None]:
M[np.tril_indices(n, -1)] = M.T[np.tril_indices(n, -1)]
M

In [233]:
_R = np.random.uniform(-1,1,n*(n-1)/2)
P = np.zeros((n,n))
P[np.triu_indices(n, 1)] = _R
P[np.tril_indices(n, -1)] = P.T[np.tril_indices(n, -1)]

array([[0., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 1., 0., 0., 0.],
       [0., 0., 1., 1., 1., 1., 1., 1., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 0.]])

In [182]:
np.savez_compressed('distance_matrix', mat=distance_matrix)

##### Update Distance Matrix with group distances

In [170]:
for group in range(len(groups)):
    
    docs = groups[group]['history'][0]['included_documents']

    for i in range(len(docs)):
        
        index_i = document_ids.index(docs[i])
        
        for j in range(len(docs)):
            
            if (i != j):
            
                index_j = document_ids.index(docs[j])
                distance_matrix[index_i, index_j] = 0 
    

IndexError: index 10000 is out of bounds for axis 0 with size 10000

##### Reduction

In [156]:
fitter = umap.UMAP(metric='precomputed', verbose=True, low_memory=True).fit(distance_matrix)
embedding = fitter.embedding_
umap_embeddings = fitter.transform(distance_matrix)

UMAP(metric='precomputed', verbose=True)
Thu Jan 19 13:21:49 2023 Construct fuzzy simplicial set
Thu Jan 19 13:21:49 2023 Finding Nearest Neighbors
Thu Jan 19 13:21:49 2023 Finished Nearest Neighbor Search
Thu Jan 19 13:21:49 2023 Construct embedding


  warn("using precomputed metric; inverse_transform will be unavailable")


Epochs completed:   0%|            0/500 [00:00]

Thu Jan 19 13:21:52 2023 Finished embedding


#### HDBSCAN

Notes
- Clustering by default hyperparameters
- Resultant labels are in the same ordering as data

TODO
- User parameterize hyperparams
- Use custom metric hyperparam here too?

In [80]:
clusterer = hdbscan.HDBSCAN()
hdbscan_labels = clusterer.fit_predict(umap_embeddings)

In [81]:
label_array = np.array(hdbscan_labels)
clusters = {}

# iterate over given labels
for hdbscan_label in set(hdbscan_labels):
        
        # find indices of documents for a current label
        document_indices_scalar = np.where(label_array == hdbscan_label)[0]
        document_indices = [int(i) for i in document_indices_scalar]
        
        # create list of document ids that are in current label
        documents = []
        
        for i in document_indices:
            documents.append(document_ids[i])
        
        # add label and respective document ids to clusters dictionary
        clusters[hdbscan_label] = documents
        

In [82]:
clusters.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,

In [58]:
set(hdbscan_labels)

{-1, 0, 1, 2, 3}

### Toy Data Experiments

In [98]:
penguins = pd.read_csv("https://raw.githubusercontent.com/allisonhorst/palmerpenguins/c19a904462482430170bfe2c718775ddb7dbb885/inst/extdata/penguins.csv")
penguins = penguins.dropna()
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [99]:
# remove categorical features
penguin_data = penguins[
    [
        "bill_length_mm",
        "bill_depth_mm",
        "flipper_length_mm",
        "body_mass_g",
    ]
].values

# scaling
scaled_penguin_data = StandardScaler().fit_transform(penguin_data)

scaled_penguin_data

array([[-0.89604189,  0.7807321 , -1.42675157, -0.56847478],
       [-0.82278787,  0.11958397, -1.06947358, -0.50628618],
       [-0.67627982,  0.42472926, -0.42637319, -1.1903608 ],
       ...,
       [ 1.02687621,  0.52644436, -0.56928439, -0.53738048],
       [ 1.24663828,  0.93330475,  0.64546078, -0.13315457],
       [ 1.13675725,  0.7807321 , -0.2120064 , -0.53738048]])

In [100]:
group1 = scaled_penguin_data[0:2]
human_cluster

array([[-0.89604189,  0.7807321 , -1.42675157, -0.56847478],
       [-0.82278787,  0.11958397, -1.06947358, -0.50628618]])

In [25]:
type(scaled_penguin_data)
type(human_cluster)

numpy.ndarray

In [26]:
a = scaled_penguin_data[0]
b = scaled_penguin_data[1]
c = scaled_penguin_data[5]

type(a)

numpy.ndarray

In [None]:
# if the reduction works, the distance between index 0 and 1 should be 0 
a = embedding[0]
b = embedding[1]
np.dot(a,b)/(norm(a)*norm(b))