In [1]:
import random
from tqdm import tqdm
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import numpy as np
import pandas as pd
import csv
from sentence_transformers import SentenceTransformer
from sklearn.cluster import BisectingKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Single seed: applied to Python's `random` module here and passed as `random_state=`
# to the UMAP and BisectingKMeans models constructed further down.
random_state = 42
# Alternative seed kept for reference:
#random_state = 420
random.seed(random_state)

# Inputs for read_cleaned(): two-column CSV files whose rows are unpacked as
# (cord_uid, passage_text_cleaned).
# NOTE(review): absolute, machine-specific paths — consider deriving them from a
# configurable data directory.
f_tokenized_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/0.csv"
f_tokenized_other_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/11.csv"
# Contrast corpus; the cells that load it skip its first line as a header row.
f_tokenized_contrast_path = "/home/tfink/projects/rsa/kodicare/kodicare_framework/data/trec_covid_topic_modelling/abcnews-date-text.csv"

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
def read_cleaned(path):
    """Read a two-column passage CSV and group the passages by document id.

    Every non-empty row must hold exactly two fields, ``cord_uid`` and
    ``passage_text_cleaned``. Passages are collected per ``cord_uid`` in file
    order.

    Args:
        path: Path of the comma-separated file (``"`` is the quote character).

    Returns:
        A ``(documents, document_uids)`` tuple. ``documents`` maps each uid to
        the list of its passages; ``document_uids`` lists the uids in order of
        first appearance.

    Raises:
        ValueError: If a non-empty row does not contain exactly two fields.
    """
    documents = {}
    document_uids = []
    # newline="" is what the csv module expects from its file objects; without
    # it, line breaks embedded in quoted fields are not read back correctly.
    with open(path, "r", newline="") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        for row in tqdm(reader):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing empty line).
                continue
            cord_uid, passage_text_cleaned = row
            if cord_uid not in documents:
                # First passage of this document: remember the uid's position.
                document_uids.append(cord_uid)
            documents.setdefault(cord_uid, []).append(passage_text_cleaned)
    return documents, document_uids
        

In [29]:
# Load the corpus at f_tokenized_path as {uid: [passages]} plus the uids in first-seen order.
documents, document_uids = read_cleaned(f_tokenized_path)

1115954it [00:08, 131887.62it/s]


In [30]:
# Spot-check: the passage list of the document at position 30 of document_uids.
documents[document_uids[30]]

['glycyrrhiza glabra l]. glycyrrhiza glabra l].']

# Basic TF-IDF delta

In [31]:
def doc_iter(doc_dict, document_uids):
    """Lazily yield one string per document: its passages joined by single spaces.

    Documents are produced in the order given by ``document_uids``; a uid that
    is missing from ``doc_dict`` raises ``KeyError`` when it is reached.
    """
    separator = " "
    for document_uid in document_uids:
        passages = doc_dict[document_uid]
        yield separator.join(passages)

# Bag-of-words counts, one row per document (passages joined by doc_iter).
# max_df=0.75 ignores terms that occur in more than 75% of the documents.
count_vect = CountVectorizer(max_df=0.75)
X_train = count_vect.fit_transform(doc_iter(doc_dict=documents, document_uids=document_uids))
X_train.shape

(29487, 619934)

In [32]:
# Fit the IDF weights on the training counts, then replace X_train by its TF-IDF form.
# NOTE(review): X_train is rebound in place, so re-running only this cell would fit and
# transform the already-weighted matrix.
tf_transformer = TfidfTransformer().fit(X_train)
X_train = tf_transformer.transform(X_train)
X_train.shape

(29487, 619934)

In [3]:
# Neighbourhood sizes k used for the "Top{k}" similarity summaries. The print cells
# index top_ks[0], top_ks[1] and top_ks[2], so they expect at least three entries.
top_ks = [1,10,100]

def batched_cosine_similarity(X,Y, batch_size=4096, dtype=np.float64):
    """Return the dense cosine-similarity matrix between the rows of X and the rows of Y.

    Y is processed in consecutive row blocks of ``batch_size``; each block of
    similarities is cast to ``dtype`` and written into the matching columns of
    the result, which has shape ``(X.shape[0], Y.shape[0])``.
    """
    n_rows_x, n_rows_y = X.shape[0], Y.shape[0]
    n_batches = int(np.ceil(n_rows_y / batch_size))
    similarities = np.zeros((n_rows_x, n_rows_y), dtype=dtype)
    for batch_no in tqdm(range(n_batches)):
        lo = batch_no * batch_size
        hi = lo + batch_size  # may run past the end; slicing clips it
        block = cosine_similarity(X, Y[lo:hi, :])
        similarities[:, lo:hi] = block.astype(dtype=dtype)
    return similarities

def batched_cosine_similarity_topks_Y2X(X,Y, batch_size=4096, dtype=np.float64, top_ks=[1,3,10]):
    """Average top-k cosine similarity from the rows of Y to the rows of X.

    Y is processed in row blocks of ``batch_size``. For each block the cosine
    similarities against all of X are computed and cast to ``dtype``; then, for
    every k in ``top_ks``, each row of Y gets the mean similarity of its k most
    similar rows of X (via ``get_mean_similarities``).

    Returns:
        A 1-D array with one entry per k in ``top_ks``: the per-row means
        averaged over all rows of Y.
    """
    n_rows_y = Y.shape[0]
    n_batches = int(np.ceil(n_rows_y / batch_size))
    # One row per vector of Y, one column per requested k.
    per_row_means = np.zeros((n_rows_y, len(top_ks)), dtype=dtype)
    for batch_no in tqdm(range(n_batches)):
        lo = batch_no * batch_size
        hi = lo + batch_size  # may run past the end; slicing clips it
        block = cosine_similarity(X, Y[lo:hi, :]).astype(dtype=dtype)
        for column, k in enumerate(top_ks):
            per_row_means[lo:hi, column] = get_mean_similarities(block, top_k=k)
    return np.mean(per_row_means, axis=0)


def get_mean_similarities(sim_X_Y, top_k):
    """For every column of ``sim_X_Y``, average its ``top_k`` largest entries.

    Args:
        sim_X_Y: 2-D similarity array. The selection runs along axis 0, i.e.
            the ``top_k`` largest values are taken per column.
        top_k: Number of largest entries to average per column. Values larger
            than the number of rows are clamped, so the mean then covers the
            whole column.

    Returns:
        1-D array with one mean per column of ``sim_X_Y``.

    Raises:
        ValueError: If ``top_k`` is smaller than 1 or ``sim_X_Y`` has no rows.
    """
    sim_X_Y = np.asarray(sim_X_Y)
    n_rows = sim_X_Y.shape[0]
    if top_k < 1:
        # A negated 0 would silently select every row instead of none.
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    if n_rows == 0:
        raise ValueError("sim_X_Y has no rows to select neighbours from")
    k = min(top_k, n_rows)
    # Partitioning at n_rows - k moves the k largest values of each column into
    # the last k rows (in arbitrary order) without a full sort, for all columns at once.
    top_values = np.partition(sim_X_Y, n_rows - k, axis=0)[n_rows - k:, :]
    return top_values.mean(axis=0)
    

In [34]:
# Self-similarity baseline: X_train is compared against itself, in float16.
# NOTE(review): each row is also compared with itself, which is presumably why the
# Top1 value in the saved output is exactly 1.0.
sim_means = batched_cosine_similarity_topks_Y2X(X_train,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)
# Prints exactly three entries; assumes len(top_ks) >= 3.
print(f"Top{top_ks[0]}: {sim_means[0]:.4f}, Top{top_ks[1]}: {sim_means[1]:.4f}, Top{top_ks[2]}: {sim_means[2]:.4f}")
sim_means

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [02:49<00:00, 21.18s/it]

Top1: 1.0000, Top10: 0.4514, Top100: 0.2683





array([1.    , 0.4514, 0.2683], dtype=float16)

## Delta

In [35]:
# Load the second corpus and project it with the vectorizer and IDF weights fitted on
# the first one (transform only, no refit), so X_test shares X_train's feature columns.
# NOTE(review): this rebinds `documents` / `document_uids`; the first corpus is no
# longer reachable under those names afterwards.
documents, document_uids = read_cleaned(f_tokenized_other_path)
X_test = count_vect.transform(doc_iter(doc_dict=documents, document_uids=document_uids))
X_test = tf_transformer.transform(X_test)
X_test.shape

1102137it [00:08, 126462.32it/s]


(29488, 619934)

In [36]:
# test2train: for every row of X_test, the mean similarity to its top-k rows of X_train.
sim_means_test2train = batched_cosine_similarity_topks_Y2X(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)
# train2test: the same measurement with the two roles swapped.
sim_means_train2test = batched_cosine_similarity_topks_Y2X(X_test,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [02:45<00:00, 20.67s/it]
100%|██████████| 8/8 [02:44<00:00, 20.52s/it]


In [37]:
# Report both directions with four decimals; assumes len(top_ks) >= 3.
print(f"Test2Train Top{top_ks[0]}: {sim_means_test2train[0]:.4f}, Top{top_ks[1]}: {sim_means_test2train[1]:.4f}, Top{top_ks[2]}: {sim_means_test2train[2]:.4f}")
print(f"Train2Test Top{top_ks[0]}: {sim_means_train2test[0]:.4f}, Top{top_ks[1]}: {sim_means_train2test[1]:.4f}, Top{top_ks[2]}: {sim_means_train2test[2]:.4f}")

Test2Train Top1: 0.4280, Top10: 0.3325, Top100: 0.2223
Train2Test Top1: 0.4336, Top10: 0.3408, Top100: 0.2312


## Contrast Dataset Delta

In [13]:
# Load the contrast corpus as two parallel lists: the text of every row and the value
# of its first column.
# NOTE(review): `documents` becomes a list here (it is a dict after read_cleaned), and
# the first column is stored under the name `cord_uid` although this file is not one of
# the read_cleaned inputs — confirm what that column holds.
# newline="" is what the csv module expects from its file objects; without it, line
# breaks embedded in quoted fields are not read back correctly.
with open(f_tokenized_contrast_path, "r", newline="") as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    documents = []
    document_uids = []
    # Skip the header line; the default keeps an empty file from raising StopIteration.
    next(reader, None)
    for line in tqdm(reader):
        if not line:
            # csv.reader yields [] for blank lines (e.g. a trailing empty line).
            continue
        cord_uid, passage_text_cleaned = line
        documents.append(passage_text_cleaned)
        document_uids.append(cord_uid)
print(documents[0])

1244184it [00:01, 1191319.82it/s]

aba decides against community broadcasting licence





In [14]:
# Vectorize the contrast texts with the already fitted vectorizer and IDF weights
# (transform only).
# NOTE(review): `count_vect` and `tf_transformer` are rebound later in the notebook for
# the topic vocabulary, so which pair is in scope depends on execution order — confirm
# before re-running.
X_test = count_vect.transform(documents)
X_test = tf_transformer.transform(X_test)
X_test.shape

(1244184, 619934)

In [15]:
# test2train: for every row of X_test, the mean similarity to its top-k rows of X_train.
sim_means_test2train = batched_cosine_similarity_topks_Y2X(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)
# train2test: the same measurement with the two roles swapped.
# NOTE(review): only the second argument is batched, so here every batch is compared
# against all rows of X_test at once — check the memory footprint for the large
# contrast corpus.
sim_means_train2test = batched_cosine_similarity_topks_Y2X(X_test,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)

100%|██████████| 304/304 [40:11<00:00,  7.93s/it]


In [16]:
# Report both directions with four decimals; assumes len(top_ks) >= 3.
# NOTE(review): the output saved under this cell is a single "Top1/Top10/Top100" line
# without the Test2Train/Train2Test prefixes, so it was not produced by this code —
# re-run before citing the numbers.
print(f"Test2Train Top{top_ks[0]}: {sim_means_test2train[0]:.4f}, Top{top_ks[1]}: {sim_means_test2train[1]:.4f}, Top{top_ks[2]}: {sim_means_test2train[2]:.4f}")
print(f"Train2Test Top{top_ks[0]}: {sim_means_train2test[0]:.4f}, Top{top_ks[1]}: {sim_means_train2test[1]:.4f}, Top{top_ks[2]}: {sim_means_train2test[2]:.4f}")

Top1: 0.2435, Top10: 0.1461, Top100: 0.0609


# Topic-vocabulary-based delta

In [4]:
def passage_iter(doc_dict, document_uids):
    """Lazily yield ``(passage, uid)`` pairs for every passage of the listed documents.

    Documents are visited in the order of ``document_uids`` and passages in
    their stored order; documents without passages contribute nothing.
    """
    for document_uid in document_uids:
        yield from ((text, document_uid) for text in doc_dict[document_uid])

In [5]:
# Reload the first corpus, then flatten its first 10,000 documents into two parallel
# tuples: the passages and, for each passage, the uid of the document it belongs to.
documents, document_uids = read_cleaned(f_tokenized_path)
passages, passage_uids = zip(*list(passage_iter(doc_dict=documents, document_uids=document_uids[:10000])))

1115954it [00:08, 128269.29it/s]


In [6]:
# Encode every passage with a pretrained sentence-transformers model, 32 passages per batch.
embedding_model = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(embedding_model)
# presumably one embedding vector per passage — TODO confirm the returned shape
passages_encoded = model.encode(passages, show_progress_bar=True, batch_size=32)

Batches: 100%|██████████| 9028/9028 [08:43<00:00, 17.24it/s]


In [7]:
# Dimensionality reduction: project the passage embeddings to 5 components with UMAP
# (cosine metric), seeded with random_state. The fitted model is kept because later
# cells call umap_model.transform() on other passages.
umap_model = umap.UMAP(n_neighbors=15, 
                       n_components=5, 
                       min_dist=0.0, 
                       metric='cosine', 
                       random_state=random_state, 
                       verbose=True)
embedding = umap_model.fit_transform(passages_encoded)

UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, random_state=42, verbose=True)
Wed Aug 23 12:22:07 2023 Construct fuzzy simplicial set
Wed Aug 23 12:22:07 2023 Finding Nearest Neighbors
Wed Aug 23 12:22:07 2023 Building RP forest with 32 trees


Wed Aug 23 12:22:18 2023 NN descent for 18 iterations
	 1  /  18
	 2  /  18
	 3  /  18
	 4  /  18
	 5  /  18
	 6  /  18
	 7  /  18
	Stopping threshold met -- exiting after 7 iterations
Wed Aug 23 12:22:42 2023 Finished Nearest Neighbor Search
Wed Aug 23 12:22:45 2023 Construct embedding


Epochs completed: 100%| ██████████ 200/200 [01:32]


Wed Aug 23 12:25:01 2023 Finished embedding


In [8]:
# Number of clusters: the square root of the number of embedded passages, truncated to an int.
n_clusters = int(np.sqrt(len(embedding)))

# Fit the clustering on the reduced embeddings; `topics` holds one cluster id per passage.
clustering_model = BisectingKMeans(n_clusters=n_clusters, 
                       n_init=1,
                       bisecting_strategy='largest_cluster',
                       random_state=random_state)
topics = clustering_model.fit_predict(embedding)

# Alternative clustering model, disabled and kept for reference
# (GaussianMixture is not among the imports in the first cell):
# clustering_model = GaussianMixture(n_components=n_clusters, 
#                        n_init=1,
#                        init_params='k-means++',
#                        covariance_type='diag',
#                        reg_covar=1e-3,
#                        random_state=random_state)
# topics = clustering_model.fit_predict(embedding)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# Group the cluster ids (as strings) of all passages by the uid of the owning document.
document_topics = {}
for topic, passage_uid in zip(topics, passage_uids):
    document_topics.setdefault(passage_uid, []).append(str(topic))

# Collapse each training document's id list into one space-separated string, so a
# document reads like a sentence of topic "words".
for document_uid in document_uids[:10000]:
    document_topics[document_uid] = " ".join(document_topics[document_uid])

In [10]:
# Spot-check: the topic string of the document at position 30 of document_uids.
document_topics[document_uids[30]]

'523'

In [11]:
def doc_topic_iter(doc_topic_dict, document_uids):
    """Lazily yield the stored entry of each uid, in the order of ``document_uids``."""
    yield from (doc_topic_dict[document_uid] for document_uid in document_uids)


# CountVectorizer needs a custom token_pattern to include 1-character tokens
# (clusters 0-9); its default pattern only keeps tokens of two or more characters.
# max_df=1.0 keeps every topic id, however many documents contain it.
count_vect = CountVectorizer(max_df=1.0, token_pattern=r"(?u)\b\w+\b")
X_train = count_vect.fit_transform(doc_topic_iter(doc_topic_dict=document_topics, document_uids=document_uids[:10000]))
# Sanity check that single-character ids survived tokenization. A membership test
# states the intent directly and avoids comparing against None with `!=`.
assert '0' in count_vect.vocabulary_, "single-character topic id '0' is missing from the vocabulary"
X_train.shape

(10000, 537)

In [12]:
# Fit the IDF weights on the topic counts, then replace X_train by its TF-IDF form.
# NOTE(review): this rebinds `tf_transformer` and `X_train`; re-running only this cell
# would fit and transform the already-weighted matrix.
tf_transformer = TfidfTransformer().fit(X_train)
X_train = tf_transformer.transform(X_train)
X_train.shape

(10000, 537)

In [13]:
# Self-similarity baseline on the topic vectors: X_train is compared against itself, in float16.
# NOTE(review): each row is also compared with itself, which is presumably why the
# Top1 value in the saved output is exactly 1.0.
sim_means = batched_cosine_similarity_topks_Y2X(X_train,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)
# Prints exactly three entries; assumes len(top_ks) >= 3.
print(f"Top{top_ks[0]}: {sim_means[0]:.4f}, Top{top_ks[1]}: {sim_means[1]:.4f}, Top{top_ks[2]}: {sim_means[2]:.4f}")
sim_means

100%|██████████| 3/3 [00:04<00:00,  1.65s/it]

Top1: 1.0000, Top10: 0.8511, Top100: 0.4995





array([1.    , 0.851 , 0.4995], dtype=float16)

## Delta

In [14]:
# Held-out documents (positions 10000-19999 of document_uids): flatten to passages and
# encode them with the same sentence-transformers model.
# NOTE(review): this rebinds `passages` / `passage_uids` and requires `documents` to
# still be the dict returned by read_cleaned.
passages, passage_uids = zip(*list(passage_iter(doc_dict=documents, document_uids=document_uids[10000:20000])))
other_p = model.encode(passages, show_progress_bar=True, batch_size=32)
# Reuse the fitted UMAP projection and cluster model (transform / predict, no refit).
other_p = umap_model.transform(other_p)
other_topics = clustering_model.predict(other_p)

Batches:   1%|          | 104/13906 [00:06<13:37, 16.89it/s]

Batches: 100%|██████████| 13906/13906 [13:30<00:00, 17.15it/s]


Wed Aug 23 12:38:59 2023 Worst tree score: 0.71120688
Wed Aug 23 12:38:59 2023 Mean tree score: 0.71581295
Wed Aug 23 12:38:59 2023 Best tree score: 0.71925507
Wed Aug 23 12:39:02 2023 Forward diversification reduced edges from 4333275 to 1746936
Wed Aug 23 12:39:05 2023 Reverse diversification reduced edges from 1746936 to 1746936
Wed Aug 23 12:39:07 2023 Degree pruning reduced edges from 2231508 to 2215553
Wed Aug 23 12:39:07 2023 Resorting data and graph based on tree order
Wed Aug 23 12:39:07 2023 Building and compiling search function


Epochs completed: 100%| ██████████ 30/30 [00:13]


In [15]:
# Group the predicted cluster ids (as strings) by the uid of the owning document.
document_topics = {}
for topic, passage_uid in zip(other_topics, passage_uids):
    document_topics.setdefault(passage_uid, []).append(str(topic))

# Collapse each held-out document's id list into one space-separated string.
for document_uid in document_uids[10000:20000]:
    document_topics[document_uid] = " ".join(document_topics[document_uid])

# Project with the vectorizer and IDF weights fitted on the training documents
# (transform only, no refit).
X_test = tf_transformer.transform(
    count_vect.transform(
        doc_topic_iter(doc_topic_dict=document_topics, document_uids=document_uids[10000:20000])
    )
)
X_test.shape

(10000, 537)

In [16]:
sim_means_test2train = batched_cosine_similarity_topks_Y2X(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)
sim_means_train2test = batched_cosine_similarity_topks_Y2X(X_test,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)

100%|██████████| 3/3 [00:05<00:00,  1.76s/it]
100%|██████████| 3/3 [00:05<00:00,  1.74s/it]


In [17]:
print(f"Test2Train Top{top_ks[0]}: {sim_means_test2train[0]:.4f}, Top{top_ks[1]}: {sim_means_test2train[1]:.4f}, Top{top_ks[2]}: {sim_means_test2train[2]:.4f}")
print(f"Train2Test Top{top_ks[0]}: {sim_means_train2test[0]:.4f}, Top{top_ks[1]}: {sim_means_train2test[1]:.4f}, Top{top_ks[2]}: {sim_means_train2test[2]:.4f}")

Test2Train Top1: 0.8242, Top10: 0.7549, Top100: 0.4419
Train2Test Top1: 0.8662, Top10: 0.7666, Top100: 0.4436


## Contrast Dataset Delta

In [18]:
# Load the contrast corpus as two parallel lists: the text of every row and the value
# of its first column.
# NOTE(review): `documents` becomes a list here (it is a dict after read_cleaned), and
# the first column is stored under the name `cord_uid` although this file is not one of
# the read_cleaned inputs — confirm what that column holds.
# newline="" is what the csv module expects from its file objects; without it, line
# breaks embedded in quoted fields are not read back correctly.
with open(f_tokenized_contrast_path, "r", newline="") as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    documents = []
    document_uids = []
    # Skip the header line; the default keeps an empty file from raising StopIteration.
    next(reader, None)
    for line in tqdm(reader):
        if not line:
            # csv.reader yields [] for blank lines (e.g. a trailing empty line).
            continue
        cord_uid, passage_text_cleaned = line
        documents.append(passage_text_cleaned)
        document_uids.append(cord_uid)
print(documents[0])

1244184it [00:01, 1242874.25it/s]

aba decides against community broadcasting licence





In [19]:
# Encode the first 10,000 contrast texts, then reuse the fitted UMAP projection and
# cluster model (transform / predict, no refit) to assign each text a cluster id.
other_p = model.encode(documents[:10000], show_progress_bar=True, batch_size=32)
other_p = umap_model.transform(other_p)
other_topics = clustering_model.predict(other_p)

Batches: 100%|██████████| 313/313 [00:02<00:00, 124.22it/s]


Epochs completed: 100%| ██████████ 100/100 [00:02]


In [20]:
# Each contrast text was encoded as a single unit, so its "document" is just its one
# cluster id rendered as a string.
other_topics = [str(topic) for topic in other_topics]

# Project with the topic vectorizer and IDF weights fitted on the training documents.
X_test = count_vect.transform(other_topics)
X_test = tf_transformer.transform(X_test)
X_test.shape

(10000, 537)

In [21]:
# test2train: for every row of X_test, the mean similarity to its top-k rows of X_train.
sim_means_test2train = batched_cosine_similarity_topks_Y2X(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)
# train2test: the same measurement with the two roles swapped.
sim_means_train2test = batched_cosine_similarity_topks_Y2X(X_test,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)

100%|██████████| 3/3 [00:04<00:00,  1.63s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:04<00:00,  1.63s/it]


In [22]:
# Report both directions with four decimals; assumes len(top_ks) >= 3.
print(f"Test2Train Top{top_ks[0]}: {sim_means_test2train[0]:.4f}, Top{top_ks[1]}: {sim_means_test2train[1]:.4f}, Top{top_ks[2]}: {sim_means_test2train[2]:.4f}")
print(f"Train2Test Top{top_ks[0]}: {sim_means_train2test[0]:.4f}, Top{top_ks[1]}: {sim_means_train2test[1]:.4f}, Top{top_ks[2]}: {sim_means_train2test[2]:.4f}")

Test2Train Top1: 0.9883, Top10: 0.8813, Top100: 0.3257
Train2Test Top1: 0.8379, Top10: 0.6958, Top100: 0.2568
