In [14]:
import random
from tqdm import tqdm
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import numpy as np
import pandas as pd
import csv
from sentence_transformers import SentenceTransformer
from sklearn.cluster import BisectingKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Seed used for Python's `random` module here and passed as `random_state`
# to the UMAP and clustering models further down.
random_state = 42
#random_state = 420
# Only the stdlib RNG is seeded globally; the models receive the seed explicitly.
random.seed(random_state)

# Input CSV files.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
# Corpus used to fit the vectorisers / topic model.
f_tokenized_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/0.csv"
# Second corpus file, used in the first TF-IDF "Delta" section.
f_tokenized_other_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/11.csv"
# Corpus used in the "Contrast Dataset Delta" sections.
f_tokenized_contrast_path = "/home/tfink/projects/rsa/kodicare/kodicare_framework/data/trec_covid_topic_modelling/abcnews-date-text.csv"

In [35]:
def read_cleaned(path):
    """Read a two-column CSV of ``(cord_uid, passage_text_cleaned)`` rows.

    Passages are grouped per ``cord_uid`` in the order they appear in the file.
    No header row is skipped.

    Args:
        path: Path of a comma-delimited CSV file using ``"`` as quote character.

    Returns:
        A tuple ``(documents, document_uids)``: ``documents`` maps each uid to the
        list of its passage texts, ``document_uids`` lists the uids in order of
        first appearance.

    Raises:
        ValueError: If a row does not consist of exactly two fields.
    """
    documents = {}
    document_uids = []
    # newline="" is what the csv module expects from a file object, so that line
    # breaks inside quoted fields are handled by the reader, not by the file.
    with open(path, "r", newline="") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        for cord_uid, passage_text_cleaned in tqdm(reader):
            if cord_uid not in documents:
                # First passage of this document: remember the uid order.
                documents[cord_uid] = []
                document_uids.append(cord_uid)
            documents[cord_uid].append(passage_text_cleaned)
    return documents, document_uids
        

In [36]:
# Load the base corpus: passages grouped per document uid, plus the uids in file order.
documents, document_uids = read_cleaned(f_tokenized_path)

1115954it [00:08, 129726.46it/s]


In [38]:
# Spot-check: display the passage list of one document (position 30 in file order).
documents[document_uids[30]]

['glycyrrhiza glabra l]. glycyrrhiza glabra l].']

# Basic TF-IDF delta

In [5]:
def doc_iter(doc_dict, document_uids):
    """Lazily yield one string per uid: that document's passages joined by single spaces.

    Args:
        doc_dict: Mapping from uid to a list of passage strings.
        document_uids: Iterable of uids; determines the output order.
    """
    yield from (" ".join(doc_dict[doc_uid]) for doc_uid in document_uids)

# Bag-of-words counts with one row per document (its passages joined into one string).
# max_df=0.75 ignores terms that occur in more than 75% of the documents.
count_vect = CountVectorizer(max_df=0.75)
X_train = count_vect.fit_transform(doc_iter(doc_dict=documents, document_uids=document_uids))
# Display (n_documents, vocabulary_size).
X_train.shape

(29487, 619934)

In [6]:
# Fit the TF-IDF weighting on the training counts, then reweight them.
# NOTE: `X_train` is overwritten with its TF-IDF version, so re-running this cell on its own
# would fit the transformer on already-weighted values — re-run the vectoriser cell first.
tf_transformer = TfidfTransformer().fit(X_train)
X_train = tf_transformer.transform(X_train)
X_train.shape

(29487, 619934)

In [19]:
# Neighbourhood sizes k for which the mean top-k similarity is reported.
top_ks = [1,3,10]

def batched_cosine_similarity(X,Y, batch_size=4096, dtype=np.float64):
    """Return the dense cosine-similarity matrix between the rows of ``X`` and ``Y``.

    ``Y`` is processed ``batch_size`` rows at a time; each partial result is cast to
    ``dtype`` and written into its column range of the output.

    Args:
        X: Matrix whose rows index the output rows.
        Y: Matrix whose rows index the output columns.
        batch_size: Number of ``Y`` rows handled per step.
        dtype: dtype of the returned matrix.

    Returns:
        Array of shape ``(X.shape[0], Y.shape[0])`` with dtype ``dtype``.
    """
    n_rows, n_cols = X.shape[0], Y.shape[0]
    n_batches = int(np.ceil(n_cols / batch_size))
    result = np.zeros((n_rows, n_cols), dtype=dtype)
    for batch_no in tqdm(range(n_batches)):
        lo = batch_no * batch_size
        hi = lo + batch_size  # may run past n_cols; slicing clips it
        result[:, lo:hi] = cosine_similarity(X, Y[lo:hi, :]).astype(dtype=dtype)
    return result

def batched_cosine_similarity_topks(X,Y, batch_size=4096, dtype=np.float64, top_ks=(1, 3, 10)):
    """Average top-k cosine similarity of the rows of ``Y`` against the rows of ``X``.

    ``Y`` is processed in slices of ``batch_size`` rows, so only one
    ``X.shape[0] x batch_size`` similarity block exists at a time. For every row of
    ``Y`` and every ``k`` in ``top_ks``, the mean of its ``k`` highest similarities to
    rows of ``X`` is stored; those per-row values are then averaged over all of ``Y``.

    Note: if ``X`` and ``Y`` are the same matrix, every row is also compared with
    itself, so that self-match is part of its top-k similarities.

    Args:
        X: Reference matrix, one row per item.
        Y: Query matrix with the same number of columns as ``X``.
        batch_size: Number of ``Y`` rows per similarity block.
        dtype: dtype the similarities are cast to and in which the per-row means
            are stored (e.g. ``np.float16`` trades precision for memory).
        top_ks: Sequence of neighbourhood sizes ``k``. The default is a tuple
            rather than a list so the shared default cannot be mutated.

    Returns:
        1-D array with ``len(top_ks)`` entries, in the order of ``top_ks``.
    """
    Y_size = Y.shape[0]
    steps = int(np.ceil(Y_size / batch_size))
    # One row per Y item, one column per requested k.
    top_k_means = np.zeros((Y_size, len(top_ks)), dtype=dtype)
    for i in tqdm(range(steps)):
        start_idx = i*batch_size
        end_idx = start_idx + batch_size  # may exceed Y_size; slicing clips it
        sim = cosine_similarity(X,Y[start_idx:end_idx,:]).astype(dtype=dtype)
        for top_k_idx, top_k in enumerate(top_ks):
            top_k_means[start_idx:end_idx,top_k_idx] = get_mean_similarities(sim, top_k=top_k)
    return np.mean(top_k_means, axis=0)


def get_mean_similarities(sim_X_Y, top_k):
    """For every column of ``sim_X_Y``, average its ``top_k`` largest entries.

    Args:
        sim_X_Y: 2-D array of similarities; each column is reduced independently
            over the rows.
        top_k: Number of largest values per column to average; must satisfy
            ``1 <= top_k <= sim_X_Y.shape[0]``.

    Returns:
        1-D array with one mean per column of ``sim_X_Y``.

    Raises:
        ValueError: If ``top_k`` is outside ``1..sim_X_Y.shape[0]``. (A value of 0
            would otherwise silently average the whole column, because ``[-0:]``
            selects everything.)
    """
    # https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
    sim_X_Y = np.asarray(sim_X_Y)
    n_rows = sim_X_Y.shape[0]
    if not 1 <= top_k <= n_rows:
        raise ValueError(f"top_k must be between 1 and {n_rows}, got {top_k}")
    # One partition along the row axis replaces the per-column Python loop: afterwards
    # the last `top_k` rows hold each column's `top_k` largest values (in no particular
    # order). np.partition works on a copy, so the caller's array is left untouched.
    largest = np.partition(sim_X_Y, -top_k, axis=0)[-top_k:, :]
    return largest.mean(axis=0)
    

In [None]:
# Baseline: compare the training matrix with itself (each row is also matched against itself).
sim_means = batched_cosine_similarity_topks(X_train,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)
# The unpacking assumes `top_ks` has exactly three entries.
top_1, top_3, top_10 = sim_means
print(f"Top1: {top_1:.4f}, Top3: {top_3:.4f}, Top10: {top_10:.4f}")
sim_means

## Delta

In [8]:
# Load the other corpus file; this rebinds `documents` and `document_uids`.
documents, document_uids = read_cleaned(f_tokenized_other_path)
# Transform only (no refit): reuse the vocabulary and IDF weights fitted on the training corpus.
X_test = count_vect.transform(doc_iter(doc_dict=documents, document_uids=document_uids))
X_test = tf_transformer.transform(X_test)
X_test.shape

1102137it [00:08, 130331.99it/s]


(29488, 619934)

In [9]:
# For each test document: mean similarity to its top-k most similar training documents,
# averaged over all test documents (one value per k in `top_ks`).
sim_means = batched_cosine_similarity_topks(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)

100%|██████████| 8/8 [02:49<00:00, 21.16s/it]


In [10]:
# Report the TF-IDF delta scores; the unpacking assumes `top_ks` has exactly three entries.
top_1, top_3, top_10 = sim_means
print(f"Top1: {top_1:.4f}, Top3: {top_3:.4f}, Top10: {top_10:.4f}")

Top1: 0.4280, Top3: 0.3877, Top10: 0.3325


## Contrast Dataset Delta

In [11]:
# Load the contrast corpus as two parallel lists (first column -> `document_uids`,
# second column -> `documents`). This rebinds `documents` from a dict to a list.
# newline="" is what the csv module expects from a file object, so that line breaks
# inside quoted fields are handled by the reader rather than by the file.
with open(f_tokenized_contrast_path, "r", newline="") as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    documents = []
    document_uids = []
    # skip first line (presumably a header row)
    next(reader)
    for line in tqdm(reader):
        # NOTE(review): the name `cord_uid` is reused for this file's first column,
        # whatever it contains — verify against the CSV.
        cord_uid, passage_text_cleaned = line
        documents.append(passage_text_cleaned)
        document_uids.append(cord_uid)
print(documents[0])

1244184it [00:00, 1278293.43it/s]

aba decides against community broadcasting licence





In [12]:
# Vectorise the contrast texts with the vocabulary and IDF weights fitted on the training corpus.
X_test = count_vect.transform(documents)
X_test = tf_transformer.transform(X_test)
X_test.shape

(1244184, 619934)

In [13]:
# Same top-k similarity measurement as for the delta corpus, now for the contrast texts.
sim_means = batched_cosine_similarity_topks(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)

100%|██████████| 304/304 [39:28<00:00,  7.79s/it]


In [14]:
# Report the TF-IDF contrast scores; the unpacking assumes `top_ks` has exactly three entries.
top_1, top_3, top_10 = sim_means
print(f"Top1: {top_1:.4f}, Top3: {top_3:.4f}, Top10: {top_10:.4f}")

Top1: 0.2435, Top3: 0.2009, Top10: 0.1461


# Topic Vocab based delta

In [5]:
def passage_iter(doc_dict, document_uids):
    """Lazily yield ``(passage, uid)`` pairs, document by document in the given uid order.

    Args:
        doc_dict: Mapping from uid to a list of passage strings.
        document_uids: Iterable of uids to walk through.
    """
    for doc_uid in document_uids:
        yield from ((text, doc_uid) for text in doc_dict[doc_uid])

In [39]:
# Reload the base corpus (earlier cells rebound `documents` / `document_uids`).
documents, document_uids = read_cleaned(f_tokenized_path)
# Flatten the first 10000 documents into two parallel tuples:
# passages[i] is a passage text, passage_uids[i] the uid of the document it belongs to.
passages, passage_uids = zip(*list(passage_iter(doc_dict=documents, document_uids=document_uids[:10000])))

1115954it [00:08, 135079.28it/s]


In [None]:
# Embed every passage with the SentenceTransformer checkpoint named below.
# `model` is reused later to embed the other corpora.
embedding_model = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(embedding_model)
passages_encoded = model.encode(passages, show_progress_bar=True, batch_size=32)

In [7]:
# Dimensionality reduction: project the passage embeddings to 5 dimensions with UMAP
# (cosine metric) before clustering. The fitted `umap_model` is reused later to
# transform the embeddings of other corpora.
umap_model = umap.UMAP(n_neighbors=15, 
                       n_components=5, 
                       min_dist=0.0, 
                       metric='cosine', 
                       random_state=random_state, 
                       verbose=True)
embedding = umap_model.fit_transform(passages_encoded)

UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, random_state=42, verbose=True)
Tue Aug 22 17:02:53 2023 Construct fuzzy simplicial set
Tue Aug 22 17:02:54 2023 Finding Nearest Neighbors
Tue Aug 22 17:02:54 2023 Building RP forest with 32 trees
Tue Aug 22 17:03:04 2023 NN descent for 18 iterations
	 1  /  18
	 2  /  18
	 3  /  18
	 4  /  18
	 5  /  18
	 6  /  18
	 7  /  18
	Stopping threshold met -- exiting after 7 iterations
Tue Aug 22 17:03:30 2023 Finished Nearest Neighbor Search
Tue Aug 22 17:03:34 2023 Construct embedding


Epochs completed: 100%| ██████████ 200/200 [01:35]


Tue Aug 22 17:05:58 2023 Finished embedding


In [8]:
# Number of clusters: square root of the number of embedded passages, truncated to int.
n_clusters = int(np.sqrt(len(embedding)))

# Assign every passage to a cluster ("topic"). The fitted model is reused later to
# label passages of other corpora via predict().
clustering_model = BisectingKMeans(n_clusters=n_clusters, 
                       n_init=1,
                       bisecting_strategy='largest_cluster',
                       random_state=random_state)
topics = clustering_model.fit_predict(embedding)

# Disabled alternative: Gaussian mixture clustering.
# NOTE(review): GaussianMixture is not imported in the import cell; enabling this
# needs `from sklearn.mixture import GaussianMixture`.
# clustering_model = GaussianMixture(n_components=n_clusters, 
#                        n_init=1,
#                        init_params='k-means++',
#                        covariance_type='diag',
#                        reg_covar=1e-3,
#                        random_state=random_state)
# topics = clustering_model.fit_predict(embedding)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
# Collect, per document uid, the cluster id of each of its passages (as strings, in passage order).
document_topics = {}
for topic, passage_uid in zip(topics, passage_uids):
    document_topics.setdefault(passage_uid, []).append(str(topic))

# Collapse every list into one space-separated "topic document" string,
# the input format used by the topic CountVectorizer below.
for document_uid in document_uids[:10000]:
    document_topics[document_uid] = " ".join(document_topics[document_uid])

In [41]:
# Spot-check: display the topic string of one document (position 30 in file order).
document_topics[document_uids[30]]

'523'

In [16]:
def doc_topic_iter(doc_topic_dict, document_uids):
    """Lazily yield the stored topic entry of each uid, in the order of ``document_uids``.

    Args:
        doc_topic_dict: Mapping from uid to that document's topic entry.
        document_uids: Iterable of uids to look up.
    """
    yield from (doc_topic_dict[doc_uid] for doc_uid in document_uids)


# Cluster ids 0-9 are one-character tokens; CountVectorizer's default token_pattern
# only keeps tokens of two or more characters, hence the custom pattern.
# max_df=1.0 keeps every topic, however frequent.
count_vect = CountVectorizer(max_df=1.0, token_pattern=r"(?u)\b\w+\b")
X_train = count_vect.fit_transform(
    doc_topic_iter(doc_topic_dict=document_topics, document_uids=document_uids[:10000])
)
# Sanity check that single-character topic ids made it into the vocabulary.
assert '0' in count_vect.vocabulary_
X_train.shape

(10000, 537)

In [17]:
# Fit the TF-IDF weighting on the topic counts, then reweight them.
# This rebinds `tf_transformer` (previously fitted on word counts).
# NOTE: `X_train` is overwritten with its TF-IDF version, so re-running this cell on its own
# would fit the transformer on already-weighted values — re-run the vectoriser cell first.
tf_transformer = TfidfTransformer().fit(X_train)
X_train = tf_transformer.transform(X_train)
X_train.shape

(10000, 537)

In [20]:
# Baseline on topic vectors: compare the training matrix with itself
# (each row is also matched against itself).
sim_means = batched_cosine_similarity_topks(X_train,X_train, batch_size=4096, dtype=np.float16, top_ks=top_ks)
# The unpacking assumes `top_ks` has exactly three entries.
top_1, top_3, top_10 = sim_means
print(f"Top1: {top_1:.4f}, Top3: {top_3:.4f}, Top10: {top_10:.4f}")
sim_means

100%|██████████| 3/3 [00:05<00:00,  1.85s/it]

Top1: 1.0000, Top3: 0.9224, Top10: 0.8511





array([1.    , 0.9224, 0.851 ], dtype=float16)

## Delta

In [21]:
# Second slice of the corpus (document positions 10000 up to, not including, 20000).
# NOTE(review): relies on `documents` / `document_uids` still holding the base corpus.
passages, passage_uids = zip(*list(passage_iter(doc_dict=documents, document_uids=document_uids[10000:20000])))
# Embed, project and label with the already-fitted models — nothing is refit here.
other_p = model.encode(passages, show_progress_bar=True, batch_size=32)
other_p = umap_model.transform(other_p)
other_topics = clustering_model.predict(other_p)

Batches: 100%|██████████| 13906/13906 [13:37<00:00, 17.01it/s]


Tue Aug 22 17:23:55 2023 Worst tree score: 0.71120688
Tue Aug 22 17:23:55 2023 Mean tree score: 0.71581295
Tue Aug 22 17:23:55 2023 Best tree score: 0.71925507
Tue Aug 22 17:23:58 2023 Forward diversification reduced edges from 4333275 to 1746936
Tue Aug 22 17:24:02 2023 Reverse diversification reduced edges from 1746936 to 1746936
Tue Aug 22 17:24:04 2023 Degree pruning reduced edges from 2231508 to 2215553
Tue Aug 22 17:24:04 2023 Resorting data and graph based on tree order
Tue Aug 22 17:24:04 2023 Building and compiling search function


Epochs completed: 100%| ██████████ 30/30 [00:15]


In [22]:
# Rebuild `document_topics` for the second corpus slice: per document uid, the
# predicted cluster id of each of its passages (as strings, in passage order).
document_topics = {}
for topic, passage_uid in zip(other_topics, passage_uids):
    document_topics.setdefault(passage_uid, []).append(str(topic))

# Collapse every list into one space-separated "topic document" string.
for document_uid in document_uids[10000:20000]:
    document_topics[document_uid] = " ".join(document_topics[document_uid])

# Transform only: reuse the topic vocabulary and IDF weights fitted on the training slice.
X_test = count_vect.transform(
    doc_topic_iter(doc_topic_dict=document_topics, document_uids=document_uids[10000:20000])
)
X_test = tf_transformer.transform(X_test)
X_test.shape

(10000, 537)

In [23]:
# For each document of the second slice: mean similarity to its top-k most similar
# training documents in topic space, averaged over the slice.
sim_means = batched_cosine_similarity_topks(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)

100%|██████████| 3/3 [00:05<00:00,  1.86s/it]


In [24]:
# Report the topic-based delta scores; the unpacking assumes `top_ks` has exactly three entries.
top_1, top_3, top_10 = sim_means
print(f"Top1: {top_1:.4f}, Top3: {top_3:.4f}, Top10: {top_10:.4f}")

Top1: 0.8242, Top3: 0.8032, Top10: 0.7549


## Contrast Dataset Delta

In [25]:
# Load the contrast corpus as two parallel lists (first column -> `document_uids`,
# second column -> `documents`). This rebinds `documents` from a dict to a list.
# newline="" is what the csv module expects from a file object, so that line breaks
# inside quoted fields are handled by the reader rather than by the file.
with open(f_tokenized_contrast_path, "r", newline="") as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    documents = []
    document_uids = []
    # skip first line (presumably a header row)
    next(reader)
    for line in tqdm(reader):
        # NOTE(review): the name `cord_uid` is reused for this file's first column,
        # whatever it contains — verify against the CSV.
        cord_uid, passage_text_cleaned = line
        documents.append(passage_text_cleaned)
        document_uids.append(cord_uid)
print(documents[0])

1244184it [00:01, 1230402.38it/s]

aba decides against community broadcasting licence





In [26]:
# Embed the first 10000 contrast texts (each text is encoded as a single unit), then
# project and label them with the already-fitted UMAP and clustering models.
other_p = model.encode(documents[:10000], show_progress_bar=True, batch_size=32)
other_p = umap_model.transform(other_p)
other_topics = clustering_model.predict(other_p)

Batches: 100%|██████████| 313/313 [00:02<00:00, 119.92it/s]
Epochs completed: 100%| ██████████ 100/100 [00:02]


In [27]:
# Each contrast text received exactly one cluster id, so its "topic document" is a single token.
# NOTE(review): such a row has at most one non-zero feature, so its cosine similarity to any
# training document consisting of only that topic is 1.0 — keep this in mind when comparing
# these scores with the multi-passage corpora.
other_topics = [str(topic) for topic in other_topics]

# Transform only: reuse the topic vocabulary and IDF weights fitted on the training slice.
X_test = count_vect.transform(other_topics)
X_test = tf_transformer.transform(X_test)
X_test.shape

(10000, 537)

In [44]:
# Same top-k similarity measurement in topic space, now for the contrast texts.
sim_means = batched_cosine_similarity_topks(X_train,X_test, batch_size=4096, dtype=np.float16, top_ks=top_ks)

100%|██████████| 3/3 [00:04<00:00,  1.60s/it]


In [45]:
# Report the topic-based contrast scores; the unpacking assumes `top_ks` has exactly three entries.
top_1, top_3, top_10 = sim_means
print(f"Top1: {top_1:.4f}, Top3: {top_3:.4f}, Top10: {top_10:.4f}")

Top1: 0.9883, Top3: 0.9707, Top10: 0.8813
