In [1]:
import random
from tqdm import tqdm
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import numpy as np
import pandas as pd
import csv
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans, BisectingKMeans
from sklearn.mixture import GaussianMixture
random_state = 42
#random_state = 420
random.seed(random_state)

f_tokenized_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/0.csv"
f_tokenized_other_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/11.csv"
f_tokenized_contrast_path = "/home/tfink/projects/rsa/kodicare/kodicare_framework/data/trec_covid_topic_modelling/abcnews-date-text.csv"

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
def read_cleaned(path):
    with open(path, "r") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        passages = []
        passage_uids = []
        for line in tqdm(reader):
            cord_uid, passage_text_cleaned = line
            passages.append(passage_text_cleaned)
            passage_uids.append(cord_uid)
        return passages, passage_uids
        

In [3]:
passages, passage_uids = read_cleaned(f_tokenized_path)

1115954it [00:08, 125403.63it/s]


In [4]:
passages[50], passage_uids[50]

('in conjunction with the virus this organism produced a vigorous leukocytic reaction.',
 '04ceiyko')

In [5]:
embedding_model = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(embedding_model)
passages_encoded = model.encode(passages[:100000], show_progress_bar=True, batch_size=32)

Batches: 100%|██████████| 3125/3125 [03:03<00:00, 17.03it/s]


In [69]:
# dimensionality reduction
umap_model = umap.UMAP(n_neighbors=15, 
                       n_components=5, 
                       min_dist=0.0, 
                       metric='cosine', 
                       random_state=random_state, 
                       verbose=True)
embedding = umap_model.fit_transform(passages_encoded)

UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, random_state=42, verbose=True)
Wed Aug  2 17:32:39 2023 Construct fuzzy simplicial set
Wed Aug  2 17:32:39 2023 Finding Nearest Neighbors
Wed Aug  2 17:32:39 2023 Building RP forest with 21 trees
Wed Aug  2 17:32:41 2023 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	 3  /  17
	 4  /  17
	 5  /  17
	 6  /  17
	 7  /  17
	Stopping threshold met -- exiting after 7 iterations
Wed Aug  2 17:32:44 2023 Finished Nearest Neighbor Search
Wed Aug  2 17:32:44 2023 Construct embedding


Epochs completed: 100%| ██████████ 200/200 [00:32]


Wed Aug  2 17:33:25 2023 Finished embedding


In [70]:
n_clusters = int(np.sqrt(len(embedding)))
n_clusters = 350

clustering_model = BisectingKMeans(n_clusters=n_clusters, 
                       n_init=1,
                       bisecting_strategy='largest_cluster',
                       random_state=random_state)
topics = clustering_model.fit_predict(embedding)

# clustering_model = GaussianMixture(n_components=n_clusters, 
#                        n_init=1,
#                        init_params='k-means++',
#                        covariance_type='diag',
#                        reg_covar=1e-3,
#                        random_state=random_state)
# topics = clustering_model.fit_predict(embedding)

In [71]:
from scipy.special import softmax

print(passages[50:51])
print(clustering_model.predict(embedding[50:51]))
t = clustering_model.transform(embedding[50:51])
ts = softmax(np.exp(-t), axis=-1)
print(ts[:,np.argmax(ts)])
print(ts[:,0])

['in conjunction with the virus this organism produced a vigorous leukocytic reaction.']
[313]
[0.00530461]
[0.00271237]


In [63]:
other_p = model.encode(passages[100000:200000], show_progress_bar=True, batch_size=32)
other_p = umap_model.transform(other_p)
other_topics = clustering_model.predict(other_p)

Batches: 100%|██████████| 3125/3125 [03:03<00:00, 16.99it/s]


Wed Aug  2 17:17:40 2023 Worst tree score: 0.71285000
Wed Aug  2 17:17:40 2023 Mean tree score: 0.71946571
Wed Aug  2 17:17:40 2023 Best tree score: 0.72525000
Wed Aug  2 17:17:42 2023 Forward diversification reduced edges from 1500000 to 610481
Wed Aug  2 17:17:46 2023 Reverse diversification reduced edges from 610481 to 610481
Wed Aug  2 17:17:48 2023 Degree pruning reduced edges from 774636 to 770039
Wed Aug  2 17:17:48 2023 Resorting data and graph based on tree order
Wed Aug  2 17:17:48 2023 Building and compiling search function


Epochs completed: 100%| ██████████ 30/30 [00:03]


In [64]:
def get_topic_proportions(topics):
    # first calculate base topic counts and proportions
    outlier_count = 0
    topic_counts = {topic:0 for topic in range(n_clusters)}
    topic_proportions = {}
    non_outlier_docs = 0
    for idx, topic in enumerate(topics):
        if topic == -1:
            outlier_count += 1
            continue
        non_outlier_docs += 1
        topic_counts[topic] += 1
    for topic in range(n_clusters):
        topic_proportions[topic] = topic_counts[topic] / non_outlier_docs
    outlier_proportion = outlier_count / len(topics)
    return topic_proportions, outlier_proportion


def get_intersection(topic_proportions_a, topic_proportions_b):
    total = 0
    for topic in range(n_clusters):
        total += min(topic_proportions_a[topic], topic_proportions_b[topic])
    return total


def calculate_topic_intersection(base_topics, other_topics):
    base_topic_proportions, base_outlier_proportion = get_topic_proportions(base_topics)
    other_topic_proportions, other_outlier_proportion = get_topic_proportions(other_topics)

    # calc
    intersection = get_intersection(base_topic_proportions, other_topic_proportions)
    print(f"base_outliers: {base_outlier_proportion:.2%}, other_outliers: {other_outlier_proportion:.2%}, intersection: {intersection:.2%}")

In [65]:
base_topic_proportions, base_outlier_proportion = get_topic_proportions(topics)
other_topic_proportions, other_outlier_proportion = get_topic_proportions(other_topics)

# calc
intersection = get_intersection(base_topic_proportions, other_topic_proportions)
print(f"base_outliers: {base_outlier_proportion:.2%}, other_outliers: {other_outlier_proportion:.2%}, intersection: {intersection:.2%}")

base_outliers: 0.00%, other_outliers: 0.00%, intersection: 72.60%


In [None]:
df = pd.DataFrame({
    "Topics": ["outliers"] + [str(topic) for topic in range(n_clusters)],
    'Base': [base_outlier_proportion]+[base_topic_proportions[topic] for topic in range(n_clusters)],
    'Other': [other_outlier_proportion]+[other_topic_proportions[topic] for topic in range(n_clusters)],
})

fig = px.bar(
    data_frame = df,
    x = "Topics",
    y = ["Base","Other"],
    opacity = 0.9,
    orientation = "v",
    barmode = 'group',
    title='Topic Proportions',
    color_discrete_sequence=px.colors.qualitative.D3
    #color_discrete_sequence=px.colors.sequential.Inferno_r
)
fig.update_yaxes(range=[0.0, 0.035])
fig.write_html("trec_covid_small.html")

In [None]:
with open(f_tokenized_contrast_path, "r") as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    passages_contrast = []
    passage_uids = []
    # skip first line
    reader.__next__()
    for line in tqdm(reader):
        cord_uid, passage_text_cleaned = line
        passages_contrast.append(passage_text_cleaned)
        passage_uids.append(cord_uid)
print(passages_contrast[0])

0it [00:00, ?it/s]

1244184it [00:00, 1279913.39it/s]

aba decides against community broadcasting licence





In [None]:
contrast_topics, _ = topic_model.transform(passages_contrast[:100000])

Batches: 100%|██████████| 3125/3125 [00:24<00:00, 127.77it/s]
Epochs completed: 100%| ██████████ 30/30 [00:05]
2023-07-18 15:31:03,554 - BERTopic - Reduced dimensionality
2023-07-18 15:31:03,631 - BERTopic - Predicted clusters


In [None]:
contrast_topic_proportions, contrast_outlier_proportion = get_topic_proportions(contrast_topics)

# calc
intersection = get_intersection(base_topic_proportions, contrast_topic_proportions)
mae = get_MAE(base_topic_proportions, contrast_topic_proportions)
ae = get_absolute_error(base_topic_proportions, contrast_topic_proportions)
print(f"base_outliers: {base_outlier_proportion:.2%}, other_outliers: {other_outlier_proportion:.2%}, intersection: {intersection:.2%}, MAE: {mae:.4f}, AE: {ae:.4f}")

base_outliers: 0.00%, other_outliers: 0.00%, intersection: 57.84%, MAE: 0.0024, AE: 0.8432


In [None]:
df = pd.DataFrame({
    "Topics": ["outliers"] + [str(topic) for topic in range(len(topic2desc))],
    "Topics_Names": ["outliers"] + [topic2desc[topic] for topic in range(len(topic2desc))],
    'Base': [base_outlier_proportion]+[base_topic_proportions[topic] for topic in range(len(topic2desc))],
    'Other': [contrast_outlier_proportion]+[contrast_topic_proportions[topic] for topic in range(len(topic2desc))],
})

fig = px.bar(
    data_frame = df,
    x = "Topics",
    y = ["Base","Other"],
    opacity = 0.9,
    orientation = "v",
    barmode = 'group',
    title='Topic Proportions',
    hover_data=["Topics_Names"],
    color_discrete_sequence=px.colors.qualitative.D3
    #color_discrete_sequence=px.colors.sequential.Inferno_r
)
fig.update_yaxes(range=[0.0, 0.035])
fig.write_html("trec_covid_small_contrast.html")