In [1]:
from bertopic import BERTopic
import glob
import time
import random
import re
from tqdm import tqdm
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import numpy as np
import pandas as pd
import csv
from sklearn.cluster import KMeans, BisectingKMeans
# Single seed shared by the notebook: it is handed to UMAP and to the
# BisectingKMeans clusterer further down, and seeds Python's `random` module.
# NOTE(review): numpy's global RNG is not seeded anywhere in the visible cells.
random_state = 42
#random_state = 420
random.seed(random_state)

# Input CSVs. These are absolute, machine-specific paths; adjust them before
# running the notebook elsewhere.
# Base corpus: read into `passages` / `passage_uids` by `read_cleaned`.
f_tokenized_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/0.csv"
# NOTE(review): not referenced by any visible cell -- the "other" sample is
# taken as a second slice of `passages` instead. Confirm whether that is intended.
f_tokenized_other_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/11.csv"
# Contrast corpus: read by the cell that fills `passages_contrast`.
f_tokenized_contrast_path = "/home/tfink/projects/rsa/kodicare/kodicare_framework/data/trec_covid_topic_modelling/abcnews-date-text.csv"

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
def read_cleaned(path, skip_header=False):
    """Load a two-column CSV of ``(uid, passage_text)`` rows.

    Args:
        path: Location of a comma-delimited, double-quote-quoted CSV file.
        skip_header: When true, the first row is discarded instead of being
            returned as data. Defaults to ``False`` (every row is data).

    Returns:
        A tuple ``(passages, passage_uids)`` of two parallel lists: the second
        column of every row and the first column of every row, in file order.

    Raises:
        ValueError: If a non-empty row does not have exactly two columns.
    """
    passages = []
    passage_uids = []
    # newline="" lets the csv module do its own newline handling, so line
    # breaks inside quoted fields are kept verbatim. The explicit encoding
    # keeps the result independent of the machine's locale.
    with open(path, "r", newline="", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        if skip_header:
            # The default of None keeps an empty file from raising StopIteration.
            next(reader, None)
        for row in tqdm(reader):
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to unpack.
                continue
            cord_uid, passage_text_cleaned = row
            passages.append(passage_text_cleaned)
            passage_uids.append(cord_uid)
    return passages, passage_uids
        

In [3]:
# Read the whole base corpus into memory: `passages` holds the text column and
# `passage_uids` the id column, index-aligned with each other.
passages, passage_uids = read_cleaned(f_tokenized_path)

0it [00:00, ?it/s]

1115954it [00:08, 131104.60it/s]


In [4]:
# Spot-check a single passage together with its id.
passages[50], passage_uids[50]

('in conjunction with the virus this organism produced a vigorous leukocytic reaction.',
 '04ceiyko')

In [5]:
# --- Clustering hyper-parameters ---------------------------------------------
# HDBSCAN settings (only relevant to `hdbscan_clusterer` below).
min_topic_size = 10
min_samples = None
cluster_selection_epsilon = 0.15  # 0.25 is noted as an alternative value
# Fixed number of clusters for the k-means variant. A sqrt(len(passages))
# heuristic used to be assigned here and was immediately overwritten; it also
# produced a float, which is not a valid cluster count, so it is dropped.
n_clusters = 350

# --- Dimensionality reduction --------------------------------------------------
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=random_state,
    verbose=True,
)

# --- Clusterers ------------------------------------------------------------------
# Density-based option. It is configured but NOT used for this run; it is kept
# under its own name so switching back is a one-line change below.
hdbscan_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=min_topic_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
    cluster_selection_epsilon=cluster_selection_epsilon,
    min_samples=min_samples,
)
# Centroid-based option used for this run. (A plain KMeans configuration with
# init='k-means++', n_init='auto' was left commented out as a further option.)
kmeans_clusterer = BisectingKMeans(
    n_clusters=n_clusters,
    n_init=1,
    bisecting_strategy='largest_cluster',
    random_state=random_state,
)
# `hdbscan_model` is simply the name of BERTopic's keyword for the clustering
# step. Bind the chosen clusterer explicitly instead of silently overwriting a
# variable that first held an HDBSCAN instance.
hdbscan_model = kmeans_clusterer

# --- Vectorizer --------------------------------------------------------------------
vectorizer = CountVectorizer(stop_words='english')

# --- BERTopic model ------------------------------------------------------------------
# Sentence-embedding model name; "xlm-r-bert-base-nli-stsb-mean-tokens" and
# "all-mpnet-base-v2" are noted as alternatives.
embedding_model = "paraphrase-multilingual-MiniLM-L12-v2"
# NOTE(review): `min_topic_size` is still passed through as before; it
# presumably has no effect when a custom clusterer is supplied -- confirm
# against the BERTopic documentation.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    language="english",
    calculate_probabilities=False,
    min_topic_size=min_topic_size,
    verbose=True,
)

# Fit on the first 100,000 passages only; a later cell assigns the following
# 100,000 to the fitted topics for comparison.
topics, probabilities = topic_model.fit_transform(passages[:100000])

Batches: 100%|██████████| 3125/3125 [03:01<00:00, 17.21it/s]
2023-07-18 15:21:51,770 - BERTopic - Transformed documents to Embeddings


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, random_state=42, verbose=True)
Tue Jul 18 15:21:51 2023 Construct fuzzy simplicial set
Tue Jul 18 15:21:51 2023 Finding Nearest Neighbors
Tue Jul 18 15:21:51 2023 Building RP forest with 21 trees
Tue Jul 18 15:21:56 2023 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	 3  /  17
	 4  /  17
	 5  /  17
	 6  /  17
	 7  /  17
	Stopping threshold met -- exiting after 7 iterations
Tue Jul 18 15:22:17 2023 Finished Nearest Neighbor Search
Tue Jul 18 15:22:20 2023 Construct embedding


Epochs completed: 100%| ██████████ 200/200 [00:34]


Tue Jul 18 15:23:03 2023 Finished embedding


2023-07-18 15:23:03,485 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2023-07-18 15:23:04,208 - BERTopic - Clustered reduced embeddings


In [6]:
# Get the list of topics
# Per-topic summary table from the fitted model; the recorded output shows
# Topic, Count and Name columns. Used below to build `topic2desc`.
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name
0,0,499,0_influenza_sialic_hemagglutinin_erythrocytes
1,1,496,1_flavivirus_viruses_genera_flaviviruses
2,2,493,2_pulmonary_lung_ventilation_inhalation
3,3,491,3_coronaviruses_coronavirus_229e_oc43
4,4,483,4_liver_hepatic_aaf_epoxide
...,...,...,...
345,345,20,345_server_file_embl_nucleotide
346,346,20,346_turner_decoction_peoples_sore
347,347,18,347_dhori_pbl_segment_influenza
348,348,17,348_thalidomide_gvhd_teratogenicity_immunosupp...


In [7]:
# Lookup from topic id to its generated label, built from the `Topic` and
# `Name` columns of `topic_info`. The outlier bucket (id -1) is left out.
topic2desc = {
    topic_id: topic_name
    for topic_id, topic_name in zip(
        topic_info["Topic"].tolist(), topic_info["Name"].tolist()
    )
    if topic_id != -1
}

In [8]:
# Assign the next 100,000 passages of the same corpus (`passages`) to the
# already-fitted topics. The second return value is discarded.
other_topics, _ = topic_model.transform(passages[100000:200000])

Batches: 100%|██████████| 3125/3125 [03:00<00:00, 17.28it/s]


Tue Jul 18 15:26:24 2023 Worst tree score: 0.71285000
Tue Jul 18 15:26:24 2023 Mean tree score: 0.71946571
Tue Jul 18 15:26:24 2023 Best tree score: 0.72525000
Tue Jul 18 15:26:26 2023 Forward diversification reduced edges from 1500000 to 610481
Tue Jul 18 15:26:29 2023 Reverse diversification reduced edges from 610481 to 610481
Tue Jul 18 15:26:31 2023 Degree pruning reduced edges from 774636 to 770039
Tue Jul 18 15:26:31 2023 Resorting data and graph based on tree order
Tue Jul 18 15:26:31 2023 Building and compiling search function


Epochs completed: 100%| ██████████ 30/30 [00:03]
2023-07-18 15:28:32,895 - BERTopic - Reduced dimensionality
2023-07-18 15:28:33,002 - BERTopic - Predicted clusters


In [9]:
def get_topic_proportions(topics, n_topics=None):
    """Turn per-document topic assignments into a topic distribution.

    Args:
        topics: Iterable of topic ids, one per document. The id ``-1`` marks an
            outlier document; every other id must lie in ``range(n_topics)``.
        n_topics: Number of regular topics. Defaults to ``len(topic2desc)``,
            i.e. the notebook-level topic catalogue.

    Returns:
        A tuple ``(topic_proportions, outlier_proportion)``:

        * ``topic_proportions`` maps every topic id in ``range(n_topics)`` to
          its share of the NON-outlier documents (all ``0.0`` when there are
          no non-outlier documents).
        * ``outlier_proportion`` is the share of outliers among ALL documents
          (``0.0`` when ``topics`` is empty).

    Raises:
        KeyError: If a topic id other than ``-1`` is outside ``range(n_topics)``.
    """
    if n_topics is None:
        n_topics = len(topic2desc)

    topic_counts = {topic: 0 for topic in range(n_topics)}
    outlier_count = 0
    total_docs = 0
    for topic in topics:
        total_docs += 1
        if topic == -1:
            outlier_count += 1
            continue
        topic_counts[topic] += 1

    non_outlier_docs = total_docs - outlier_count
    if non_outlier_docs:
        topic_proportions = {
            topic: count / non_outlier_docs for topic, count in topic_counts.items()
        }
    else:
        # Nothing was assigned to a regular topic: report an all-zero
        # distribution instead of dividing by zero.
        topic_proportions = {topic: 0.0 for topic in topic_counts}

    outlier_proportion = outlier_count / total_docs if total_docs else 0.0
    return topic_proportions, outlier_proportion


def get_intersection(topic_proportions_a, topic_proportions_b):
    """Return the overlap of two topic distributions.

    For every topic of ``topic_proportions_a`` the smaller of the two shares is
    taken, and these minima are added up. Topics that appear only in
    ``topic_proportions_b`` are ignored; a topic missing from
    ``topic_proportions_b`` raises ``KeyError``.
    """
    overlap = 0
    for topic, share_a in topic_proportions_a.items():
        share_b = topic_proportions_b[topic]
        overlap += min(share_a, share_b)
    return overlap


def get_MAE(topic_proportions_a, topic_proportions_b):
    """Return the mean absolute difference between two topic distributions.

    The absolute share differences are added up over the topics of
    ``topic_proportions_a`` and divided by the number of those topics. An empty
    ``topic_proportions_a`` therefore raises ``ZeroDivisionError``.
    """
    n_topics = len(topic_proportions_a)
    abs_error_sum = 0
    for topic, share_a in topic_proportions_a.items():
        abs_error_sum += abs(share_a - topic_proportions_b[topic])
    return abs_error_sum / n_topics


def get_absolute_error(topic_proportions_a, topic_proportions_b):
    """Return the summed absolute difference between two topic distributions.

    Only the topics of ``topic_proportions_a`` are visited; each must also be a
    key of ``topic_proportions_b``.
    """
    abs_error_sum = 0
    for topic, share_a in topic_proportions_a.items():
        abs_error_sum += abs(share_a - topic_proportions_b[topic])
    return abs_error_sum


def calculate_topic_intersection(base_topics, other_topics):
    """Print the outlier shares of two assignment lists and their topic overlap.

    Both lists are converted with ``get_topic_proportions``; the overlap comes
    from ``get_intersection``. Nothing is returned.
    """
    base_shares, base_outliers = get_topic_proportions(base_topics)
    other_shares, other_outliers = get_topic_proportions(other_topics)
    overlap = get_intersection(base_shares, other_shares)
    print(f"base_outliers: {base_outliers:.2%}, other_outliers: {other_outliers:.2%}, intersection: {overlap:.2%}")

In [10]:
# Topic distributions of the fitted sample (`topics`) and of the second sample
# that was only transformed (`other_topics`). These names are reused by the
# plotting cell and by the contrast comparison further down.
base_topic_proportions, base_outlier_proportion = get_topic_proportions(topics)
other_topic_proportions, other_outlier_proportion = get_topic_proportions(other_topics)

# Similarity / distance between the two distributions:
#   intersection - sum of per-topic minima
#   mae          - absolute differences averaged over the topics
#   ae           - absolute differences summed over the topics
intersection = get_intersection(base_topic_proportions, other_topic_proportions)
mae = get_MAE(base_topic_proportions, other_topic_proportions)
ae = get_absolute_error(base_topic_proportions, other_topic_proportions)
print(f"base_outliers: {base_outlier_proportion:.2%}, other_outliers: {other_outlier_proportion:.2%}, intersection: {intersection:.2%}, MAE: {mae:.4f}, AE: {ae:.4f}")

base_outliers: 0.00%, other_outliers: 0.00%, intersection: 74.06%, MAE: 0.0015, AE: 0.5188


In [11]:
# One row per bar group: first the outlier bucket, then every topic id in
# range(len(topic2desc)). Note the outlier value is a share of ALL documents,
# whereas the topic values are shares of the non-outlier documents (see
# get_topic_proportions).
df = pd.DataFrame({
    "Topics": ["outliers"] + [str(topic) for topic in range(len(topic2desc))],
    "Topics_Names": ["outliers"] + [topic2desc[topic] for topic in range(len(topic2desc))],
    'Base': [base_outlier_proportion]+[base_topic_proportions[topic] for topic in range(len(topic2desc))],
    'Other': [other_outlier_proportion]+[other_topic_proportions[topic] for topic in range(len(topic2desc))],
})

# Grouped bar chart: base vs. other share per topic, topic label shown on hover.
fig = px.bar(
    data_frame = df,
    x = "Topics",
    y = ["Base","Other"],
    opacity = 0.9,
    orientation = "v",
    barmode = 'group',
    title='Topic Proportions',
    hover_data=["Topics_Names"],
    color_discrete_sequence=px.colors.qualitative.D3
    #color_discrete_sequence=px.colors.sequential.Inferno_r
)
# Fixed y-range so that this chart and the contrast chart share one scale;
# any bar above 0.035 is cut off at the top.
fig.update_yaxes(range=[0.0, 0.035])
# Written relative to the notebook's working directory.
fig.write_html("trec_covid_small.html")

In [12]:
# Load the contrast corpus: a two-column CSV with one header row.
# The ids go into their own list. Reusing the name `passage_uids` here would
# overwrite the base-corpus ids loaded earlier and leave them out of sync with
# `passages`.
passages_contrast = []
passage_uids_contrast = []
# newline="" lets the csv module handle line breaks inside quoted fields; the
# explicit encoding keeps the read independent of the machine's locale.
with open(f_tokenized_contrast_path, "r", newline="", encoding="utf-8") as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    # Skip the header row; the default keeps an empty file from raising.
    next(reader, None)
    for row in tqdm(reader):
        if not row:
            # csv.reader yields [] for blank lines.
            continue
        # NOTE(review): the first column is presumably a publish date (per the
        # file name "abcnews-date-text") -- confirm against the data.
        row_id, row_text = row
        passages_contrast.append(row_text)
        passage_uids_contrast.append(row_id)
print(passages_contrast[0])

0it [00:00, ?it/s]

1244184it [00:00, 1279913.39it/s]

aba decides against community broadcasting licence





In [13]:
# Assign the first 100,000 contrast passages to the topics fitted on the base
# corpus. The second return value is discarded.
contrast_topics, _ = topic_model.transform(passages_contrast[:100000])

Batches: 100%|██████████| 3125/3125 [00:24<00:00, 127.77it/s]
Epochs completed: 100%| ██████████ 30/30 [00:05]
2023-07-18 15:31:03,554 - BERTopic - Reduced dimensionality
2023-07-18 15:31:03,631 - BERTopic - Predicted clusters


In [14]:
# Topic distribution of the contrast corpus under the base topic model.
contrast_topic_proportions, contrast_outlier_proportion = get_topic_proportions(contrast_topics)

# Same three measures as for the base/other comparison, now base vs. contrast.
# `intersection`, `mae` and `ae` are rebound here.
intersection = get_intersection(base_topic_proportions, contrast_topic_proportions)
mae = get_MAE(base_topic_proportions, contrast_topic_proportions)
ae = get_absolute_error(base_topic_proportions, contrast_topic_proportions)
# The second outlier figure must be the contrast corpus' own share. The
# previous version printed `other_outlier_proportion`, i.e. the value from the
# earlier base/other comparison.
print(f"base_outliers: {base_outlier_proportion:.2%}, other_outliers: {contrast_outlier_proportion:.2%}, intersection: {intersection:.2%}, MAE: {mae:.4f}, AE: {ae:.4f}")

base_outliers: 0.00%, other_outliers: 0.00%, intersection: 57.84%, MAE: 0.0024, AE: 0.8432


In [15]:
# Same table layout as the base/other chart, with the contrast corpus in the
# 'Other' column. `df` and `fig` are rebound here. Row 0 is the outlier bucket
# (share of ALL documents); the remaining rows are shares of the non-outlier
# documents (see get_topic_proportions).
df = pd.DataFrame({
    "Topics": ["outliers"] + [str(topic) for topic in range(len(topic2desc))],
    "Topics_Names": ["outliers"] + [topic2desc[topic] for topic in range(len(topic2desc))],
    'Base': [base_outlier_proportion]+[base_topic_proportions[topic] for topic in range(len(topic2desc))],
    'Other': [contrast_outlier_proportion]+[contrast_topic_proportions[topic] for topic in range(len(topic2desc))],
})

# Grouped bar chart: base vs. contrast share per topic, topic label shown on hover.
fig = px.bar(
    data_frame = df,
    x = "Topics",
    y = ["Base","Other"],
    opacity = 0.9,
    orientation = "v",
    barmode = 'group',
    title='Topic Proportions',
    hover_data=["Topics_Names"],
    color_discrete_sequence=px.colors.qualitative.D3
    #color_discrete_sequence=px.colors.sequential.Inferno_r
)
# Same fixed y-range as the base/other chart so both can be compared by eye;
# any bar above 0.035 is cut off at the top.
fig.update_yaxes(range=[0.0, 0.035])
# Written relative to the notebook's working directory.
fig.write_html("trec_covid_small_contrast.html")