In [1]:
from bertopic import BERTopic
import glob
import time
import random
import re
from tqdm import tqdm
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
import csv
# Single seed for the notebook: seeds Python's `random` module here and is
# passed to UMAP as `random_state` in the model-configuration cell.
random_state = 42
random.seed(random_state)

# Input files. `f_tokenized_path` is parsed by read_cleaned() as a two-column
# CSV (cord_uid, cleaned passage text).
# NOTE(review): these are absolute, machine-specific paths — consider a
# configurable data directory.
f_tokenized_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/0.txt"
# NOTE(review): not referenced by any cell visible here; presumably has the
# same format as the file above — confirm.
f_tokenized_other_path = "/home/tfink/data/kodicare/trec-covid/dtc_evolving_bert/11.txt"

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
def read_cleaned(path):
    """Read a two-column CSV file of cleaned passages.

    Every non-empty row must have exactly two fields, ``cord_uid`` followed by
    the cleaned passage text (comma delimited, ``"`` as quote character).

    Args:
        path: Path of the CSV file to read.

    Returns:
        A tuple ``(passages, passage_uids)`` of two parallel lists in file
        order: the passage texts and the cord_uid belonging to each passage.

    Raises:
        ValueError: If a non-empty row does not have exactly two fields.
    """
    passages = []
    passage_uids = []
    # newline="" hands newline processing to the csv module (required by its
    # documentation so that quoted fields containing line breaks parse
    # correctly). The encoding is explicit so the result does not depend on
    # the locale. NOTE(review): assumes the file is UTF-8 — confirm.
    with open(path, "r", newline="", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        for row in tqdm(reader):
            if not row:
                # csv.reader yields an empty list for a blank line; skip it
                # instead of failing on the unpacking below.
                continue
            if len(row) != 2:
                raise ValueError(
                    f"{path}: line {reader.line_num}: expected 2 fields "
                    f"(cord_uid, passage text), got {len(row)}"
                )
            cord_uid, passage_text_cleaned = row
            passages.append(passage_text_cleaned)
            passage_uids.append(cord_uid)
    return passages, passage_uids
        

In [3]:
# Load every passage text and its cord_uid from f_tokenized_path into two
# parallel lists.
passages, passage_uids = read_cleaned(f_tokenized_path)

0it [00:00, ?it/s]

598761it [00:08, 66676.03it/s]


In [4]:
# Spot-check one record: the passage at index 50 together with its cord_uid.
# NOTE(review): in the recorded output this passage is noisy German text,
# while the CountVectorizer configured later uses English stop words only.
passages[50], passage_uids[50]

('es karm jedoch nicht aufgabe dieses bersiehtsberichtes sein, auf die ein lheiten der konservativen behandlung der niereninsuifizienz einzugehen. dafiir sei ant die ausgezeichneten i)arstellungen der klinischen behandlung akuter und chronischer ix iereninsuffizicnz yon merr l a , elki on -i)a owsk 1 oder strauss-ra sz s hingewiesen. es ist uns ein besonderes anliegen, hier nochmals die wichtigkeit sorgfiiltiger krankenpflege in den tagen und wochen nach der dialysebehandlung zu betonen. fortschreitende besserung und die endgiiltige wiederherstellung hanger in betrachtlichem mabe davon ab. axzte und pflegepersonal, die patienten mit akuter oder chronischer niereninsuffizienz betreut babel, wissen, da2 diese besonders empfanglich fiir bestimmte komplikationen sind. lungenatelektasen, lastige mundhhhlenentziindungen oder die mhglichkeit yon decubitusgeschwtiren seien nur als beispiele erwi hnt. es ist deshalb nnerlal lich z.b. dutch haufiges wenden des kranken, stiindliche atemiibungen, 

In [5]:
# ---- Clustering hyper-parameters (used by HDBSCAN and BERTopic below) ----
min_topic_size = 5
min_samples = None
cluster_selection_epsilon = 0.25

# ---- Sentence-embedding model name handed to BERTopic ----
# (disabled alternative: "xlm-r-bert-base-nli-stsb-mean-tokens")
embedding_model = "all-mpnet-base-v2"

# ---- CountVectorizer with English stop words, passed as vectorizer_model ----
vectorizer = CountVectorizer(stop_words='english')

# ---- Dimensionality reduction: 5 components, cosine metric, fixed seed ----
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=random_state,
    verbose=True,
)

# ---- Clustering on the reduced embeddings ----
# prediction_data=True is requested at construction time; the fitted model is
# later applied to unseen passages via topic_model.transform().
# (disabled alternative: hdbscan_model = None)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=min_topic_size,
    min_samples=min_samples,
    cluster_selection_epsilon=cluster_selection_epsilon,
    cluster_selection_method='eom',
    metric='euclidean',
    prediction_data=True,
)

# ---- Assemble the BERTopic pipeline from the parts above ----
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    language="english",
    calculate_probabilities=False,
    min_topic_size=min_topic_size,
    verbose=True,
)

# ---- Fit on the first 20,000 passages and keep their topic assignments ----
topics, probabilities = topic_model.fit_transform(passages[:20000])

Batches: 100%|██████████| 625/625 [06:33<00:00,  1.59it/s]
2023-07-12 17:44:35,716 - BERTopic - Transformed documents to Embeddings


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, random_state=42, verbose=True)
Wed Jul 12 17:44:35 2023 Construct fuzzy simplicial set
Wed Jul 12 17:44:35 2023 Finding Nearest Neighbors
Wed Jul 12 17:44:35 2023 Building RP forest with 12 trees
Wed Jul 12 17:44:39 2023 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	Stopping threshold met -- exiting after 5 iterations
Wed Jul 12 17:44:53 2023 Finished Nearest Neighbor Search
Wed Jul 12 17:44:56 2023 Construct embedding


Epochs completed: 100%| ██████████ 200/200 [00:06]
2023-07-12 17:45:03,961 - BERTopic - Reduced dimensionality


Wed Jul 12 17:45:03 2023 Finished embedding
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disab

2023-07-12 17:45:05,212 - BERTopic - Clustered reduced embeddings


In [6]:
# Display the fitted model's per-topic summary table (the recorded output has
# Topic, Count and Name columns). Topic -1 is the id treated as "outlier" by
# get_topic_proportions() further down.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,342,-1_cells_ml_virus_10
1,0,5126,0_rna_mhv_jhm_et
2,1,1922,1_rotavirus_calves_diarrhea_bovine
3,2,618,2_229e_cf_oc43_rises
4,3,583,3_influenza_interferon_volunteers_antiviral
...,...,...,...
228,227,6,227_fipv_renografin_hyperimmune_79
229,228,6,228_lipoproteins_apoc_apoa_vldls
230,229,6,229_der_capi_zellen_die
231,230,6,230_prcv_sdav_glands_lacrimal


In [7]:
# Assign topics to passages the model was not fitted on (indices 20000-29999;
# the fit used passages[:20000]). The second return value is discarded.
other_topics, _ = topic_model.transform(passages[20000:30000])

Batches: 100%|██████████| 313/313 [03:21<00:00,  1.55it/s]


Wed Jul 12 17:48:34 2023 Worst tree score: 0.72090000
Wed Jul 12 17:48:34 2023 Mean tree score: 0.73215833
Wed Jul 12 17:48:34 2023 Best tree score: 0.74185000
Wed Jul 12 17:48:36 2023 Forward diversification reduced edges from 300000 to 100829
Wed Jul 12 17:48:39 2023 Reverse diversification reduced edges from 100829 to 100829
Wed Jul 12 17:48:42 2023 Degree pruning reduced edges from 110202 to 110172
Wed Jul 12 17:48:42 2023 Resorting data and graph based on tree order
Wed Jul 12 17:48:42 2023 Building and compiling search function


Epochs completed: 100%| ██████████ 100/100 [00:01]
2023-07-12 17:48:57,669 - BERTopic - Reduced dimensionality
2023-07-12 17:48:58,144 - BERTopic - Predicted clusters


In [8]:
def get_topic_proportions(base_topics, other_topics, model=None):
    """Print outlier rates and the overlap between two topic distributions.

    Topic id ``-1`` is treated as the outlier topic. For each input the
    outlier share is computed over all documents, while the per-topic
    proportions are computed over the non-outlier documents only. The overlap
    is whatever ``get_intersection`` returns for the two proportion dicts.

    Empty inputs, or inputs that consist of outliers only, yield proportions
    of 0 instead of raising ``ZeroDivisionError``. Topics that occur only in
    ``other_topics`` are counted instead of raising ``KeyError``.

    Args:
        base_topics: Iterable of topic ids (one per document) of the
            reference set.
        other_topics: Iterable of topic ids (one per document) of the set
            compared against the reference.
        model: Object whose ``get_topic(topic)`` returns a sequence of
            ``(term, score)`` pairs; the first five terms form the topic
            label. Defaults to the notebook-level ``topic_model``.

    Returns:
        None. The three percentages are printed.
    """
    from collections import Counter

    if model is None:
        # Backward-compatible default: the fitted model defined at notebook level.
        model = topic_model

    def _label(topic):
        # Label format: "<topic id>_<up to five top terms joined by '_'>".
        topic_desc = model.get_topic(topic)
        return f"{topic}_{'_'.join([t for t, p in topic_desc[:5]])}"

    def _count_by_label(doc_topics):
        # Count documents per topic id first, so the model is queried once per
        # distinct topic rather than once per document.
        id_counts = Counter(doc_topics)
        outliers = id_counts.pop(-1, 0)
        clustered = sum(id_counts.values())
        label_counts = {_label(topic): count for topic, count in id_counts.items()}
        return label_counts, outliers, clustered

    def _ratio(numerator, denominator):
        # Guard against empty input / all-outlier input.
        return numerator / denominator if denominator else 0.0

    # first calculate base topic counts and proportions
    base_topic_counts, base_outlier_count, base_docs = _count_by_label(base_topics)
    base_topic_proportions = {
        topic_desc: _ratio(count, base_docs)
        for topic_desc, count in base_topic_counts.items()
    }
    base_outlier_proportion = _ratio(base_outlier_count, base_docs + base_outlier_count)

    # then calculate other topic counts and proportions; every base topic gets
    # an entry (0 when unseen) and topics unknown to the base set are added.
    seen_counts, other_outlier_count, other_docs = _count_by_label(other_topics)
    other_topic_counts = {topic_desc: 0 for topic_desc in base_topic_counts}
    other_topic_counts.update(seen_counts)
    other_topic_proportions = {
        topic_desc: _ratio(count, other_docs)
        for topic_desc, count in other_topic_counts.items()
    }
    other_outlier_proportion = _ratio(other_outlier_count, other_docs + other_outlier_count)

    # overlap of the two non-outlier topic distributions
    intersection = get_intersection(base_topic_proportions, other_topic_proportions)
    print(f"base_outliers: {base_outlier_proportion:.2%}, other_outliers: {other_outlier_proportion:.2%}, intersection: {intersection:.2%}")


def get_intersection(topic_proportions_a, topic_proportions_b):
    """Return the histogram intersection of two topic-proportion mappings.

    For every key of ``topic_proportions_a`` the smaller of the two
    proportions is taken, and these minima are summed. A key that is missing
    from ``topic_proportions_b`` counts as proportion 0 there instead of
    raising ``KeyError``. Keys present only in ``topic_proportions_b`` do not
    contribute.

    Args:
        topic_proportions_a: Mapping from topic label to proportion.
        topic_proportions_b: Mapping from topic label to proportion.

    Returns:
        The sum of the per-key minima; ``0`` when ``topic_proportions_a`` is
        empty.
    """
    return sum(
        min(proportion_a, topic_proportions_b.get(topic_desc, 0))
        for topic_desc, proportion_a in topic_proportions_a.items()
    )

In [9]:
# Compare the topic distribution of the fit slice (`topics`) with that of the
# held-out slice (`other_topics`).
# NOTE(review): the recorded run reports 82.36% outliers for the held-out
# slice versus 1.71% for the fit slice; the intersection is computed on
# non-outlier documents only, so it rests on a small share of the held-out
# passages — worth investigating before interpreting it.
get_topic_proportions(topics, other_topics)

base_outliers: 1.71%, other_outliers: 82.36%, intersection: 22.94%
