# üß¨ Discovering Emerging Topics in Drug Discovery Research using BERTopic

## 1Ô∏è‚É£Import dependencies

In [1]:
import os
import re, spacy
import pandas as pd
import nbformat
import IPython
import json
import matplotlib.pyplot as plt
import random
import numpy as np
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

from wordcloud import WordCloud
from Bio import Entrez, Medline
from bertopic import BERTopic
from transformers import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

## 2Ô∏è‚É£ Data Collection
I used PubMed  to fetch abstracts on topics like 'drug discovery', 'AI drug design', and 'molecular docking'.
Store as `data/raw_publications.csv`. 

In [None]:
Entrez.email = "paulafredrick26@gmail.com"  # already set
# 1. Search PubMed
handle = Entrez.esearch(
    db="pubmed",
    term=query,
    retmax=max_results,
    sort="relevance"
)
search_results = Entrez.read(handle)
handle.close()

pmids = search_results["IdList"]
print("Number of PMIDs found:", len(pmids))

# 2. Fetch details
handle = Entrez.efetch(
    db="pubmed",
    id=",".join(pmids),
    rettype="medline",
    retmode="text"
)
records = list(Medline.parse(handle))
handle.close()

# 3. Build DataFrame
rows = []
for rec in records:
    abstract = rec.get("AB", "")
    if not abstract:
        continue
    rows.append({
        "pmid": rec.get("PMID", ""),
        "title": rec.get("TI", ""),
        "abstract": abstract,
        "journal": rec.get("JT", ""),
        "year": rec.get("DP", "")[:4]
    })

df = pd.DataFrame(rows)
print(df.shape)
df.head()
    
# 4. Save CSV
os.makedirs("data", exist_ok=True)
df.to_csv("data/raw_publications.csv", index=False)
print("Saved to data/raw_publications.csv")

Number of PMIDs found: 1000
(976, 5)
Saved to data/raw_publications.csv


## 3Ô∏è‚É£ Data Cleaning & Preprocessing

In [3]:
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    text = str(text)
    # collapse whitespace
    text = re.sub(r"\s+", " ", text)
    # remove punctuation, make lowercase
    text = re.sub(r"[^\w\s]", "", text.lower())
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha
    ]
    return " ".join(tokens)

### Start from Here!!

In [7]:
df = pd.read_csv("data/raw_publications.csv")
df["clean_text"] = df["abstract"].apply(clean_text)
df.head()

Unnamed: 0,pmid,title,abstract,journal,year,clean_text
0,31487867,Molecular Docking: Shifting Paradigms in Drug ...,Molecular docking is an established in silico ...,International journal of molecular sciences,2019,molecular docking establish silico structureba...
1,26205061,Molecular docking and structure-based drug des...,Pharmaceutical research has successfully incor...,"Molecules (Basel, Switzerland)",2015,pharmaceutical research successfully incorpora...
2,38594926,The Art and Science of Molecular Docking.,Molecular docking has become an essential part...,Annual review of biochemistry,2024,molecular docking essential structural biologi...
3,34560276,Machine-learning methods for ligand-protein mo...,Artificial intelligence (AI) is often presente...,Drug discovery today,2022,artificial intelligence ai present new industr...
4,34147204,Use of molecular docking computational tools i...,Molecular docking has become an important comp...,Progress in medicinal chemistry,2021,molecular docking important component drug dis...


## 4Ô∏è‚É£ Embedding Generation

In [5]:
import torch
torch.manual_seed(SEED)
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(
    df['clean_text'], 
    show_progress_bar=True, 
    convert_to_numpy=True
)

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

In [6]:
# Saving the embeddings
np.save("data/embeddings.npy", embeddings)

In [5]:
# Load the embeddings
embeddings = np.load("data/embeddings.npy")

## 5Ô∏è‚É£ Topic Modelling with BERTopic

In [6]:
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=12, metric='euclidean', cluster_selection_method='eom', prediction_data=False)

topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=False,   # keeps memory lower
    verbose=False)
topics, probs = topic_model.fit_transform(df['clean_text'], embeddings)
topic_model.get_topic_info().head(15)

KeyError: 'clean_text'

##### Topic evolution over time

In [27]:
topics_over_time = topic_model.topics_over_time(
    df["clean_text"],
    df["year"]
)
topics_over_time 

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"inhibitor, study, compound, covalent, design",14,2015
1,0,"docking, drug, discovery, screening, molecular",32,2015
2,1,"virus, dengue, hbv, pr, protease",3,2015
3,2,"tuberculosis, inhibitor, derivative, novel, oh...",4,2015
4,3,"buche, azaphenothiazine, ache, inhibitor, synt...",2,2015
...,...,...,...,...
106,2,"compound, tuberculosis, drug, tb, study",6,2025
107,3,"fisher, target, neurodegenerative, candidate, ...",2,2025
108,4,"aak, kinase, stk, inhibitor, compound",4,2025
109,5,"caffeic, phytoconstituent, ehretia, acid, energy",1,2025


## 6Ô∏è‚É£ Zero-Shot Topic Labeling for Enhanced Interpretability
To improve interpretability of the unsupervised BERTopic results, I applied a zero-shot classification model (BART-MNLI) to automatically assign semantic labels to the representative documents of each topic.

In [30]:
classifier = pipeline("zero-shot-classification",
                      model="valhalla/distilbart-mnli-12-1")  

candidate_labels = [
    "molecular docking and virtual screening",
    "antiviral drug discovery (protease inhibitors)",
    "cancer therapeutics (kinase inhibitors, anticancer compounds)",
    "neurodegenerative disease drug discovery (Alzheimer‚Äôs)",
    "antibiotic discovery and resistance (bacterial targets)",
    "tuberculosis drug discovery",
    "antimalarial drug discovery",
    "GPCR drug discovery (ligands and receptors)",
    "metabolic disease therapeutics (diabetes enzymes)",
    "protein-protein interaction inhibitors",
    "drug target identification and validation"
]

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/890M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/890M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [31]:
topics = []
texts = []

topic_representatives = topic_model.get_representative_docs()
for topic, docs in topic_representatives.items():
    if topic == -1:
        continue
    topics.append(topic)
    texts.append(docs[0][:800])

outputs = classifier(texts, candidate_labels, batch_size=8)

zero_shot_labels = {t: out["labels"][0] for t, out in zip(topics, outputs)}
zero_shot_labels

{0: 'drug target identification and validation',
 1: 'molecular docking and virtual screening',
 2: 'tuberculosis drug discovery',
 3: 'drug target identification and validation',
 4: 'molecular docking and virtual screening',
 5: 'molecular docking and virtual screening',
 6: 'molecular docking and virtual screening',
 7: 'drug target identification and validation',
 8: 'antimalarial drug discovery',
 9: 'drug target identification and validation'}

In [33]:
info = topic_model.get_topic_info().copy()
info["zero_shot_label"] = info["Topic"].map(zero_shot_labels)
info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,zero_shot_label
0,-1,236,-1_compound_inhibitor_molecular_study,"[compound, inhibitor, molecular, study, drug, ...",[background delta play key role bcell signal t...,
1,0,366,0_drug_docking_discovery_method,"[drug, docking, discovery, method, molecular, ...",[protein threedimensional structure prediction...,drug target identification and validation
2,1,106,1_drug_virus_protease_antiviral,"[drug, virus, protease, antiviral, viral, comp...",[emergence new variant raise concern effective...,molecular docking and virtual screening
3,2,58,2_compound_drug_tuberculosis_inhibitor,"[compound, drug, tuberculosis, inhibitor, stud...",[tuberculosis tb infectious disease cause numb...,tuberculosis drug discovery
4,3,48,3_ad_disease_compound_inhibitor,"[ad, disease, compound, inhibitor, alzheimer, ...",[discovery novel multifunctional inhibitor tar...,drug target identification and validation
5,4,40,4_kinase_inhibitor_compound_cell,"[kinase, inhibitor, compound, cell, cancer, po...",[kinase key target approximately breast cancer...,molecular docking and virtual screening
6,5,30,5_inhibitor_bind_cancer_molecular,"[inhibitor, bind, cancer, molecular, study, co...",[failure chemotherapy treatment carcinoma main...,molecular docking and virtual screening
7,6,30,6_alphaglucosidase_compound_study_alphaamylase,"[alphaglucosidase, compound, study, alphaamyla...",[diabete mellitus dm multifactorial lifethreat...,molecular docking and virtual screening
8,7,24,7_gpcrs_gpcr_ligand_receptor,"[gpcrs, gpcr, ligand, receptor, agonist, allos...",[gproteincouple receptor gpcrs tractable drug ...,drug target identification and validation
9,8,23,8_antimalarial_malaria_drug_parasite,"[antimalarial, malaria, drug, parasite, new, a...",[identify novel lead compound drug discovery c...,antimalarial drug discovery


In [None]:
# saving to a csv
with open("data/zero_shot_labels.json", "w") as f:
    json.dump(zero_shot_labels, f, indent=2)

info.to_csv("data/topic_info_with_zero_shot.csv", index=False)

## 7Ô∏è‚É£ Evaluation & Visualization

In [3]:
topic_model.visualize_topics()
topic_model.visualize_barchart()
topic_model.visualize_hierarchy()

df["year"] = pd.to_numeric(df["year"], errors="coerce")

# Optional: drop rows without year
df_time = df.dropna(subset=["year"]).copy()
df_time["year"] = df_time["year"].astype(int)

# Compute topic evolution
topics_over_time = topic_model.topics_over_time(
    df_time["clean_text"],
    df_time["year"]
)

# Visualize
topic_model.visualize_topics_over_time(topics_over_time)


NameError: name 'topic_model' is not defined

In [None]:
topic_id = 0  # choose a topic
words = dict(topic_model.get_topic(topic_id))

wc = WordCloud(
    width=800,
    height=400,
    background_color="white"
).generate_from_frequencies(words)

plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title(f"WordCloud for Topic {topic_id}")
plt.show()

In [None]:
def plot_topic_wordcloud(topic_model, topic_id, max_words=30):
    topic_words = dict(topic_model.get_topic(topic_id))

    wc = WordCloud(
        width=800,
        height=400,
        background_color="white",
        max_words=max_words,
        colormap="viridis"
    ).generate_from_frequencies(topic_words)

    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Topic {topic_id}", fontsize=14)
    plt.show()

In [None]:
for t in top_topics:
    label = zero_shot_labels.get(t, f"Topic {t}")
    topic_words = dict(topic_model.get_topic(t))

    wc = WordCloud(
        width=800,
        height=400,
        background_color="white",
        max_words=30,
        colormap="plasma"
    ).generate_from_frequencies(topic_words)

    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(label, fontsize=14)
    plt.show()