# üß¨ Discovering Emerging Topics in Drug Discovery Research using BERTopic

## 1Ô∏è‚É£Import dependencies

In [34]:
import os
import re, spacy
import pandas as pd
from Bio import Entrez, Medline
from bertopic import BERTopic
from transformers import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer


## 2Ô∏è‚É£ Data Collection
I used PubMed  to fetch abstracts on topics like 'drug discovery', 'AI drug design', and 'molecular docking'.
Store as `data/raw_publications.csv`. 

In [None]:
Entrez.email = "paulafredrick26@gmail.com"  # already set
# 1. Search PubMed
handle = Entrez.esearch(
    db="pubmed",
    term=query,
    retmax=max_results,
    sort="relevance"
)
search_results = Entrez.read(handle)
handle.close()

pmids = search_results["IdList"]
print("Number of PMIDs found:", len(pmids))

# 2. Fetch details
handle = Entrez.efetch(
    db="pubmed",
    id=",".join(pmids),
    rettype="medline",
    retmode="text"
)
records = list(Medline.parse(handle))
handle.close()

# 3. Build DataFrame
rows = []
for rec in records:
    abstract = rec.get("AB", "")
    if not abstract:
        continue
    rows.append({
        "pmid": rec.get("PMID", ""),
        "title": rec.get("TI", ""),
        "abstract": abstract,
        "journal": rec.get("JT", ""),
        "year": rec.get("DP", "")[:4]
    })

df = pd.DataFrame(rows)
print(df.shape)
df.head()
    
# 4. Save CSV
os.makedirs("data", exist_ok=True)
df.to_csv("data/raw_publications.csv", index=False)
print("Saved to data/raw_publications.csv")

Number of PMIDs found: 1000
(976, 5)
Saved to data/raw_publications.csv


### Start from Here!!

## 3Ô∏è‚É£ Data Cleaning & Preprocessing

In [32]:
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    text = str(text)
    # collapse whitespace
    text = re.sub(r"\s+", " ", text)
    # remove punctuation, make lowercase
    text = re.sub(r"[^\w\s]", "", text.lower())
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha
    ]
    return " ".join(tokens)

df = pd.read_csv("data/raw_publications.csv")
df["clean_text"] = df["abstract"].apply(clean_text)
df.head()

Unnamed: 0,pmid,title,abstract,journal,year,clean_text
0,31487867,Molecular Docking: Shifting Paradigms in Drug ...,Molecular docking is an established in silico ...,International journal of molecular sciences,2019,molecular docking establish silico structureba...
1,26205061,Molecular docking and structure-based drug des...,Pharmaceutical research has successfully incor...,"Molecules (Basel, Switzerland)",2015,pharmaceutical research successfully incorpora...
2,38594926,The Art and Science of Molecular Docking.,Molecular docking has become an essential part...,Annual review of biochemistry,2024,molecular docking essential structural biologi...
3,34560276,Machine-learning methods for ligand-protein mo...,Artificial intelligence (AI) is often presente...,Drug discovery today,2022,artificial intelligence ai present new industr...
4,34147204,Use of molecular docking computational tools i...,Molecular docking has become an important comp...,Progress in medicinal chemistry,2021,molecular docking important component drug dis...


## 4Ô∏è‚É£ Embedding Generation

In [35]:

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(
    df['clean_text'], 
    show_progress_bar=True, 
    convert_to_numpy=True
)

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

## 5Ô∏è‚É£ Hyperparameter Tuning

In [36]:
# Experiment
umap_params = [
    {"n_neighbors": 5,  "n_components": 5},
    {"n_neighbors": 10, "n_components": 5},
    {"n_neighbors": 15, "n_components": 5},
    {"n_neighbors": 5,  "n_components": 10},
    {"n_neighbors": 10, "n_components": 10},
    {"n_neighbors": 15, "n_components": 10},
]

hdbscan_params = [
    {"min_cluster_size": 8},
    {"min_cluster_size": 12},
]

In [37]:
results = []

for u in umap_params:
    for h in hdbscan_params:

        umap_model = UMAP(
            n_neighbors=u["n_neighbors"],
            n_components=u["n_components"],
            min_dist=0.0,
            metric="cosine",
            random_state=42
        )

        hdbscan_model = HDBSCAN(
            min_cluster_size=h["min_cluster_size"],
            metric="euclidean",
            cluster_selection_method="eom"
        )

        topic_model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            verbose=False
        )

        topics, _ = topic_model.fit_transform(df["clean_text"], embeddings)

        info = topic_model.get_topic_info()

        n_topics = len(info[info.Topic != -1])
        noise_docs = info[info.Topic == -1]["Count"].values[0] if -1 in info.Topic.values else 0

        results.append({
            "n_neighbors": u["n_neighbors"],
            "n_components": u["n_components"],
            "min_cluster_size": h["min_cluster_size"],
            "n_topics": n_topics,
            "noise_docs": noise_docs
        })


In [38]:
results_df = pd.DataFrame(results)
results_df.sort_values("n_topics", ascending=False)

Unnamed: 0,n_neighbors,n_components,min_cluster_size,n_topics,noise_docs
8,10,10,8,23,287
2,10,5,8,22,345
4,15,5,8,21,362
0,5,5,8,21,222
6,5,10,8,19,238
10,15,10,8,17,279
11,15,10,12,10,249
3,10,5,12,10,236
5,15,5,12,9,263
7,5,10,12,6,64


## 6Ô∏è‚É£ Topic Modelling with BERTopic

In [39]:
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=12, metric='euclidean', cluster_selection_method='eom')

topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)
topics, probs = topic_model.fit_transform(df['clean_text'], embeddings)
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,219,-1_compound_study_inhibitor_drug,"[compound, study, inhibitor, drug, molecular, ...",[inhibition janus kinase member jak family tyr...
1,0,379,0_drug_docking_discovery_method,"[drug, docking, discovery, method, molecular, ...",[conventional drug discovery approach expensiv...
2,1,105,1_virus_drug_protease_antiviral,"[virus, drug, protease, antiviral, viral, comp...",[emergence new variant raise concern effective...
3,2,53,2_kinase_inhibitor_compound_cell,"[kinase, inhibitor, compound, cell, cancer, mo...",[kinase key target approximately breast cancer...
4,3,44,3_inhibitor_cancer_bind_compound,"[inhibitor, cancer, bind, compound, molecular,...",[background delta play key role bcell signal t...
5,4,43,4_ad_disease_compound_inhibitor,"[ad, disease, compound, inhibitor, alzheimer, ...",[discovery novel multifunctional inhibitor tar...
6,5,30,5_alphaglucosidase_compound_alphaamylase_study,"[alphaglucosidase, compound, alphaamylase, stu...",[diabete mellitus dm multifactorial lifethreat...
7,6,24,6_gpcrs_ligand_receptor_gpcr,"[gpcrs, ligand, receptor, gpcr, agonist, allos...",[gproteincouple receptor gpcrs tractable drug ...
8,7,23,7_bacterial_antibiotic_inhibitor_aureus,"[bacterial, antibiotic, inhibitor, aureus, gyr...",[betalactamase ampc general cause onset antibi...
9,8,23,8_antimalarial_malaria_drug_parasite,"[antimalarial, malaria, drug, parasite, new, a...",[identify novel lead compound drug discovery c...


##### Topic evolution over time

In [40]:
topics_over_time = topic_model.topics_over_time(
    df["clean_text"],
    df["year"]
)
topics_over_time 

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"inhibitor, study, covalent, compound, design",13,2015
1,0,"docking, drug, discovery, screening, molecular",33,2015
2,1,"virus, dengue, hbv, pr, protease",3,2015
3,2,"biphenyl, amide, inhibitor, indicate, value",1,2015
4,3,"hadometdc, inhibitor, polyamine, nec, model",3,2015
...,...,...,...,...
114,3,"kcal, caffeic, phytoconstituent, ehretia, cancer",2,2025
115,4,"fisher, target, identification, aid, bioassays",1,2025
116,5,"kaempferol, alphaamylase, alphaglucosidase, an...",2,2025
117,7,"ftsz, pneumoniae, inhibitor, leprosy, murg",3,2025


## 7Ô∏è‚É£ Zero-Shot Topic Labeling for Enhanced Interpretability
To improve interpretability of the unsupervised BERTopic results, I applied a zero-shot classification model (BART-MNLI) to automatically assign semantic labels to the representative documents of each topic.

In [41]:
classifier = pipeline("zero-shot-classification", 
                      model="facebook/bart-large-mnli")

candidate_labels = [
    "AI drug design",
    "molecular docking",
    "protein folding",
    "cancer therapeutics",
    "genomics",
    "drug target discovery",
    "machine learning methods"
]

Device set to use cpu


In [22]:
#Label one topic at a time
topic_representatives = topic_model.get_representative_docs()

zero_shot_labels = {}

for topic, docs in topic_representatives.items():
    text = docs[0]  # representative document
    result = classifier(text, candidate_labels)
    zero_shot_labels[topic] = result["labels"][0]  # best label
zero_shot_labels

{-1: 'molecular docking', 0: 'molecular docking', 1: 'molecular docking'}

## 8 Evaluation & Visualization

In [42]:

topic_model.visualize_topics()
topic_model.visualize_barchart()
topic_model.visualize_hierarchy()
topic_model.visualize_topics_over_time(df['clean_text'], df['publication_date'])


KeyError: 'publication_date'

## 8Ô∏è‚É£ Results & Insights
- List and interpret top topics.
- Identify emerging or declining research themes.


## 9Ô∏è‚É£ Conclusion
- Summarize key findings.
- Highlight business and scientific relevance.
- Suggest future work (e.g., comparing journals, regions, or institutions).
