# üß¨ Discovering Emerging Topics in Drug Discovery Research using BERTopic

## 1. Import dependencies

In [2]:
import os
import re, spacy
import pandas as pd
from Bio import Entrez, Medline
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer


## 2Ô∏è‚É£ Data Collection
I Use PubMed  to fetch abstracts on topics like 'drug discovery', 'AI drug design', and 'molecular docking'.
Store as `data/raw_publications.csv`. 

In [6]:
Entrez.email = "paulafredrick26@gmail.com"  # already set

# 1. Search PubMed
handle = Entrez.esearch(
    db="pubmed",
    term=query,
    retmax=max_results,
    sort="relevance"
)
search_results = Entrez.read(handle)
handle.close()

pmids = search_results["IdList"]
print("Number of PMIDs found:", len(pmids))

# 2. Fetch details
handle = Entrez.efetch(
    db="pubmed",
    id=",".join(pmids),
    rettype="medline",
    retmode="text"
)
records = list(Medline.parse(handle))
handle.close()

# 3. Build DataFrame
rows = []
for rec in records:
    abstract = rec.get("AB", "")
    if not abstract:
        continue
    rows.append({
        "pmid": rec.get("PMID", ""),
        "title": rec.get("TI", ""),
        "abstract": abstract,
        "journal": rec.get("JT", ""),
        "year": rec.get("DP", "")[:4]
    })

df = pd.DataFrame(rows)
print(df.shape)
df.head()
    
# 4. Save CSV
os.makedirs("data", exist_ok=True)
df.to_csv("data/raw_publications.csv", index=False)
print("Saved to data/raw_publications.csv")

Number of PMIDs found: 1000
(976, 5)
Saved to data/raw_publications.csv


### Start from Here!!

## 3Ô∏è‚É£ Data Cleaning & Preprocessing

In [3]:
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    text = str(text)
    # collapse whitespace
    text = re.sub(r"\s+", " ", text)
    # remove punctuation, make lowercase
    text = re.sub(r"[^\w\s]", "", text.lower())
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha
    ]
    return " ".join(tokens)

df = pd.read_csv("data/raw_publications.csv")
df["clean_text"] = df["abstract"].apply(clean_text)
df.head()


Unnamed: 0,pmid,title,abstract,journal,year,clean_text
0,31487867,Molecular Docking: Shifting Paradigms in Drug ...,Molecular docking is an established in silico ...,International journal of molecular sciences,2019,molecular docking establish silico structureba...
1,26205061,Molecular docking and structure-based drug des...,Pharmaceutical research has successfully incor...,"Molecules (Basel, Switzerland)",2015,pharmaceutical research successfully incorpora...
2,38594926,The Art and Science of Molecular Docking.,Molecular docking has become an essential part...,Annual review of biochemistry,2024,molecular docking essential structural biologi...
3,34560276,Machine-learning methods for ligand-protein mo...,Artificial intelligence (AI) is often presente...,Drug discovery today,2022,artificial intelligence ai present new industr...
4,34147204,Use of molecular docking computational tools i...,Molecular docking has become an important comp...,Progress in medicinal chemistry,2021,molecular docking important component drug dis...


## 4Ô∏è‚É£ Embedding Generation

In [4]:

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(
    df['clean_text'], 
    show_progress_bar=True, 
    convert_to_numpy=True
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

## 5Ô∏è‚É£ Topic Modelling with BERTopic

In [7]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=25, metric='euclidean', cluster_selection_method='eom')

topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)
topics, probs = topic_model.fit_transform(df['clean_text'], embeddings)
topic_model.get_topic_info().head(20)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,61,-1_covalent_drug_target_compound,"[covalent, drug, target, compound, new, study,...",[mechanism action covalent drug involve format...
1,0,767,0_drug_molecular_compound_discovery,"[drug, molecular, compound, discovery, docking...",[conventional drug discovery approach expensiv...
2,1,107,1_drug_virus_protease_compound,"[drug, virus, protease, compound, inhibitor, a...",[emergence new variant raise concern effective...
3,2,41,2_ad_disease_compound_inhibitor,"[ad, disease, compound, inhibitor, drug, molec...",[alzheimer disease ad mark tau tangle amyloidb...


In [9]:
topic_model.get_topic_info().head(20)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,61,-1_covalent_drug_target_compound,"[covalent, drug, target, compound, new, study,...",[mechanism action covalent drug involve format...
1,0,767,0_drug_molecular_compound_discovery,"[drug, molecular, compound, discovery, docking...",[conventional drug discovery approach expensiv...
2,1,107,1_drug_virus_protease_compound,"[drug, virus, protease, compound, inhibitor, a...",[emergence new variant raise concern effective...
3,2,41,2_ad_disease_compound_inhibitor,"[ad, disease, compound, inhibitor, drug, molec...",[alzheimer disease ad mark tau tangle amyloidb...


## 6Ô∏è‚É£ Zero-Shot Topic Labeling for Enhanced Interpretability
To improve interpretability of the unsupervised BERTopic results, I applied a zero-shot classification model (BART-MNLI) to automatically assign semantic labels to the representative documents of each topic.

## 7Ô∏è‚É£ Hyperparameter Tuning

In [None]:

for n in [5, 15, 30]:
    for min_cluster in [10, 25, 50]:
        umap_model = UMAP(n_neighbors=n, n_components=5, min_dist=0.0, metric='cosine')
        hdbscan_model = HDBSCAN(min_cluster_size=min_cluster)
        topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)
        topics, probs = topic_model.fit_transform(df['clean_text'])
        print(f"Params: n={n}, min_cluster={min_cluster}")
        print(topic_model.get_topic_info().head())


## 8 Evaluation & Visualization

In [None]:

topic_model.visualize_topics()
topic_model.visualize_barchart()
topic_model.visualize_hierarchy()
topic_model.visualize_topics_over_time(df['clean_text'], df['publication_date'])


## 8Ô∏è‚É£ Results & Insights
- List and interpret top topics.
- Identify emerging or declining research themes.


##### Zero-Shot Learning

In [None]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", 
                      model="facebook/bart-large-mnli")

candidate_labels = [
    "AI drug design",
    "molecular docking",
    "protein folding",
    "cancer therapeutics",
    "genomics",
    "drug target discovery",
    "machine learning methods"
]

Label one topic at a time

In [None]:
topic_representatives = topic_model.get_representative_docs()

zero_shot_labels = {}

for topic, docs in topic_representatives.items():
    text = docs[0]  # representative document
    result = classifier(text, candidate_labels)
    zero_shot_labels[topic] = result["labels"][0]  # best label
zero_shot_labels


## 9Ô∏è‚É£ Conclusion
- Summarize key findings.
- Highlight business and scientific relevance.
- Suggest future work (e.g., comparing journals, regions, or institutions).
