In [None]:

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from nltk.tokenize import wordpunct_tokenize
import polars as pl
import gc
from tqdm import tqdm
import torch
import numpy as np


# Column names requested from the input parquet file: a fixed set of fields
# followed by one "cited_by_count_<year>" column for each year 2012-2024.
base_columns = [
    "title",
    "year",
    "primary_topic",
    "abstract",
    "cited_by_count",
    "mncs",
    "countries_distinct_count",
    "institutions_distinct_count",
    "referenced_works_count",
    "authors_count",
    "review",
    "meta_analysis",
    "mean_past_contributions_authors",
    "mean_past_mncs_authors",
    "mean_past_contributions_institutions",
    "mean_past_mncs_institutions",
]
yearly_citation_columns = [f"cited_by_count_{year}" for year in range(2012, 2025)]
cols = base_columns + yearly_citation_columns

# Ran with Google Colab: mount Drive so the input parquet file is reachable.
from google.colab import drive
drive.mount('/content/drive')

data = (
    pl.read_parquet("/content/drive/MyDrive/works_pre_topics.parquet", columns=cols)
    # Text used for topic modelling: "<title>. <abstract>".
    .with_columns(title_abstract=pl.col("title") + ". " + pl.col("abstract"))
    # Keep only rows whose combined text is present and non-empty.
    .filter(pl.col("title_abstract").is_not_null())
    .filter(pl.col("title_abstract") != "")
)

# Extra terms to exclude on top of scikit-learn's built-in English stop words.
custom_stop_words = [
    "current",
    "board",
    "editorial",
    "thank",
    "contents",
    "table",
    "availability",
    "matter",
    "talking",
    "question",
    "older",
    "erratum",
    "eradication",
    "approval",
    "global",
    "correction",
    "written",
    "issue",
    "information",
    "publication",
    "publishing",
    "implementing",
    "noticeboard",
    "problems",
]

# Single lookup set used both for text cleaning and by the vectorizer.
all_stop_words = set(ENGLISH_STOP_WORDS) | set(custom_stop_words)

def clean_text(text):
    """Return *text* lowercased, tokenized, and stripped of stop words.

    The text is split with NLTK's ``wordpunct_tokenize``; every token that
    appears in ``all_stop_words`` is dropped and the remaining tokens are
    joined back together with single spaces.
    """
    lowered_tokens = wordpunct_tokenize(text.lower())
    kept_tokens = [tok for tok in lowered_tokens if tok not in all_stop_words]
    return ' '.join(kept_tokens)

# Clean every document once up front; tqdm reports progress over the column.
cleaned_title_abstract = list(
    map(clean_text, tqdm(data["title_abstract"], desc="Cleaning"))
)
gc.collect()


# Encode the cleaned documents into sentence embeddings on the GPU.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

batch_size = 128
n_docs = len(cleaned_title_abstract)
if n_docs == 0:
    # Fail early with a clear message instead of an opaque stacking error.
    raise ValueError("No documents to encode: cleaned_title_abstract is empty.")

# Ceiling division: exactly as many batches as needed, so no trailing empty
# batch is encoded when n_docs is an exact multiple of batch_size.
n_batches = (n_docs + batch_size - 1) // batch_size

embeddings_list = []

for i in tqdm(range(n_batches), desc="Text Encoding...", ncols=100):
    start = i * batch_size
    end = start + batch_size
    batch_texts = cleaned_title_abstract[start:end]

    batch_embeddings = embedding_model.encode(
        batch_texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        # The outer tqdm loop already reports progress; a per-call inner bar
        # would only add noise.
        show_progress_bar=False,
    )

    embeddings_list.append(batch_embeddings)

# One (n_docs, embedding_dim) array in the same order as the documents.
embeddings = np.vstack(embeddings_list)

# Free Python garbage and cached GPU memory before the clustering stage.
gc.collect()
torch.cuda.empty_cache()

print(embeddings)

from hdbscan import HDBSCAN

# Dimensionality reduction: project the embeddings down to 5 components.
umap_params = dict(
    n_neighbors=15,
    n_components=5,
    metric="cosine",
    random_state=42,
    low_memory=True,
)
umap_model = UMAP(**umap_params)

# Bag-of-words model used for the topic representations; it shares the
# stop-word set used during text cleaning.
vectorizer_params = dict(
    stop_words=list(all_stop_words),
    lowercase=True,
    token_pattern=r"(?u)\b\w\w+\b",
)
vectorizer_model = CountVectorizer(**vectorizer_params)

# Density-based clustering of the reduced embeddings.
hdbscan_params = dict(
    min_cluster_size=30,
    min_samples=5,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)
hdbscan_model = HDBSCAN(**hdbscan_params)

# No embedding model is attached here: precomputed embeddings are passed
# explicitly when the model is fitted and applied.
topic_model = BERTopic(
    embedding_model=None,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="english",
    vectorizer_model=vectorizer_model,
    verbose=True,
)

# Make sure the embeddings are a NumPy array (a torch tensor is moved to the
# CPU and converted first).
embeddings_np = (
    embeddings.cpu().numpy() if isinstance(embeddings, torch.Tensor) else embeddings
)

for _ in tqdm(range(3), desc="Clusturing Init...", ncols=100):
    gc.collect()

# Fit on a leading slice of at most 100,000 documents.
# NOTE(review): this takes the first rows rather than a random sample —
# confirm the input order does not bias the discovered topics.
sample_size = min(100000, len(cleaned_title_abstract))
head = slice(0, sample_size)
sample_texts = cleaned_title_abstract[head]
sample_embeddings = embeddings_np[head]

topic_model.fit(sample_texts, embeddings=sample_embeddings)

# Assign a topic to every document with the fitted model, walking through the
# corpus in fixed-size slices.
batch_size = 1000
n_samples = len(cleaned_title_abstract)

all_topics = []
all_probs = []

for start_idx in tqdm(range(0, n_samples, batch_size), desc="Processing batches"):
    window = slice(start_idx, min(start_idx + batch_size, n_samples))

    topics, probs = topic_model.transform(
        cleaned_title_abstract[window],
        embeddings=embeddings_np[window],
    )

    all_topics.extend(topics)
    all_probs.append(probs)

    # Drop the per-batch results before the next slice is processed.
    del topics, probs
    gc.collect()


import pandas as pd

# Persist the fitted model to Drive.
topic_model.save("/content/drive/MyDrive/bertopic_model_pharmacology2")

# Attach the per-document topic ids as a "topic" column and write the
# enriched table next to the input file.
all_topics_series = pl.Series("topic", all_topics, dtype=pl.Int64)
data = data.with_columns(all_topics_series)
data.write_parquet("/content/drive/MyDrive/works_post_topics2.parquet")

# Reload the saved model (re-attaching the sentence encoder) and dump its
# topic -> words mapping to CSV, one column per topic.
topic_modelling = BERTopic.load(
    "/content/drive/MyDrive/bertopic_model_pharmacology2",
    embedding_model=embedding_model,
)
test = pd.DataFrame(topic_modelling.get_topics())
test.to_csv("topic_words2.csv")



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd


# Topic frequency table from the reloaded model, without topics -1 and 0.
# NOTE(review): -1 is BERTopic's outlier label; topic 0 is dropped as well —
# confirm that excluding it is intentional.
excluded_topics = [-1, 0]
topic_freq_df = topic_modelling.get_topic_freq()
topic_freq_df = topic_freq_df.loc[~topic_freq_df['Topic'].isin(excluded_topics)]


# Topic ids taken from the first `top_n` rows of the filtered table.
top_n = 20
top_topics = topic_freq_df['Topic'].head(top_n).tolist()


# Load the topic words previously exported to CSV (one column per topic id,
# one row per word rank).
# TODO: not yet verified that the round-trip through CSV works properly.
topics_csv = pd.read_csv("topic_words2.csv", index_col=0)

def get_topic_words(x):
    """Return a short ``"word1_word2"`` label for topic *x*.

    The label is built from rows 0 and 1 of column ``str(x)`` in
    ``topics_csv``. Each cell is expected to hold the text form of a
    ``(word, score)`` tuple, e.g. ``"('drug', 0.12)"``.

    Args:
        x: Topic id; it is converted with ``str`` to find the CSV column.

    Returns:
        ``"<word1>_<word2>"``, or ``"Unknown"`` when the topic column or one
        of the two rows is missing, or a cell is not a string (e.g. NaN).
    """
    def first_field(cell):
        # Keep the text before the first comma and strip the tuple opener
        # and the quotes that surround the word.
        return cell.split(",")[0].replace("('", "").replace("'", "").strip()

    column = str(x)
    try:
        word1 = first_field(topics_csv.loc[0, column])
        word2 = first_field(topics_csv.loc[1, column])
    except (KeyError, IndexError, AttributeError, TypeError):
        # Only lookup/type failures mean "no label available"; anything else
        # (interrupts, undefined names, ...) is a real error and propagates.
        return "Unknown"
    return f"{word1}_{word2}"

# Short display label for each selected topic.
topic_names = {tid: get_topic_words(tid) for tid in top_topics}

# Pick the embedding row of each selected topic.
# BERTopic keeps one row per topic in ascending topic-id order (outlier topic
# -1 first when present), so the row is the topic's rank among the sorted ids.
# The pandas index of get_topic_freq() is NOT that rank and must not be used
# to index the embedding matrix. (Verify against the installed BERTopic
# version if it is upgraded.)
embeddings = topic_modelling.topic_embeddings_
row_of_topic = {tid: row for row, tid in enumerate(sorted(topic_modelling.get_topics()))}
indices = [row_of_topic[tid] for tid in top_topics]
emb_top = embeddings[indices]

# Pairwise cosine similarity between the selected topic embeddings.
sim = cosine_similarity(emb_top)

plt.figure(figsize=(12,10))
im = plt.imshow(sim, cmap='viridis')
plt.colorbar(im, label='Cosine Similarity')

names = [topic_names[t] for t in top_topics]
# Tick positions follow the number of topics actually available, which can be
# smaller than top_n.
tick_positions = np.arange(len(names))
plt.xticks(tick_positions, names, rotation=90, fontsize=10)
plt.yticks(tick_positions, names, fontsize=10)
plt.title("Topic Similarity Heatmap (Top 20 Topics)")
plt.tight_layout()
plt.savefig("/content/drive/MyDrive/pngs/topic_similarity_heatmap_top20.png")
plt.show()


# Already imported at the top of the notebook; repeated here so this plotting
# step can be re-run on its own.
from umap import UMAP

# Embedding rows of the selected topics. BERTopic keeps one row per topic in
# ascending topic-id order, so a topic's row is its rank among the sorted ids.
topic_row = {tid: row for row, tid in enumerate(sorted(topic_modelling.get_topics()))}
top_topic_embeddings = topic_modelling.topic_embeddings_[[topic_row[tid] for tid in top_topics]]

# Project the topic embeddings to 2-D with UMAP so the axes really are UMAP
# components (plotting raw embedding dimensions 0 and 1 is not a projection).
# n_neighbors is capped because only a handful of points are being embedded.
umap_embeddings = UMAP(
    n_neighbors=max(2, min(15, len(top_topics) - 1)),
    n_components=2,
    metric="cosine",
    random_state=42,
).fit_transform(top_topic_embeddings)
freqs = topic_freq_df.set_index('Topic')['Count'].to_dict()

x = umap_embeddings[:,0]
y = umap_embeddings[:,1]
# Marker area grows with the square root of the topic's document count.
sizes = [freqs.get(t, 10) for t in top_topics]

plt.figure(figsize=(12,10))
plt.scatter(x, y, s=np.sqrt(sizes)*10, alpha=0.7, c='steelblue', edgecolor='k')
for i, tid in enumerate(top_topics):
    plt.text(x[i], y[i], topic_names[tid], fontsize=10, ha='center', va='center')

plt.title("Topic Map (Top 20 Topics)")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.tight_layout()
plt.savefig("/content/drive/MyDrive/pngs/topic_map_top20.png")
plt.show()


# Bar chart of the ten highest-ranked words of the first selected topic.
topic_id = top_topics[0]
topic_name = get_topic_words(topic_id)

# get_topic yields (word, score) pairs; keep the first ten.
words_scores = topic_modelling.get_topic(topic_id)
top_words = words_scores[:10]
labels = [word for word, _ in top_words]
scores = [score for _, score in top_words]

plt.figure(figsize=(8,5))
plt.barh(labels, scores, color='steelblue')
# Put the first word at the top of the chart.
plt.gca().invert_yaxis()
plt.title(f"Top 10 Words for Topic: {topic_name}")
plt.xlabel("c-TF-IDF Score")
plt.tight_layout()
plt.savefig(f"/content/drive/MyDrive/pngs/top10_words_{topic_name}.png")
plt.show()

