Purpose of this notebook is to perform Topic Modelling on the bigger groups from `parent-hub` to `healthhub--parenting`

Please run `group_program_subpages.ipynb` first to get the relevant excel file to run this notebook.

After running the cells under `Setup`, ensure that a folder named `bertopic program subpages` is created in this `notebooks` folder in order to store the created excels later on.

Note:
- Both w/o hyperparameter tuning and with hyperparameter tuning cells need to be ran for each group as it is referencing the same topicModel variable

<hr>

## Setup

In [1]:
%load_ext kedro.ipython

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hinat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
import os
import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance  # KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

In [20]:
folder_path = 'bertopic program subpages'
if not os.path.exists(folder_path):
    # If it doesn't exist, create the folder
    os.makedirs(folder_path)
    print(f"Folder created: {folder_path}")
else:
    print(f"Folder already exists: {folder_path}")

Folder already exists: bertopic program subpages


## Prep Stop Words

In [134]:
# Define custom stop words list
my_stop_words = ['api', 'healthhub', 'sg', 'content', 'https', 'ch', 'www', 'https', 'http', 'pdf', 'melayu', 'singapore', 'download', 'english', 'public', 'im']
# Combine custom stop words with English stop words
combined_stop_words = list(ENGLISH_STOP_WORDS.union(my_stop_words))

<hr>

## Program Sub Pages

For testing

<hr>

In [4]:
df_csv = pd.read_excel(r"program-sub-pages\cleaned_programSubpages.xlsx", sheet_name="cleaned_program_subpages_url")
df_csv

Unnamed: 0,id,title,full_url,extracted_content_body,content_category
0,1434919,MindSG,https://www.healthhub.sg/programmes/MindSG/Car...,Caring for Ourselves\nSleeping Well\nSelect th...,program-sub-pages
1,1480345,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,[https://go.gov.sg/useh365] [https://go.gov.s...,program-sub-pages
2,1435221,Parent Hub: Student Immunisation And Screening,https://www.healthhub.sg/programmes/parent-hub...,CHILD IMMUNISATION AND SCREENING SERVICES\nThe...,program-sub-pages
3,1434809,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/tracke...,< Previous [#nscMastheadCarousel] > Next [#nsc...,program-sub-pages
4,1435018,Reduce Your Salt And Sugar Intake,https://www.healthhub.sg/programmes/nutrition-...,Menu [#] [#clear]\n\nMenu [#] [#clear]\n\n...,program-sub-pages
...,...,...,...,...,...
298,1435127,Types of diabetes | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BEaTMS TO BEAT DIABETES [/programmes/diabete...,program-sub-pages
299,1435167,Hypoglycaemia | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages
300,1468676,Program Sub level 1,https://www.healthhub.sg/programmes/1test/sya-...,HealthHub\nRelaxation-ExerciseHealthhub\nList ...,program-sub-pages
301,1435215,If you have coronary heart disease | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages


In [5]:
df_csv = df_csv.dropna(subset=["extracted_content_body"])
docs_psb = df_csv["extracted_content_body"].to_list()
doc_titles_psb = df_csv["title"].to_list()
len(docs_psb)

[1;36m303[0m

### Without hyperparameters tuning

In [6]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 1))

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

In [7]:
topics, probs = topic_model.fit_transform(docs_psb)

In [8]:
result = topic_model.get_topic_info()
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPagesAll.csv")

In [9]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    for title, topic in zip(doc_titles_psb, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [10]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

Best DBCV score: 0.540
Best parameters: {'min_cluster_size': 2, 'min_samples': 2, 'cluster_selection_method': 'eom', 'metric': 'manhattan'}


In [11]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

In [12]:
topics, probs = topic_model.fit_transform(docs_psb)

In [13]:
result = topic_model.get_topic_info()
result

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,30,-1_download https_pdf_api_public content,"[download https, pdf, api, public content, api...",[Resources and Activities\n\n0-2 YEARS\n(in al...
1,0,60,0_sugar_blood sugar_insulin_monitoring,"[sugar, blood sugar, insulin, monitoring, carb...",[3 BES TO BEAT DIABETES [/programmes/diabetes-...
2,1,13,1_pressure injury_pressure injuries_injuries p...,"[pressure injury, pressure injuries, injuries ...",[Menu - Pressure Injury Hub\n- Preventing Pres...
3,2,12,2_challenge_corporate challenge_national steps...,"[challenge, corporate challenge, national step...",[<Previous [#nscMastheadCarousel] >Next [#nscM...
4,3,11,3_screen time_healthy baby_vaccinations_18 months,"[screen time, healthy baby, vaccinations, 18 m...",[Primary [#]\nPrimary\n- home\n- Were expectin...
5,4,11,4_emotions_stress_power_manage emotions,"[emotions, stress, power, manage emotions, man...",[Caring for Ourselves\nCoping with Stress\nSel...
6,5,10,5_programmes parent_soon register_hub preschoo...,"[programmes parent, soon register, hub prescho...",[you may also\n\nlike [/programmes/parent-hub...
7,6,10,6_classes_classes activities_dance aerobics_ac...,"[classes, classes activities, dance aerobics, ...",[Resources and Activities\n\nSTAY POSITIVE\n(i...
8,7,9,7_postnatal depression_hiddenlearn live_pnd kk...,"[postnatal depression, hiddenlearn live, pnd k...","[By now, your maternity leave will be over soo..."
9,8,8,8_bedtime_spotify_download audio_captain sleep,"[bedtime, spotify, download audio, captain sle...",[INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\n...


In [14]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    for title, topic in zip(doc_titles_psb, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

In [15]:
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPagesAll_tuned.csv")

## 1. Parent Hub

In [103]:
df_parentHub = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="parent_hub")
df_parentHub

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1435221,Parent Hub: Student Immunisation And Screening,https://www.healthhub.sg/programmes/parent-hub...,CHILD IMMUNISATION AND SCREENING SERVICES\nThe...,program-sub-pages,parent-hub
1,1434753,Parent Hub: 0-2 Years - Healthy Diet,https://www.healthhub.sg/programmes/parent-hub...,fo \n MEAL TIMES To view all content in this s...,program-sub-pages,parent-hub
2,1435359,Parent Hub: 0-2 Years,https://www.healthhub.sg/programmes/parent-hub...,Here's How to Team Up with Your Wife for Paren...,program-sub-pages,parent-hub
3,1435231,"Parent Hub: Student Health Centre, Dental Centre",https://www.healthhub.sg/programmes/parent-hub...,STUDENT HEALTH CENTRE AND STUDENT DENTAL CENTR...,program-sub-pages,parent-hub
4,1434755,Parent Hub: 3-6 Years,https://www.healthhub.sg/programmes/parent-hub...,Handy Guide to Screen Use\nYour handy guide to...,program-sub-pages,parent-hub
...,...,...,...,...,...,...
66,1470195,test,https://www.healthhub.sg/programmes/parent-hub...,pui-yi [https://ch-api.healthhub.sg/api/public...,program-sub-pages,parent-hub
67,1434825,Parent Hub: 7-12 Years - Emotional Health,https://www.healthhub.sg/programmes/parent-hub...,Primary [#]\nPrimary\n- home\n- Were expecting...,program-sub-pages,parent-hub
68,1434781,Parent Hub: Quizzes - How Well Do You Know the...,https://www.healthhub.sg/programmes/parent-hub...,How Well Do You Know the Nutritional Informati...,program-sub-pages,parent-hub
69,1473804,sleep-test,https://www.healthhub.sg/programmes/parent-hub...,INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,program-sub-pages,parent-hub


In [104]:
df_parentHub = df_parentHub.dropna(subset=["extracted_content_body"])
docs_parentHub = df_parentHub["extracted_content_body"].to_list()
doc_titles_parentHub = df_parentHub["title"].to_list()
len(docs_parentHub)

[1;36m71[0m

### Without hyperparameter tuning

In [105]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=0.8)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

topics, probs = topic_model.fit_transform(docs_parentHub)

result = topic_model.get_topic_info()
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_parentHub.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    for title, topic in zip(doc_titles_parentHub, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameters tuning

In [109]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=0.8)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

topics, probs = topic_model.fit_transform(docs_parentHub)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.545
Best parameters: {'min_cluster_size': 7, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6,-1_learn live_student_health promotion_months ...,"[learn live, student, health promotion, months...",[Healthy Sleep Habits in Children and Adolesce...
1,0,21,0_preschool_soon register_years programmes_hub...,"[preschool, soon register, years programmes, h...",[you may also\n\nlike [/programmes/parent-hub...
2,1,18,1_pregnancy_diabetes_healthy baby_healthy preg...,"[pregnancy, diabetes, healthy baby, healthy pr...",[Here's How to Team Up with Your Wife for Pare...
3,2,11,2_live healthy_esteem_exam stress_health scree...,"[live healthy, esteem, exam stress, health scr...",[Lets Get Real About Vaping - 8 Reasons Why Yo...
4,3,8,3_healthy recipes_chef_live healthy_balance,"[healthy recipes, chef, live healthy, balance,...",[Primary [#]\nPrimary\n- home\n- Were expectin...
5,4,7,4_bedtime_spotify_captain sleep_book audio,"[bedtime, spotify, captain sleep, book audio, ...",[INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\n...


In [107]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_parentHub, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_parentHub_tuned.csv")
visualization.show()

## 2. MindSG

In [22]:
df_mindSG = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="MindSG")
df_mindSG

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1434919,MindSG,https://www.healthhub.sg/programmes/MindSG/Car...,Caring for Ourselves\nSleeping Well\nSelect th...,program-sub-pages,MindSG
1,1434871,MindSG,https://www.healthhub.sg/programmes/MindSG/Dis...,Are we giving the right support?\nLearn how we...,program-sub-pages,MindSG
2,1435236,MindSG,https://www.healthhub.sg/programmes/MindSG/Sle...,[#helplines] [#helplines]\n\nSleep Tracking F...,program-sub-pages,MindSG
3,1435243,MindSG,https://www.healthhub.sg/programmes/MindSG/See...,Seeking Support\nChoose what youd like to read...,program-sub-pages,MindSG
4,1434875,MindSG,https://www.healthhub.sg/programmes/MindSG/Abo...,What is Mental Well-being\nChoose what youd li...,program-sub-pages,MindSG
...,...,...,...,...,...,...
65,1434851,MindSG,https://www.healthhub.sg/programmes/MindSG/Sle...,"Why do we need sleep?\nRegular, adequate, and ...",program-sub-pages,MindSG
66,1501419,MindSG,https://www.healthhub.sg/programmes/MindSG/Car...,Caring for Ourselves\nUnderstanding Bipolar Di...,program-sub-pages,MindSG
67,1465384,MindSG,https://www.healthhub.sg/programmes/MindSG/Car...,Caring for Others\nOur Elderly Parents\nSelect...,program-sub-pages,MindSG
68,1434843,MindSG,https://www.healthhub.sg/programmes/MindSG/Car...,"If not redirected, click this Managing Our Emo...",program-sub-pages,MindSG


In [23]:
df_mindSG = df_mindSG.dropna(subset=["extracted_content_body"])
docs_mindSG = df_mindSG["extracted_content_body"].to_list()
name_mindSG = 'MindSG'
doc_titles_mindSG = df_mindSG["title"].to_list()
len(docs_mindSG)

[1;36m70[0m

### Without hyperparameter tuning

In [36]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_mindSG)

result = topic_model.get_topic_info()
# CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_mindSG.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_mindSG, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [37]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_mindSG)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.563
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'eom', 'metric': 'euclidean'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3,-1_teen struggling_concern behavioural_signs t...,"[teen struggling, concern behavioural, signs t...",[Caring for Ourselves\nLiving with OCD\nSelect...
1,0,8,0_healthy 365_365 app_tracking sleep_sync,"[healthy 365, 365 app, tracking sleep, sync, t...",[Caring for Ourselves\nSleeping Well\nSelect t...
2,1,7,1_tips caring_tips share_stop worrying_useful ...,"[tips caring, tips share, stop worrying, usefu...",[Caring for Others\nMy Friend\nCaring for\nMy ...
3,2,6,2_strongly agree_text institute_mental setting...,"[strongly agree, text institute, mental settin...",[Caring for Ourselves\nUnderstanding Depressio...
4,3,6,3_acc manage_basic emotions_emotions allows_re...,"[acc manage, basic emotions, emotions allows, ...",[Caring for Ourselves\nManaging Our Emotions\n...
5,4,5,4_cope transitions_transitions coping_emotions...,"[cope transitions, transitions coping, emotion...",[Caring for Ourselves\nTransition\nSelect the ...
6,5,5,5_navc_warning signs_thoughts_navc_warning sig...,"[navc_warning signs_thoughts, navc_warning sig...",[Caring for Ourselves\nPreventing\nSelf-harm\n...
7,6,4,6_symptoms stress_applied time_apply applied_d...,"[symptoms stress, applied time, apply applied,...",[Caring for Ourselves\nCoping with Stress\nSel...
8,7,4,7_determine anxiety_tests determine_anxiety al...,"[determine anxiety, tests determine, anxiety a...",[Caring for Ourselves\nDealing with\nAnxiety D...
9,8,4,8_symptom grief_loss sleep_appetite leading_lo...,"[symptom grief, loss sleep, appetite leading, ...",[Caring for Ourselves\nCoping with Grief\nSele...


In [38]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_mindSG, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_mindSG_tuned.csv")
visualization.show()

## 3. Diabetes Hub

In [111]:
df_diabetesHub = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="diabetes_hub_concatenated")
df_diabetesHub

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1435164,Self-monitoring of blood sugar | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages,diabetes-hub
1,1435281,Diabetes Hub: Guide to Managing Diabetes,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages,diabetes-hub
2,1435129,Be Aware - What is diabetes,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages,diabetes-hub
3,1435171,Hyperglycaemia | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BEaTMS TO BEAT DIABETES [/programmes/diabete...,program-sub-pages,diabetes-hub
4,1435188,Download resources | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages,diabetes-hub
...,...,...,...,...,...,...
56,1435204,Understanding carbohydrates | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages,diabetes-hub-v2
57,1435139,Monitoring blood sugar for exercise | Diabetes...,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages,diabetes-hub-v2
58,1435127,Types of diabetes | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BEaTMS TO BEAT DIABETES [/programmes/diabete...,program-sub-pages,diabetes-hub-v2
59,1435167,Hypoglycaemia | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,3 BES TO BEAT DIABETES [/programmes/diabetes-h...,program-sub-pages,diabetes-hub-v2


In [112]:
df_diabetesHub = df_diabetesHub.dropna(subset=["extracted_content_body"])
docs_diabetesHub = df_diabetesHub["extracted_content_body"].to_list()
doc_titles_diabetesHub = df_diabetesHub["title"].to_list()
len(docs_diabetesHub)

[1;36m61[0m

### Without hyperparameter tuning

In [113]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_diabetesHub)

result = topic_model.get_topic_info()
# ⭐ CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_diabetesHub.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_diabetesHub, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [114]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_diabetesHub)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.357
Best parameters: {'min_cluster_size': 6, 'min_samples': 2, 'cluster_selection_method': 'leaf', 'metric': 'euclidean'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,13,-1_vial_subsidy_income_hba1c blood,"[vial, subsidy, income, hba1c blood, soap, med...",[- \n Home [#]>\nTake Control - Foot and Dent...
1,0,16,0_exercising_mobility_stretch_mondays,"[exercising, mobility, stretch, mondays, lunch...",[3 BES TO BEAT DIABETES [/programmes/diabetes-...
2,1,10,1_distress_diabetes distress_loved ones_self care,"[distress, diabetes distress, loved ones, self...",[3 BEaTMS TO BEAT DIABETES [/programmes/diabet...
3,2,9,2_blood vessel_lead complications_ignored lead...,"[blood vessel, lead complications, ignored lea...",[3 BES TO BEAT DIABETES [/programmes/diabetes-...
4,3,7,3_pancreas insulin_insulin factory_internal fa...,"[pancreas insulin, insulin factory, internal f...",[3 BES TO BEAT DIABETES [/programmes/diabetes-...
5,4,6,4_servings_food labels_saturated fat_sugar sub...,"[servings, food labels, saturated fat, sugar s...",[- \n Home [#]>\nBe aware - Managing Your Con...


In [115]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_diabetesHub, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_diabetesHub_tuned.csv")
visualization.show()

## 4. NSC

In [117]:
df_nsc = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="nsc")
df_nsc

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1434809,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/tracke...,< Previous [#nscMastheadCarousel] > Next [#nsc...,program-sub-pages,nsc
1,1434811,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/support/,Previous [#nscMastheadCarousel] > Next [#nscMa...,program-sub-pages,nsc
2,1434803,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/corpor...,< /> Previous [#nscMastheadCarousel] > Next [#...,program-sub-pages,nsc
3,1434969,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/corpor...,<Previous [#nscMastheadCarousel] >Next [#nscMa...,program-sub-pages,nsc
4,1434823,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/commun...,The National Steps Challenge\nSeason 5 Communi...,program-sub-pages,nsc
5,1434973,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/scan-a...,<Previous [#nscMastheadCarousel] >Next [#nscMa...,program-sub-pages,nsc
6,1434891,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/monthl...,<Previous [#nscMastheadCarousel] >Next [#nscMa...,program-sub-pages,nsc
7,1434821,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/themat...,<Previous [#nscMastheadCarousel] >Next [#nscMa...,program-sub-pages,nsc
8,1434879,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/team-s...,<Previous [#nscMastheadCarousel] >Next [#nscMa...,program-sub-pages,nsc
9,1434990,National Steps Challenge™,https://www.healthhub.sg/programmes/nsc/themat...,<Previous [#nscMastheadCarousel] >Next [#nscMa...,program-sub-pages,nsc


In [118]:
df_nsc = df_nsc.dropna(subset=["extracted_content_body"])
docs_nsc = df_nsc["extracted_content_body"].to_list()
doc_titles_nsc = df_nsc["title"].to_list()
len(docs_nsc)

[1;36m13[0m

### Without hyperparameter tuning

In [119]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_nsc)

result = topic_model.get_topic_info()
# ⭐ CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_nsc.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_nsc, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [120]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_nsc)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.151
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'eom', 'metric': 'euclidean'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1,-1_organisations workforce_acra number_organis...,"[organisations workforce, acra number, organis...",[< /> Previous [#nscMastheadCarousel] > Next [...
1,0,6,0_healthy workplace_heartland malls_workplace ...,"[healthy workplace, heartland malls, workplace...",[<Previous [#nscMastheadCarousel] >Next [#nscM...
2,1,2,1_book appointment_eligibility_cara_collection...,"[book appointment, eligibility, cara, collecti...",[< Previous [#nscMastheadCarousel] > Next [#ns...
3,2,2,2_scan win_win challenge_qr codes_stepschallen...,"[scan win, win challenge, qr codes, stepschall...",[<Previous [#nscMastheadCarousel] >Next [#nscM...
4,3,2,3_participants eligible_promotions_age obtain_...,"[participants eligible, promotions, age obtain...",[<Previous [#nscMastheadCarousel] >Next [#nscM...


In [121]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_nsc, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_nsc_tuned.csv")
visualization.show()

## 5. Pressure Injury

In [129]:
df_pressureInjury = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="pressure_injury")
df_pressureInjury

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1435123,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
1,1435275,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
2,1435149,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
3,1435116,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
4,1435120,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
5,1435193,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
6,1435169,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
7,1435278,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
8,1435227,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury
9,1435208,Pressure Injury Hub,https://www.healthhub.sg/programmes/pressure-i...,Menu - Pressure Injury Hub\n- Preventing Press...,program-sub-pages,pressure-injury


In [130]:
df_pressureInjury = df_pressureInjury.dropna(subset=["extracted_content_body"])
docs_pressureInjury = df_pressureInjury["extracted_content_body"].to_list()
doc_titles_pressureInjury = df_pressureInjury["title"].to_list()
len(docs_pressureInjury)

[1;36m13[0m

### Without hyperparameter tuning

In [131]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_pressureInjury)

result = topic_model.get_topic_info()
# ⭐ CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_pressureInjury.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_pressureInjury, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [132]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_pressureInjury)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.067
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2,-1_disease_regularly ou_utrition hydration_se ...,"[disease, regularly ou, utrition hydration, se...",[Menu - Pressure Injury Hub\n- Preventing Pres...
1,0,5,0_aic_seniors mobility_caregiver_enabling fund,"[aic, seniors mobility, caregiver, enabling fu...",[Menu - Pressure Injury Hub\n- Preventing Pres...
2,1,2,1_layer_device related_shear medical_friction ...,"[layer, device related, shear medical, frictio...",[Menu - Pressure Injury Hub\n- Preventing Pres...
3,2,2,2_healthcare team_follow visits_wound healing_...,"[healthcare team, follow visits, wound healing...",[Menu - Pressure Injury Hub\n- Preventing Pres...
4,3,2,3_fever_itchy swollen_pus discharges_sore itchy,"[fever, itchy swollen, pus discharges, sore it...",[Menu - Pressure Injury Hub\n- Preventing Pres...


In [133]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_pressureInjury, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_pressureInjury_tuned.csv")
visualization.show()

## 6. Nurition Hub

In [56]:
df_nuritionHub = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="nutrition_hub")
df_nuritionHub

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1435018,Reduce Your Salt And Sugar Intake,https://www.healthhub.sg/programmes/nutrition-...,Menu [#] [#clear]\n\nMenu [#] [#clear]\n\n...,program-sub-pages,nutrition-hub
1,1472348,Nutri-Grade,https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nMenu [#clear]\n\nMenu [...,program-sub-pages,nutrition-hub
2,1435021,Make Healthy Food & Grocery Choices,https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nResources\nPick up useful t...,program-sub-pages,nutrition-hub
3,1435017,Nutritious Foods For A Healthy Diet,https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nEat More\nEat more nutritio...,program-sub-pages,nutrition-hub
4,1435014,Easy Healthy Recipes,https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nRecipes\nNeed a little culi...,program-sub-pages,nutrition-hub
5,1435240,"Eat, Drink, Shop Healthy Challenge",https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nMenu [#clear]\n\nMenu [...,program-sub-pages,nutrition-hub
6,1435223,Nutri-Grade Directory,https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nMenu [#clear]\n\nMenu [...,program-sub-pages,nutrition-hub
7,1497058,BMI and Calorie Calculator,https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nMenu [#clear]\n\nMenu [...,program-sub-pages,nutrition-hub
8,1435023,Healthy Lifestyle Programmes to Live Healthier,https://www.healthhub.sg/programmes/nutrition-...,"Menu [#clear]\n\nA Healthier, Rewarding Life...",program-sub-pages,nutrition-hub
9,1499999,BMI and Calorie Calculator,https://www.healthhub.sg/programmes/nutrition-...,Menu [#clear]\n\nMenu [#clear]\n\nMenu [...,program-sub-pages,nutrition-hub


In [57]:
df_nuritionHub = df_nuritionHub.dropna(subset=["extracted_content_body"])
docs_nuritionHub = df_nuritionHub["extracted_content_body"].to_list()
doc_titles_nuritionHub = df_nuritionHub["title"].to_list()
len(docs_nuritionHub)

[1;36m11[0m

### Without hyperparameter tuning

In [58]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_nuritionHub)

result = topic_model.get_topic_info()
# ⭐ CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_nuritionHub.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_nuritionHub, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [59]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_nuritionHub)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.157
Best parameters: {'min_cluster_size': 2, 'min_samples': 2, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3,-1_steps challengetm_national steps_challenget...,"[steps challengetm, national steps, challenget...",[Menu [#clear]\n\nMenu [#clear]\n\nMenu ...
1,0,3,0_sodium intake_grade labelling_menus_sweeteners,"[sodium intake, grade labelling, menus, sweete...",[Menu [#clear]\n\nMenu [#clear]\n\nMenu ...
2,1,3,1_qualifying_stamp_stamps healthpoints_qr ticket,"[qualifying, stamp, stamps healthpoints, qr ti...",[Menu [#clear]\n\nMenu [#clear]\n\nMenu ...
3,2,2,2_kcals_gain kcals_maintenance kcals_frozen fruit,"[kcals, gain kcals, maintenance kcals, frozen ...",[Menu [#clear]\n\nMenu [#clear]\n\nMenu ...


In [60]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_nuritionHub, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_nuritionHub_tuned.csv")
visualization.show()

## 7. Let's Move It

In [61]:
df_LetsMoveIt = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="LetsMoveIt_concatenated")
df_LetsMoveIt

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1480345,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,[https://go.gov.sg/useh365] [https://go.gov.s...,program-sub-pages,LetsMoveIt
1,1435259,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,Your quick guide to book an event\nFor a pdf v...,program-sub-pages,LetsMoveIt
2,1435239,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,Ready for a bigger challenge? Explore a variet...,program-sub-pages,LetsMoveIt
3,1435229,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,Well done on taking the first steps to an acti...,program-sub-pages,LetsMoveIt
4,1435235,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,"Now that youre prepared to get active, complet...",program-sub-pages,LetsMoveIt
5,1435233,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,"Learn how to set exercise goals, stay motivate...",program-sub-pages,LetsMoveIt
6,1467163,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,What is the Singapore Physical Activity Guidel...,program-sub-pages,LetsMoveIt
7,1498711,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/LetsMoveIt...,What is the MOVE IT partner endorsement progra...,program-sub-pages,LetsMoveIt
8,1435347,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/MoveIt/mov...,"Starting 12 July 2021, participants who do not...",program-sub-pages,MoveIt
9,1435340,Great things start when you MOVE IT!,https://www.healthhub.sg/programmes/MoveIt/mov...,"Starting 12 July 2021, participants who do not...",program-sub-pages,MoveIt


In [62]:
df_LetsMoveIt = df_LetsMoveIt.dropna(subset=["extracted_content_body"])
docs_LetsMoveIt = df_LetsMoveIt["extracted_content_body"].to_list()
doc_titles_LetsMoveIt = df_LetsMoveIt["title"].to_list()
len(docs_LetsMoveIt)

[1;36m12[0m

### Without hyperparameter tuning

In [63]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_LetsMoveIt)

result = topic_model.get_topic_info()
# ⭐ CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_LetsMoveIt.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_LetsMoveIt, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [64]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_LetsMoveIt)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.217
Best parameters: {'min_cluster_size': 3, 'min_samples': 1, 'cluster_selection_method': 'leaf', 'metric': 'manhattan'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1,-1_activity learn_benefits physical_learn trut...,"[activity learn, benefits physical, learn trut...",[Well done on taking the first steps to an act...
1,0,4,0_outline week_fitness assessment_assessment_a...,"[outline week, fitness assessment, assessment,...","[Learn how to set exercise goals, stay motivat..."
2,1,4,1_sedentary behaviour_engaging_exercise progra...,"[sedentary behaviour, engaging, exercise progr...",[In line with MOHs advisory on\n20 July 2021\n...
3,2,3,2_classes activities_dance aerobics_activities...,"[classes activities, dance aerobics, activitie...","[Starting 12 July 2021, participants who do no..."


In [65]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_LetsMoveIt, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_LetsMoveIt_tuned.csv")
visualization.show()

## 8. AAP (Live Well, Age Well)

In [87]:
df_AAP = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="AAP")
df_AAP

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1434907,"See, Hear & Eat Better",https://www.healthhub.sg/programmes/AAP/functi...,"Project Silver Screen is an affordable, nation...",program-sub-pages,AAP
1,1434895,7 Easy Exercises to an Active Lifestyle,https://www.healthhub.sg/programmes/AAP/easy-e...,Back to Healthy Ageing [/programmes/Healthy_Ag...,program-sub-pages,AAP
2,1434903,You can spot a stroke,https://www.healthhub.sg/programmes/AAP/stroke/,Back to Healthy Ageing [/programmes/Healthy_Ag...,program-sub-pages,AAP
3,1434905,Age Healthier When You Cook Right And Eat Smart,https://www.healthhub.sg/programmes/AAP/nutrit...,Back to Healthy Ageing [http://www.healthhub.s...,program-sub-pages,AAP
4,1434901,You can prevent falls,https://www.healthhub.sg/programmes/AAP/falls-...,Back to Healthy Ageing [http://www.healthhub.s...,program-sub-pages,AAP
5,1434899,You Can Get Moving,https://www.healthhub.sg/programmes/AAP/sit-do...,Back to Healthy Ageing [/programmes/Healthy_Ag...,program-sub-pages,AAP
6,1434897,7 Easy Exercises to an Active Lifestyle (Step-...,https://www.healthhub.sg/programmes/AAP/easy-e...,Back to Healthy Ageing [/programmes/Healthy_Ag...,program-sub-pages,AAP


In [88]:
df_AAP = df_AAP.dropna(subset=["extracted_content_body"])
docs_AAP = df_AAP["extracted_content_body"].to_list()
doc_titles_AAP = df_AAP["title"].to_list()
len(docs_AAP)

[1;36m7[0m

### Without hyperparameter tuning

In [89]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_AAP)

result = topic_model.get_topic_info()
# ⭐ CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_AAP.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_AAP, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [90]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_AAP)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.144
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'eom', 'metric': 'euclidean'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,4,0_leg_calcium_stretch_meals,"[leg, calcium, stretch, meals, turun mp, moder...",[Back to Healthy Ageing [http://www.healthhub....
1,1,3,1_signs_sit exercises_functional screening_ris...,"[signs, sit exercises, functional screening, r...",[Back to Healthy Ageing [/programmes/Healthy_A...


In [92]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_AAP, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_AAP_tuned.csv")
visualization.show()

## 9. HH Parenting

In [135]:
df_hhParenting = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="healthhub__parenting")
df_hhParenting

Unnamed: 0,id,title,full_url,extracted_content_body,content_category,secondLvl
0,1434831,Be Gentle With Yourself,https://www.healthhub.sg/programmes/healthhub-...,Excited to hold your baby in your arms but fee...,program-sub-pages,healthhub--parenting
1,1434921,Happy 1-Year Old!,https://www.healthhub.sg/programmes/healthhub-...,Your baby is 1 year old! YouaTMve made it!\nFo...,program-sub-pages,healthhub--parenting
2,1434889,Is Your Baby Throwing Things Around?,https://www.healthhub.sg/programmes/healthhub-...,Cannot determine when your baby is cute or nau...,program-sub-pages,healthhub--parenting
3,1434873,Separation Anxiety,https://www.healthhub.sg/programmes/healthhub-...,Preparing to go back to work? Help baby cope w...,program-sub-pages,healthhub--parenting
4,1434911,"Time For Baby, Time For Mummy And Daddy",https://www.healthhub.sg/programmes/healthhub-...,"Is your baby anxious around strangers, clingin...",program-sub-pages,healthhub--parenting
5,1434847,Bonding With Baby,https://www.healthhub.sg/programmes/healthhub-...,Amazed at how baby is able to recognise you? H...,program-sub-pages,healthhub--parenting
6,1434863,Take Time To Care For Yourself,https://www.healthhub.sg/programmes/healthhub-...,"By now, your maternity leave will be over soon...",program-sub-pages,healthhub--parenting


In [136]:
df_hhParenting = df_hhParenting.dropna(subset=["extracted_content_body"])
docs_hhParenting = df_hhParenting["extracted_content_body"].to_list()
doc_titles_hhParenting = df_hhParenting["title"].to_list()
len(docs_hhParenting)

[1;36m7[0m

### Without hyperparameter tuning

In [137]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_hhParenting)

result = topic_model.get_topic_info()
# ⭐ CHANGE THE NAME OF THE FILE
result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_hhParenting.csv")

top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_hhParenting, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
visualization.show()

### With hyperparameter tuning

In [138]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_hhParenting)
result = topic_model.get_topic_info()
result

Best DBCV score: 0.071
Best parameters: {'min_cluster_size': 2, 'min_samples': 1, 'cluster_selection_method': 'eom', 'metric': 'euclidean'}


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,5,0_family doctor_postnatal_postnatal depression...,"[family doctor, postnatal, postnatal depressio...",[Preparing to go back to work? Help baby cope ...
1,1,2,1_dealing baby_breastfeeding_baby blues_help h...,"[dealing baby, breastfeeding, baby blues, help...",[Excited to hold your baby in your arms but fe...


In [139]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_hhParenting, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_hhParenting_tuned.csv")
visualization.show()

<hr>

## Do not run from here onwards

The following groups are relatively smaller, hence there are errors encountered when performing Topic Modelling.

Instead, Keyword Modelling will be performed in another notebook named 'keyBert_programme_subpages.ipynb'

## 10. IQuit [Errors Encountered]

In [None]:
df_vaping= pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="iquit_concatenated")
df_vaping

In [77]:
df_vaping = df_vaping.dropna(subset=["extracted_content_body"])
docs_vaping = df_vaping["extracted_content_body"].to_list()
doc_titles_vaping = df_vaping["title"].to_list()
len(docs_vaping)

[1;36m6[0m

### Without hyperparameter tuning

In [78]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=5,
    min_cluster_size=15,
    metric="euclidean",  # manhattan
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 1), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
try:
    topics, probs = topic_model.fit_transform(docs_vaping)
    result = topic_model.get_topic_info()
    # ⭐ CHANGE THE NAME OF THE FILE
    result.to_csv(r"bertopic program subpages\original\bertopic_programeSubPages_iquit.csv")

    top_topics = topic_model.get_topic_freq()["Topic"].tolist()
    embeddings = topic_model.umap_model.embedding_
    hover_data = [
        f"{title} - Topic {topic}"
        # ⭐ CHANGE THE NAME OF THE PARAM
        for title, topic in zip(doc_titles_vaping, topics)
    ]
    visualization = topic_model.visualize_documents(
        hover_data,
        reduced_embeddings=embeddings,
        topics=top_topics,
        title="Topics",
    )
    visualization.show()
except TypeError as e:
    print(f"TypeError occurred: {e}")
    # Optionally, you can add fallback logic here
except Exception as e:
    print(f"An error occurred: {e}")

TypeError occurred: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.


### With hyperparameter tuning

In [79]:
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

best_score = 0

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_samples=best_parameters["min_samples"],
    min_cluster_size=best_parameters["min_cluster_size"],
    metric=best_parameters["metric"], 
    cluster_selection_method=best_parameters["cluster_selection_method"],
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words=combined_stop_words, ngram_range=(1, 2), max_df=1)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
# representation_model = KeyBERTInspired()
representation_model = MaximalMarginalRelevance(diversity=0.3)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    # nr_topics=15,  # default is none, will auto reduce topics using HDBSCAN
)

# ⭐ CHANGE THE NAME OF THE PARAM
topics, probs = topic_model.fit_transform(docs_vaping)
result = topic_model.get_topic_info()
result

In [150]:
top_topics = topic_model.get_topic_freq()["Topic"].tolist()
embeddings = topic_model.umap_model.embedding_
hover_data = [
    f"{title} - Topic {topic}"
    # ⭐ CHANGE THE NAME OF THE PARAM
    for title, topic in zip(doc_titles_vaping, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=embeddings,
    topics=top_topics,
    title="Topics",
)
# ⭐ CHANGE THE NAME OF THE FILE
result.iloc[:,:-1].to_csv(r"bertopic program subpages\hyperparameter tuning\bertopic_programeSubPages_iquit_tuned.csv")
visualization.show()