In [None]:
# Load Kedro's IPython extension into this kernel.
# NOTE(review): presumably this is what provides the `catalog` object used by
# the data-loading cell (it is never defined in this notebook) — confirm.
%load_ext kedro.ipython

In [43]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance  # KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

In [7]:
# Read the merged dataset from the catalog and keep only the rows of a single
# content category, renumbering the index from zero.
content_category = "live-healthy-articles"
df = catalog.load("merged_data")
df = df[df["content_category"] == content_category].reset_index(drop=True)
df.head(2)

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,extracted_content_body,l1_mappings,l2_mappings,page_views,engagement_rate,bounce_rate,exit_rate,scroll_percentage,percentage_total_views,cumulative_percentage_total_views
0,1444475,"Weight, BMI and Health Problems","Weight, BMI and Health Problems","Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/weight_p...,www.healthhub.sg/live-healthy/weight_putting_m...,weight_putting_me_at_risk_of_health_problems,What’s your Body Mass Index (BMI)? Learn how t...,"b'<div class=""ExternalClassE93BEC3784C545A286B...",...,What's a Healthy Body Mass Index?\nWe have all...,Well-being & Lifestyle,"Food, Diet and Nutrition",41994,0.688354,0.311646,0.491689,0.364433,0.018035,0.018035
1,1445137,7-month-baby Diet: An Authoritative Guide by O...,7-month-baby Diet: An Authoritative Guide by O...,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/meal-ide...,www.healthhub.sg/live-healthy/meal-ideas-month-7,meal-ideas-month-7,Your little one is now 7 months of age. Should...,"b'<div class=""ExternalClass46E64333542C4D8CBEA...",...,By Health Promotion Board in collaboration wit...,Well-being & Lifestyle,"Food, Diet and Nutrition",38812,0.693181,0.306819,0.870324,0.381544,0.016669,0.034704


In [23]:
# Discard rows with no extracted body text, then build two parallel lists:
# the body texts (model input) and the matching titles (used for plot labels).
df = df[df["extracted_content_body"].notna()]
docs = df["extracted_content_body"].tolist()
doc_titles = df["title"].tolist()
len(docs)

[1;36m1153[0m

## Without hyperparameter tuning

In [47]:
# Baseline BERTopic pipeline with hand-picked (untuned) settings.
# Each sub-model below is one stage of the pipeline, in order:
# embed -> reduce -> cluster -> tokenize -> weight terms -> represent topics.

# Stage 1: sentence embeddings.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Stage 2: UMAP down to 5 components; random_state pins the projection.
umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0,
    metric="cosine", low_memory=False, random_state=42,
)

# Stage 3: HDBSCAN on the reduced embeddings ("manhattan" is the other metric
# that was considered here).
hdbscan_model = HDBSCAN(
    min_cluster_size=15, min_samples=5,
    metric="euclidean", cluster_selection_method="leaf",
    prediction_data=True,
)

# Stage 4: unigram bag-of-words with English stop words removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 1), stop_words="english")

# Stage 5: class-based TF-IDF, down-weighting frequent words.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Stage 6 (optional): refine the topic keywords. The alternative tried here was
# MaximalMarginalRelevance(diversity=0.3).
representation_model = KeyBERTInspired()

# Assemble the pipeline. `nr_topics` is left at its default (a fixed value such
# as 15 was considered but is not applied).
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

In [48]:
# Fit the baseline topic model on the article bodies. `topics` is later zipped
# with `doc_titles`, so it is used as one topic id per document; `probs` is the
# second value returned by `fit_transform`.
topics, probs = topic_model.fit_transform(docs)

In [49]:
# Tabulate the fitted topics and persist the table as a CSV file.
result = topic_model.get_topic_info()
result.to_csv(path_or_buf="bertopic_live_healthy_ngram1_keybert.csv")

In [32]:
# Scatter-plot every document, coloured by topic, with
# "<title> - Topic <id>" as the hover label.
#
# `visualize_documents` expects 2-D coordinates in `reduced_embeddings`. The
# topic model's own UMAP projection (`topic_model.umap_model.embedding_`) was
# fitted with n_components=5, so passing it would draw only two of its five
# components. Project the sentence embeddings to 2-D separately for the plot.
top_topics = topic_model.get_topic_freq()["Topic"].tolist()

doc_embeddings = embedding_model.encode(docs, show_progress_bar=False)
reduced_embeddings_2d = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # keep the layout reproducible across runs
).fit_transform(doc_embeddings)

# Titles are passed in place of the raw documents so the hover text stays short.
hover_data = [
    f"{title} - Topic {topic}"
    for title, topic in zip(doc_titles, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=reduced_embeddings_2d,
    topics=top_topics,
    title="Topics",
)
visualization.show()

## With hyperparameter tuning

In [65]:
# Grid-search HDBSCAN settings on the UMAP-reduced document embeddings and keep
# the combination with the highest `relative_validity_` (DBCV) score.
embeddings = topic_model.umap_model.embedding_  # reduced embeddings from umap

# The score can be negative, so start below any attainable value; this way the
# first scored combination is always recorded and `best_parameters` is always
# bound when the loop finishes.
best_score = float("-inf")
best_parameters = None

for min_cluster_size in range(2, 11):
    for min_samples in range(1, 11):
        for cluster_selection_method in ["eom", "leaf"]:
            for metric in ["euclidean", "manhattan"]:
                # for each combination of parameters of hdbscan
                hdb = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    cluster_selection_method=cluster_selection_method,
                    metric=metric,
                    gen_min_span_tree=True,  # needed for relative_validity_
                ).fit(embeddings)
                # DBCV score
                score = hdb.relative_validity_
                if score > best_score:
                    best_score = score
                    best_parameters = {
                        "min_cluster_size": min_cluster_size,
                        "min_samples": min_samples,
                        "cluster_selection_method": cluster_selection_method,
                        "metric": metric,
                    }

# Fail here with a clear message rather than later when the parameters are used.
if best_parameters is None:
    raise ValueError("HDBSCAN grid search produced no comparable DBCV score")

print(f"Best DBCV score: {best_score:.3f}")
print(f"Best parameters: {best_parameters}")

Best DBCV score: 0.337
Best parameters: {'min_cluster_size': 8, 'min_samples': 3, 'cluster_selection_method': 'eom', 'metric': 'manhattan'}


In [100]:
# Tuned BERTopic pipeline: same embedding and UMAP settings as the baseline
# pipeline earlier in this notebook, but HDBSCAN takes the grid-search winners
# stored in `best_parameters`.
# Note that this run also differs from the baseline in two other ways: bigrams
# are included (ngram_range=(1, 2)) and topics are represented with
# MaximalMarginalRelevance instead of KeyBERTInspired.

# Stage 1: sentence embeddings.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Stage 2: UMAP down to 5 components; random_state pins the projection.
umap_model = UMAP(
    n_neighbors=15, n_components=5, min_dist=0.0,
    metric="cosine", low_memory=False, random_state=42,
)

# Stage 3: HDBSCAN configured from the four tuned settings.
hdbscan_model = HDBSCAN(
    prediction_data=True,
    **{
        key: best_parameters[key]
        for key in (
            "min_samples",
            "min_cluster_size",
            "metric",
            "cluster_selection_method",
        )
    },
)

# Stage 4: unigrams and bigrams with English stop words removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Stage 5: class-based TF-IDF, down-weighting frequent words.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Stage 6 (optional): refine the topic keywords. KeyBERTInspired() was the
# alternative used for the baseline run.
representation_model = MaximalMarginalRelevance(diversity=0.3)

# Assemble the pipeline. `nr_topics` is left at its default (a fixed value such
# as 15 was considered but is not applied).
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

In [None]:
# Fit the tuned topic model on the article bodies. This rebinds `topics` and
# `probs`, replacing the values from the baseline run.
topics, probs = topic_model.fit_transform(docs)

In [103]:
# Tabulate the topics found by the tuned model and display the table.
# This rebinds `result`, which is also what the final export cell writes out.
result = topic_model.get_topic_info()
result

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,218,-1_healthier_rice_vegetables_healthy,"[healthier, rice, vegetables, healthy, healthh...",[Weight management - it's easier than you thin...
1,0,51,0_stress_esteem_self esteem_resilience,"[stress, esteem, self esteem, resilience, anxi...",[What is Puberty?\nWhen your child starts to m...
2,1,49,1_type diabetes_blood sugar_gdm_tea,"[type diabetes, blood sugar, gdm, tea, gestati...",[#1 Raw and Brown Sugar Are Not Healthier\nWit...
3,2,42,2_influenza_vaccines_fever_rubella,"[influenza, vaccines, fever, rubella, vaccinat...",[By Health Promotion Board in collaboration wi...
4,3,37,3_screen time_activities_children_skills,"[screen time, activities, children, skills, ga...",[The lack of outdoor activity among children c...
5,4,36,4_tbsp_sauce_fry_serve,"[tbsp, sauce, fry, serve, garlic, chopped, oni...",[By KK Womens and Childrens Hospital and Ms He...
6,5,32,5_dental_tooth decay_baby teeth_gums,"[dental, tooth decay, baby teeth, gums, caviti...",[Your child should have a full set of baby tee...
7,6,29,6_bmi_obesity_obese_percentile,"[bmi, obesity, obese, percentile, abdominal ob...",[A high Body Mass Index (BMI) suggests that yo...
8,7,29,7_quit smoking_withdrawal_milestone_nrt,"[quit smoking, withdrawal, milestone, nrt, cra...",[1. Gear Up to Quit Smoking\nIf you haven't al...
9,8,28,8_hawker_dishes_healthy plate_rice,"[hawker, dishes, healthy plate, rice, eat, qua...","[Do you fancy having a healthier, fitter body?..."


In [104]:
# Scatter-plot every document, coloured by its topic from the tuned model, with
# "<title> - Topic <id>" as the hover label.
#
# `visualize_documents` expects 2-D coordinates in `reduced_embeddings`. The
# topic model's own UMAP projection (`topic_model.umap_model.embedding_`) was
# fitted with n_components=5, so passing it would draw only two of its five
# components. Project the sentence embeddings to 2-D separately for the plot.
top_topics = topic_model.get_topic_freq()["Topic"].tolist()

doc_embeddings = embedding_model.encode(docs, show_progress_bar=False)
reduced_embeddings_2d = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # keep the layout reproducible across runs
).fit_transform(doc_embeddings)

# Titles are passed in place of the raw documents so the hover text stays short.
hover_data = [
    f"{title} - Topic {topic}"
    for title, topic in zip(doc_titles, topics)
]
visualization = topic_model.visualize_documents(
    hover_data,
    reduced_embeddings=reduced_embeddings_2d,
    topics=top_topics,
    title="Topics",
)
visualization.show()

In [106]:
# Write the topic table to CSV without its last column (selected by position).
columns_to_export = result.columns[:-1]
result[columns_to_export].to_csv("bertopic_output_livehealthy.csv")