Purpose of this notebook is to perform Keyword Modelling on the smaller groups from `iQuit` to `Get Healthhub Track`

Please run `group_program_subpages.ipynb` first to get the relevant excel file to run this notebook.

Please check that there's a folder named `topic program subpages` created in this `notebooks` folder in order to store the created excels later on. If not, there is a cell under `Setup` to create the folder.

<hr>

## Setup

In [1]:
%load_ext kedro.ipython

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hinat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
from typing import Any

import os
import nltk
import pandas as pd
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from pytictoc import TicToc

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hinat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


[3;92mTrue[0m

In [7]:
folder_path = ['topic program subpages', 'topic program subpages/keyBert']
for folder in folder_path:
    if not os.path.exists(folder):
        # If it doesn't exist, create the folder
        os.makedirs(folder)
        print(f"Folder created: {folder}")
    else:
        print(f"Folder already exists: {folder}")

Folder already exists: topic program subpages
Folder created: topic program subpages/keyBert


<hr>

## extract_keywords Function

In [5]:
def extract_keywords(
        df: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df (pd.DataFrame): The DataFrame containing the nsc content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the nsc DataFrame
    docs = df["extracted_content_body"].to_list()
    combined_doc = ["".join(docs)]
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(combined_doc)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            combined_doc,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

    print(keywords)

    # We iterate through the keywords, and reverse the order of the keywords
    # from the closest to the most distant and taking only the keywords themselves,
    # ignoring the distances
    keywords = [kw[0] for kw in keywords[::-1]]
    # Store keywords in new column
    filtered_data_with_keywords = df.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = [keywords] * len(df)

    return filtered_data_with_keywords

<hr>

## 4. NSC

In [8]:
# Load the nsc data from the Excel sheet
df_nsc = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="nsc_concatenated")
df_nsc_with_keywords = extract_keywords(
    df=df_nsc,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_nsc_with_keywords.to_excel(r"topic program subpages/keyBert/keyBert_programSubPages_nsc.xlsx", index=False)

Elapsed time is 28.291007 seconds.
[('wellness challenge', 0.4855), ('little more movement', 0.3441), ('camo 2nd ape tee fog essentials', 0.176), ('booking page', 0.1621), ('touch sensor', 0.1449), ('cabaran langkah', 0.1439), ('high blood pressure', 0.1226), ('general information', 0.1151), ('symptoms', 0.1023), ('incomplete scan progress', 0.0533)]


<hr>

## 5. Let's Move It

In [9]:
# Load the letsMoveIt data from the Excel sheet
df_letsMoveIt = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="LetsMoveIt_concatenated")
df_letsMoveIt_with_keywords = extract_keywords(
    df=df_letsMoveIt,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_letsMoveIt_with_keywords.to_excel(r"topic program subpages/keyBert/keyBert_programSubPages_letsMoveIt.xlsx", index=False)

Elapsed time is 18.627127 seconds.
[('singapore physical activity guidelines', 0.7504), ('functional balance', 0.2697), ('average age', 0.1871), ('permanent residents', 0.1401), ('excessive pregnancy', 0.1011), ('spags comprehensive information', 0.0193), ('start2move programme', 0.0169), ('different tabs', 0.0046), ('win', -0.0103), ('natural protein sources', -0.0272)]


<hr>

## 6. Nutrition Hub

In [13]:
# Load the nutritionHub data from the Excel sheet
df_nutritionHub = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="nutrition_hub_concatenated")
df_nutritionHub_with_keywords = extract_keywords(
    df=df_nutritionHub,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_nutritionHub_with_keywords.to_excel(r"topic program subpages/keyBert/keyBert_programSubPages_nutritionHub.xlsx", index=False)

Elapsed time is 17.139226 seconds.
[('sodium intake learn', 0.5219), ('pilot meal log challenge', 0.3749), ('nutribullet blender', 0.3733), ('caloric sweeteners', 0.3565), ('days bonus healthpoints', 0.2807), ('shop healthy passport', 0.1992), ('h365', 0.1445), ('menu labels', 0.1232), ('higher risks', 0.0867), ('scan qr code', 0.0377)]


<hr>

## 9. Screen for Life

In [15]:
# Load the screenForLife data from the Excel sheet
df_screenForLife = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="screen_for_life_concatenated")
df_screenForLife_with_keywords = extract_keywords(
    df=df_screenForLife,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_screenForLife_with_keywords.to_csv(r"topic program subpages/keyBert/keyBert_programSubPages_screenForLife.xlsx", index=False)

Elapsed time is 15.409471 seconds.
[('screening subsidies', 0.446), ('healthhub web portal', 0.3773), ('emails', 0.3227), ('childs birth information', 0.1887), ('inaccurate results', 0.1739), ('female singapore citizen', 0.1125), ('last mammogramcardiovascular risk', 0.0647), ('prev_step', 0.0392), ('persistent infection', -0.0047), ('heated thin loop wire', -0.0068)]


## 10. AAP

In [17]:
# Load the AAP data from the Excel sheet
df_AAP = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="AAP_concatenated")
df_AAP_with_keywords = extract_keywords(
    df=df_AAP,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_AAP_with_keywords.to_csv(r"topic program subpages/keyBert/keyBert_programSubPages_AAP.xlsx", index=False)

Elapsed time is 10.963290 seconds.
[('time singpass login', 0.5837), ('live assured', 0.2644), ('new app users', 0.2351), ('step guide', 0.1681), ('health outcomes', 0.1601), ('use ear', 0.1365), ('affordable cost', 0.1049), ('limit alcohol intake', 0.0848), ('stationary march', 0.0368), ('project silver screen', -0.0166)]


## 11. IQuit

In [18]:
# Load the vaping data from the Excel sheet
df_vaping = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="iquit_concatenated")
df_vaping_with_keywords = extract_keywords(
    df=df_vaping,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_vaping_with_keywords.to_csv(r"topic program subpages/keyBert/keyBert_programSubPages_iquit.xlsx", index=False)

Elapsed time is 10.167703 seconds.
[('quit programme', 0.4609), ('vape_view', 0.3545), ('national population health survey', 0.252), ('tob control', 0.1651), ('online first', 0.1303), ('main difference', 0.1032), ('warehouse raid cna', 0.0488), ('metal nanoparticles tin', 0.0386), ('sweating dizziness', 0.0371), ('causes infertility', 0.0342)]


## 11. Healthy 365

In [20]:
# Load the healthy365 data from the Excel sheet
df_healthy365 = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="healthy365_concatenated")
df_healthy365_with_keywords = extract_keywords(
    df=df_healthy365,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_healthy365_with_keywords.to_csv(r"topic program subpages/keyBert/keyBert_programSubPages_healthy365.xlsx", index=False)

Elapsed time is 5.733129 seconds.
[('time singpass login', 0.5398), ('enough healthpoints', 0.2978), ('simplygo mobile appsimplygo mobile app', 0.2552), ('multiple hpb evouchers', 0.2378), ('step instructions', 0.18), ('monday', 0.1696), ('retailer vouchers', 0.1486), ('activewallet pin', 0.1432), ('integrated transport hubs islandwide', 0.0166), ('unauthorised sources', -0.0664)]


## 12. Preventive Health

In [21]:
# Load the preventiveHealth data from the Excel sheet
df_preventiveHealth = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="preventive_health_concatenated")
df_preventiveHealth_with_keywords = extract_keywords(
    df=df_preventiveHealth,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_preventiveHealth_with_keywords.to_csv(r"topic program subpages/keyBert/keyBert_programSubPages_preventiveHealth.xlsx", index=False)

Elapsed time is 6.865510 seconds.
[('use hand sanitiser', 0.5414), ('health risks', 0.3096), ('peak flu season', 0.2996), ('pulmonary tb', 0.2799), ('faqs', 0.2106), ('hfmd spreads', 0.2037), ('gp clinics', 0.1619), ('ensure toys', 0.1158), ('public surfaces', 0.1114), ('metabolic', 0.04)]


## 13. STI

In [22]:
# Load the sti data from the Excel sheet
df_sti = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="sti_concatenated")
df_sti_with_keywords = extract_keywords(
    df=df_sti,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=10  # Number of top keywords to extract
)

# Save the result to CSV
df_sti_with_keywords.to_csv(r"topic program subpages/keyBert/keyBert_programSubPages_sti.xlsx", index=False)

Elapsed time is 3.367535 seconds.
[('hiv', 0.6392), ('implementation guide', 0.1447), ('weight loss', 0.1322), ('consistent treatment', 0.103), ('start', 0.0647), ('negative test', 0.0572), ('common surfaces', 0.0496), ('powerful impact', 0.0161), ('eligible singapore residents', -0.0313), ('different window periods', -0.0727)]
