Purpose of this notebook is to perform Keyword Modelling on the smaller groups from `iQuit` to `Get Healthhub Track`

Please run `group_program_subpages.ipynb` first to get the relevant excel file to run this notebook.

Please check that there's a folder named `bertopic program subpages` created in this `notebooks` folder in order to store the created excels later on. If not, there is a cell under `Setup` to create the folder.

<hr>

## Setup

In [1]:
%load_ext kedro.ipython

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hinat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [1]:
from typing import Any

import os
import nltk
import pandas as pd
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from pytictoc import TicToc

nltk.download("punkt_tab")

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hinat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
folder_path = 'bertopic program subpages'
if not os.path.exists(folder_path):
    # If it doesn't exist, create the folder
    os.makedirs(folder_path)
    print(f"Folder created: {folder_path}")
else:
    print(f"Folder already exists: {folder_path}")

Folder already exists: bertopic program subpages


<hr>

## KeyBert Template

In [4]:
def extract_keywords(
    merged_data: pd.DataFrame,
    cfg: dict[str, Any],
    only_confirmed_option: list[str],
    all_option: list[str],
    model: str,
    spacy_pipeline: str,
    stop_words: str,
    workers: int,
    use_mmr: bool,
    diversity: float,
    top_n: int,
) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model based on the provided parameters and
    return the DataFrame with the added `keybert_keywords` column containing the keywords.

    Args:
        merged_data (pd.DataFrame): The DataFrame containing the merged data.
        cfg (dict[str, Any]): The configuration dictionary containing the options to subset the merged data.
        only_confirmed_option (list[str]): The list of confirmed content categories if option is `only_confirmed`.
        all_option (list[str]): The list of all content categories if option is `all`.
        model (str): Use a custom embedding model. See https://maartengr.github.io/KeyBERT/guides/embeddings.html
        spacy_pipeline (str): The spaCy pipeline to be used for part-of-speech tagging. Standard is the 'en' pipeline.
        stop_words (str): The stop words to be used for keyphrase extraction.
        workers (int): Number of workers for spaCy part-of-speech tagging. To use all workers, set it to -1.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR) for keyphrase extraction.
        diversity (float): The diversity parameter for keyphrase extraction.
        top_n (int): The number of top keywords to extract.

    Returns:
         pd.DataFrame: The dataframe with the extracted keywords.
    """
    option = cfg["option"]
    contributor = cfg["contributor"]  # TODO: To allow for options other than HPB
    to_remove = cfg["to_remove"]

    # Subset the merged data based on the content categories provided as option
    if option == "only_confirmed":
        assert set(only_confirmed_option).issubset(
            set(all_option)
        ), "Invalid option(s). Please ensure selected content categories exist."
        filtered_data = merged_data.query("content_category in @only_confirmed_option")
    elif option == "all":
        filtered_data = merged_data.copy()
    else:
        assert (
            option in all_option
        ), "Invalid option. Please ensure selected content category exists."
        filtered_data = merged_data.query("content_category == @option")

    # To remove flagged articles or not and to subset by contributor
    if to_remove:
        filtered_data = filtered_data.query(
            f"pr_name == '{contributor}' and to_remove == {not to_remove}"
        ).reset_index(drop=True)
    else:
        filtered_data = filtered_data.query(f"pr_name == '{contributor}'").reset_index(
            drop=True
        )

    # Extract the raw content body text
    docs = filtered_data["extracted_content_body"].to_list()

    kw_model = KeyBERT(model)
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers
    )

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

    # We iterate through the keywords, and reverse the order of the keywords
    # from the closest to the most distant and taking only the keywords themselves,
    # ignoring the distances
    keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    # Store keywords in new column
    filtered_data_with_keywords = filtered_data.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

<hr>

## 10. iQuit

In [10]:
def extract_keywords_vaping(
        df_vaping: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df_vaping (pd.DataFrame): The DataFrame containing the vaping content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the vaping DataFrame
    docs = df_vaping["extracted_content_body"].to_list()
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

    # We iterate through the keywords, and reverse the order of the keywords
    # from the closest to the most distant and taking only the keywords themselves,
    # ignoring the distances
    keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    # Store keywords in new column
    filtered_data_with_keywords = df_vaping.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

# Load the vaping data from the Excel sheet
df_vaping = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="iquit_concatenated")
df_vaping_with_keywords = extract_keywords_vaping(
    df_vaping=df_vaping,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=5  # Number of top keywords to extract
)

# Save the result to CSV
df_vaping_with_keywords.to_csv(r"bertopic program subpages/original/keyBert_programSubPages_iquit.csv", index=False)

Elapsed time is 10.134201 seconds.


## 11. Korang Ok

In [11]:
def extract_keywords_korangok(
        df_korangok: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df_korangok (pd.DataFrame): The DataFrame containing the korangok content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the korangok DataFrame
    docs = df_korangok["extracted_content_body"].to_list()
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

    # We iterate through the keywords, and reverse the order of the keywords
    # from the closest to the most distant and taking only the keywords themselves,
    # ignoring the distances
    keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    # Store keywords in new column
    filtered_data_with_keywords = df_korangok.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

# Load the korangok data from the Excel sheet
df_korangok = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="korangok")
df_korangok_with_keywords = extract_keywords_korangok(
    df_korangok=df_korangok,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=5  # Number of top keywords to extract
)

# Save the result to CSV
df_korangok_with_keywords.to_csv(r"bertopic program subpages/original/keyBert_programSubPages_korangok.csv", index=False)

Elapsed time is 7.158860 seconds.


## 12. Screen for Life

In [12]:
def extract_keywords_screenForLife(
        df_screenForLife: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df_screenForLife (pd.DataFrame): The DataFrame containing the screenForLife content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the screenForLife DataFrame
    docs = df_screenForLife["extracted_content_body"].to_list()
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

    # We iterate through the keywords, and reverse the order of the keywords
    # from the closest to the most distant and taking only the keywords themselves,
    # ignoring the distances
    keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    # Store keywords in new column
    filtered_data_with_keywords = df_screenForLife.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

# Load the screenForLife data from the Excel sheet
df_screenForLife = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="Screen_for_Life")
df_screenForLife_with_keywords = extract_keywords_screenForLife(
    df_screenForLife=df_screenForLife,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=5  # Number of top keywords to extract
)

# Save the result to CSV
df_screenForLife_with_keywords.to_csv(r"bertopic program subpages/original/keyBert_programSubPages_screenForLife.csv", index=False)

Elapsed time is 7.755148 seconds.


## 13. Indian Outreach

In [13]:
def extract_keywords_indianOutreach(
        df_indianOutreach: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df_indianOutreach (pd.DataFrame): The DataFrame containing the indianOutreach content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the indianOutreach DataFrame
    docs = df_indianOutreach["extracted_content_body"].to_list()
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

    # We iterate through the keywords, and reverse the order of the keywords
    # from the closest to the most distant and taking only the keywords themselves,
    # ignoring the distances
    keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    # Store keywords in new column
    filtered_data_with_keywords = df_indianOutreach.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

# Load the indianOutreach data from the Excel sheet
df_indianOutreach = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="indian_outreach")
df_indianOutreach_with_keywords = extract_keywords_indianOutreach(
    df_indianOutreach=df_indianOutreach,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=5  # Number of top keywords to extract
)

# Save the result to CSV
df_indianOutreach_with_keywords.to_csv(r"bertopic program subpages/original/keyBert_programSubPages_indianOutreach.csv", index=False)

Elapsed time is 4.187970 seconds.


## 14. Healthia

In [14]:
def extract_keywords_healthia(
        df_healthia: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df_healthia (pd.DataFrame): The DataFrame containing the healthia content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the healthia DataFrame
    docs = df_healthia["extracted_content_body"].to_list()
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

    # We iterate through the keywords, and reverse the order of the keywords
    # from the closest to the most distant and taking only the keywords themselves,
    # ignoring the distances
    keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    # Store keywords in new column
    filtered_data_with_keywords = df_healthia.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

# Load the healthia data from the Excel sheet
df_healthia = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="healthia")
df_healthia_with_keywords = extract_keywords_healthia(
    df_healthia=df_healthia,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=5  # Number of top keywords to extract
)

# Save the result to CSV
df_healthia_with_keywords.to_csv(r"bertopic program subpages/original/keyBert_programSubPages_healthia.csv", index=False)

Elapsed time is 3.004148 seconds.


## 15. HH Rewards

In [15]:
def extract_keywords_hhRewards(
        df_hhRewards: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df_hhRewards (pd.DataFrame): The DataFrame containing the hhRewards content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the hhRewards DataFrame
    docs = df_hhRewards["extracted_content_body"].to_list()
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

        # Debugging information
        print(keywords)

     # Handle cases where the output might not be in the expected format
    if isinstance(keywords, (list, tuple)):
        if not isinstance(keywords[0], list):
            keywords = [keywords]  # Convert to a list of lists if necessary
        # Reverse order and extract only the keywords
        keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    else:
        print("Unexpected keywords format:", keywords)
        return df_hhRewards  # Return the original DataFrame if format is unexpected

    # Store keywords in new column
    filtered_data_with_keywords = df_hhRewards.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

# Load the hhRewards data from the Excel sheet
df_hhRewards = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="healthhub_rewards")
df_hhRewards_with_keywords = extract_keywords_hhRewards(
    df_hhRewards=df_hhRewards,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=5  # Number of top keywords to extract
)

# Save the result to CSV
df_hhRewards_with_keywords.to_csv(r"bertopic program subpages/original/keyBert_programSubPages_hhRewards.csv", index=False)

[('redeem healthpoints', 0.6304), ('wallet summary page', 0.1626), ('sms confirmation', 0.1532), ('immediate family members', 0.1106), ('transportation cost', 0.0105)]
Elapsed time is 3.088720 seconds.


## 16. Get HH Track

In [16]:
def extract_keywords_hhTrack(
        df_hhTrack: pd.DataFrame, 
        model: str, 
        spacy_pipeline: str, 
        stop_words: str,
        workers: int, 
        use_mmr: bool, 
        diversity: float, 
        top_n: int
    ) -> pd.DataFrame:
    """
    Extract keywords using KeyBERT model and return DataFrame with the added `keybert_keywords` column.

    Args:
        df_hhTrack (pd.DataFrame): The DataFrame containing the hhTrack content.
        model (str): The embedding model to be used in KeyBERT.
        spacy_pipeline (str): The spaCy pipeline for part-of-speech tagging.
        stop_words (str): The stop words for keyphrase extraction.
        use_mmr (bool): Whether to use Maximal Marginal Relevance (MMR).
        diversity (float): Diversity parameter for MMR.
        top_n (int): Number of top keywords to extract.

    Returns:
        pd.DataFrame: DataFrame with extracted keywords.
    """
    
    # Extract the raw content body text from the hhTrack DataFrame
    docs = df_hhTrack["extracted_content_body"].to_list()
    
    # Initialize KeyBERT model
    kw_model = KeyBERT(model=model)

    # Initialize the vectorizer with the specified spaCy pipeline and stop words
    vectorizer = KeyphraseTfidfVectorizer(
        spacy_pipeline, stop_words=stop_words, workers=workers)

    # Marginally more performant
    # See: https://github.com/MaartenGr/KeyBERT/issues/156
    with TicToc():
        counts = vectorizer.fit(docs)
        vectorizer.fit = lambda *args, **kwargs: counts

        # If keyphrase vectorizer is specified, `keyphrase_ngram_range` is ignored
        keywords = kw_model.extract_keywords(
            docs,
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n,
            vectorizer=vectorizer,
        )

        # Debugging information
        print(keywords)

     # Handle cases where the output might not be in the expected format
    if isinstance(keywords, (list, tuple)):
        if not isinstance(keywords[0], list):
            keywords = [keywords]  # Convert to a list of lists if necessary
        # Reverse order and extract only the keywords
        keywords = [[kw[0] for kw in kws[::-1]] for kws in keywords]
    else:
        print("Unexpected keywords format:", keywords)
        return df_hhTrack  # Return the original DataFrame if format is unexpected

    # Store keywords in new column
    filtered_data_with_keywords = df_hhTrack.copy()
    filtered_data_with_keywords[f"keywords_{model}"] = keywords

    return filtered_data_with_keywords

# Load the hhTrack data from the Excel sheet
df_hhTrack = pd.read_excel(r"program-sub-pages\cleaned_grouped_programSubpages.xlsx", sheet_name="get_healthhub_track")
df_hhTrack_with_keywords = extract_keywords_hhTrack(
    df_hhTrack=df_hhTrack,
    model="all-MiniLM-L6-v2",  # Example model
    spacy_pipeline="en_core_web_sm",  # Example spaCy pipeline
    stop_words="english",  # Example stop words
    workers=1,
    use_mmr=True,  # Whether to use Maximal Marginal Relevance
    diversity=0.7,  # Diversity parameter for MMR
    top_n=5  # Number of top keywords to extract
)

# Save the result to CSV
df_hhTrack_with_keywords.to_csv(r"bertopic program subpages/original/keyBert_programSubPages_hhTrack.csv", index=False)

[('healthhub track login', 0.7359), ('new password', 0.4093), ('track', 0.3183), ('email', 0.3083), ('flexibility', 0.0437)]
Elapsed time is 1.071324 seconds.
