In [1]:
%load_ext kedro.ipython

In [2]:
import re
import unicodedata

from bs4 import BeautifulSoup

In [3]:
def clean_text(text: str) -> str:
    # Normalize Unicode characters
    text = unicodedata.normalize("NFKD", text)

    # Replace common problematic characters
    text = text.replace("\xa0", " ")  # non-breaking space
    text = text.replace("\u200b", "")  # zero-width space
    text = text.replace("\u2028", "\n")  # line separator
    text = text.replace("\u2029", "\n")  # paragraph separator
    # Replace multiple whitespace with single space
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def extract_content(html_content: str) -> str:
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all <br> tags and replace them with newline
    for br in soup.find_all("br"):
        br.replace_with("\n")

    related_sections = []
    read_these_next_ul = None
    # Extract "Related:" sections and "Read these next:" items
    for tag in soup.find_all(["p", "ul"]):
        if tag.name == "p" and tag.find("strong"):
            if "Related:" in tag.text:
                related_sections.append(re.sub(r"Related: ", "", clean_text(tag.text)))
            elif "Read these next:" in tag.text:
                read_these_next_ul = tag.find_next_sibling("ul")
        elif tag == read_these_next_ul:
            for li in tag.find_all("li"):
                related_sections.append(clean_text(li.text))

    # Extract the main content
    content = []
    for tag in soup.find_all(["h2", "h3", "h4", "p", "ul", "ol"]):
        if tag.name in ["h2", "h3", "h4"]:
            content.append(clean_text(tag.text))

        elif tag.name == "p":
            # Remove all em tags
            for em in tag.find_all("em"):
                em.extract()

            # Get the remaining text
            text = tag.get_text()

            # Remove sentences about HealthHub app, Google Play, and Apple Store
            if not re.search(
                r"(HealthHub app|Google Play|Apple Store|Parent Hub)", text
            ):
                if tag.find("strong"):
                    if "Related:" in tag.text:
                        text = clean_text(tag.text)
                        content.append(re.sub(r"\n", " ", text))

                    elif "Read these next:" in tag.text:
                        content.append(clean_text(tag.text))
                else:
                    content.append(clean_text(text))

        # For unordered lists
        elif (
            tag.name == "ul" and tag.parent.name == "div"
        ):  # not "ul" so we avoid duplicates
            for li in tag.find_all("li"):
                content.append("- " + clean_text(li.text))

        # For ordred lists
        elif tag.name == "ol":
            for i, li in enumerate(tag.find_all("li")):
                content.append(f"{i + 1}. " + clean_text(li.text))

        content.append("")  # Add a blank line after each element

    # Remove empty strings from content
    content = [c for c in content if c]

    return related_sections, "\n".join(content)

In [4]:
# ruff: noqa: F821
catalog.list()


[1m[[0m
    [32m'all_contents'[0m,
    [32m'all_contents_extracted'[0m,
    [32m'all_extracted_text'[0m,
    [32m'parameters'[0m,
    [32m'params:content_category'[0m,
    [32m'params:columns_to_keep'[0m,
    [32m'params:columns_to_keep.cost-and-financing'[0m,
    [32m'params:columns_to_keep.diseases-and-conditions'[0m,
    [32m'params:columns_to_keep.health-statistics'[0m,
    [32m'params:columns_to_keep.live-healthy-articles'[0m,
    [32m'params:columns_to_keep.medical-care-and-facilities'[0m,
    [32m'params:columns_to_keep.medications'[0m,
    [32m'params:columns_to_keep.program-sub-pages'[0m,
    [32m'params:columns_to_keep.programs'[0m,
    [32m'params:columns_to_keep.support-group-and-others'[0m,
    [32m'params:metadata'[0m,
    [32m'params:metadata.cost-and-financing'[0m,
    [32m'params:metadata.cost-and-financing.content_title'[0m,
    [32m'params:metadata.cost-and-financing.content_body'[0m,
    [32m'params:metadata.diseases-and-con

### Load all the contents

In [5]:
# ruff: noqa: F821
all_contents = catalog.load("all_contents")
all_contents


[1m{[0m
    [32m'export-published-cost-and-financing_14062024_data.xlsx'[0m: [1m<[0m[1;95mbound[0m[39m method AbstractVersionedDataset.load of <kedro_datasets.pandas.excel_dataset.ExcelDataset object at [0m[1;36m0x1679cffb0[0m[39m>>,[0m
[39m    [0m[32m'export-published-diseases-and-conditions_13062024_data.xlsx'[0m[39m: <bound method AbstractVersionedDataset.load of <kedro_datasets.pandas.excel_dataset.ExcelDataset object at [0m[1;36m0x1679cf3e0[0m[39m>>,[0m
[39m    [0m[32m'export-published-health-statistics_14062024_data.xlsx'[0m[39m: <bound method AbstractVersionedDataset.load of <kedro_datasets.pandas.excel_dataset.ExcelDataset object at [0m[1;36m0x1679ce9f0[0m[39m>>,[0m
[39m    [0m[32m'export-published-live-healthy-articles_14062024_data.xlsx'[0m[39m: <bound method AbstractVersionedDataset.load of <kedro_datasets.pandas.excel_dataset.ExcelDataset object at [0m[1;36m0x167a48c20[0m[39m>>,[0m
[39m    [0m[32m'export-published-medical-care

### Load the content category

In [6]:
# ruff: noqa: F821
content_category = catalog.load("params:content_category")
content_category

[32m'live-healthy-articles'[0m

### Use `content_category` as key to access the corresponding content

In [7]:
df = all_contents[f"export-published-{content_category}_14062024_data.xlsx"]()
# Drop all columns which have only null values
df = df.dropna(axis=1, how="all")
print(df.shape)

(1155, 60)


### View all columns to keep for corresponding content category

In [8]:
# ruff: noqa: F821
columns_to_keep = catalog.load("params:columns_to_keep")
columns_to_keep


[1m{[0m
    [32m'cost-and-financing'[0m: [1m[[0m
        [32m'id'[0m,
        [32m'Content.Name'[0m,
        [32m'CostAndFinancing_Title'[0m,
        [32m'CostAndFinancing_ArticleCatNames'[0m,
        [32m'CostAndFinancing_CoverImgUrl'[0m,
        [32m'CostAndFinancing_FullUrl'[0m,
        [32m'CostAndFinancing_FullUrl2'[0m,
        [32m'CostAndFinancing_FriendlyUrl'[0m,
        [32m'CostAndFinancing_CategoryDesc'[0m,
        [32m'CostAndFinancing_ContentBody'[0m,
        [32m'CostAndFinancing_ENKeywords'[0m,
        [32m'CostAndFinancing_FeatureTitle'[0m,
        [32m'CostAndFinancing_PRName'[0m,
        [32m'CostAndFinancing_AlternateImageText'[0m,
        [32m'CostAndFinancing_DateModified'[0m,
        [32m'CostAndFinancing_NumberofViews'[0m,
        [32m'CostAndFinancing_LastMonthViewCount'[0m,
        [32m'CostAndFinancing_LastTwoMonthsView'[0m,
        [32m'Page Views'[0m,
        [32m'Engagement Rate'[0m,
        [32m'Bounce Rate'

### Get columns to drop corresponding to `content_category`

In [9]:
relevant_columns = columns_to_keep[content_category]
relevant_columns


[1m[[0m
    [32m'id'[0m,
    [32m'Content.Name'[0m,
    [32m'LiveHealthyArticle_Title'[0m,
    [32m'LiveHealthyArticle_ArticleCateName'[0m,
    [32m'LiveHealthyArticle_CoverImgUrl'[0m,
    [32m'LiveHealthyArticle_FullUrl'[0m,
    [32m'LiveHealthyArticle_FullUrl2'[0m,
    [32m'LiveHealthyArticle_FriendlyUrl'[0m,
    [32m'LiveHealthyArticle_CategoryDes'[0m,
    [32m'LiveHealthyArticle_Content_Body'[0m,
    [32m'LiveHealthyArticle_ENKeywords'[0m,
    [32m'LiveHealthyArticle_FeatureTitle'[0m,
    [32m'LiveHealthyArticle_PRName'[0m,
    [32m'LiveHealthyArticle_AlternateImageText'[0m,
    [32m'LiveHealthyArticle_DateModified'[0m,
    [32m'LiveHealthyArticle_Number_of_View'[0m,
    [32m'LiveHealthyArticle_Lastmonthview'[0m,
    [32m'LiveHealthyArticle_LastTwoMonthsView'[0m,
    [32m'Page Views'[0m,
    [32m'Engagement Rate'[0m,
    [32m'Bounce Rate'[0m,
    [32m'Exit Rate'[0m,
    [32m'Scroll %'[0m,
    [32m'% of Total Views'[0m,
    [32m'Cu

In [10]:
# Keep all relevant columns
df = df[relevant_columns]
print(df.shape)

(1155, 25)


In [11]:
# Remaining columns
df.columns


[1;35mIndex[0m[1m([0m[1m[[0m[32m'id'[0m, [32m'Content.Name'[0m, [32m'LiveHealthyArticle_Title'[0m,
       [32m'LiveHealthyArticle_ArticleCateName'[0m, [32m'LiveHealthyArticle_CoverImgUrl'[0m,
       [32m'LiveHealthyArticle_FullUrl'[0m, [32m'LiveHealthyArticle_FullUrl2'[0m,
       [32m'LiveHealthyArticle_FriendlyUrl'[0m, [32m'LiveHealthyArticle_CategoryDes'[0m,
       [32m'LiveHealthyArticle_Content_Body'[0m, [32m'LiveHealthyArticle_ENKeywords'[0m,
       [32m'LiveHealthyArticle_FeatureTitle'[0m, [32m'LiveHealthyArticle_PRName'[0m,
       [32m'LiveHealthyArticle_AlternateImageText'[0m,
       [32m'LiveHealthyArticle_DateModified'[0m, [32m'LiveHealthyArticle_Number_of_View'[0m,
       [32m'LiveHealthyArticle_Lastmonthview'[0m,
       [32m'LiveHealthyArticle_LastTwoMonthsView'[0m, [32m'Page Views'[0m, [32m'Engagement Rate'[0m,
       [32m'Bounce Rate'[0m, [32m'Exit Rate'[0m, [32m'Scroll %'[0m, [32m'% of Total Views'[0m,
       [32m'C

### Drop all articles with no content

In [12]:
df = df[
    df["LiveHealthyArticle_Content_Body"].apply(
        lambda x: True if re.search(r"(<[div|p|h2].*?>)", x) else False
    )
].reset_index(drop=True)
print(df.shape)

(1149, 25)


In [13]:
tmp = df.sample(5, random_state=42).reset_index(drop=True)
tmp

Unnamed: 0,id,Content.Name,LiveHealthyArticle_Title,LiveHealthyArticle_ArticleCateName,LiveHealthyArticle_CoverImgUrl,LiveHealthyArticle_FullUrl,LiveHealthyArticle_FullUrl2,LiveHealthyArticle_FriendlyUrl,LiveHealthyArticle_CategoryDes,LiveHealthyArticle_Content_Body,...,LiveHealthyArticle_Number_of_View,LiveHealthyArticle_Lastmonthview,LiveHealthyArticle_LastTwoMonthsView,Page Views,Engagement Rate,Bounce Rate,Exit Rate,Scroll %,% of Total Views,Cumulative % of Total Views
0,1443022,How To Identify And Deal With Depression,How To Identify And Deal With Depression,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/how-to-i...,www.healthhub.sg/live-healthy/how-to-identify-...,how-to-identify-and-deal-with-depression,"Learn how to spot the signs of depression, get...","<div class=""ExternalClass46FF4530AAF64858A79C3...",...,27637.0,1100.0,1947.0,1622,0.863296,0.136704,0.549711,0.364365,0.001371,0.575815
1,1444928,Signs of Stress: Could Stress Be Good for You?,Signs of Stress: Could Stress Be Good for You?,"Mind and Balance,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/stress_c...,www.healthhub.sg/live-healthy/stress_can_be_go...,stress_can_be_good_for_you,"Stressful situations, whether at school or at ...","<div class=""ExternalClassA58275AB1DD8443085927...",...,1468.0,129.0,159.0,331,0.883784,0.116216,0.0,0.306647,0.00028,0.938865
2,1442901,Coping with your two-year-old: when a firm han...,Coping with your two-year-old: when a firm han...,,https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/baby-cop...,www.healthhub.sg/live-healthy/baby-coping-with...,baby-coping-with-your-2-year-old-when-a-firm-h...,Your two-year-old can be a handful. But here a...,"<div class=""ExternalClass5E6AB2968188425D9CEDB...",...,295.0,42.0,81.0,650,0.808219,0.191781,0.583815,0.366923,0.000549,0.825824
3,1445194,Common Eye Conditions in Your Pre-Schooler,Common Eye Conditions in Your Pre-Schooler,"Body Care,Child and Teen Health,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/eye-care...,www.healthhub.sg/live-healthy/eye-care-for-pre...,eye-care-for-pre-schooler,Learn about the common eye conditions your chi...,"<div class=""ExternalClass4D426A5EDDCF40BA8C223...",...,0.0,0.0,0.0,1244,0.831234,0.168766,0.531587,0.398513,0.001051,0.655277
4,1445895,Managing Wandering Dementia Patients,Managing Wandering Dementia Patients,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/managing...,www.healthhub.sg/live-healthy/managing-wanderi...,managing-wandering-behaviour-in-dementia,One of your family members has just been diagn...,"<div class=""ExternalClassB401DB0539BD4790885E5...",...,712.0,21.0,32.0,413,0.891648,0.108352,0.199074,0.336562,0.000349,0.912451


### Get metadata for corresponding content category

In [14]:
# ruff: noqa: F821
metadata = catalog.load("params:metadata")
metadata


[1m{[0m
    [32m'cost-and-financing'[0m: [1m{[0m
        [32m'content_title'[0m: [32m'CostAndFinancing_Title'[0m,
        [32m'content_body'[0m: [32m'CostAndFinancing_ContentBody'[0m
    [1m}[0m,
    [32m'diseases-and-conditions'[0m: [1m{[0m
        [32m'content_title'[0m: [32m'DiseasesCondition_Title'[0m,
        [32m'content_body'[0m: [32m'DiseasesCondition_ContentBody'[0m
    [1m}[0m,
    [32m'health-statistics'[0m: [1m{[0m
        [32m'content_title'[0m: [32m'HealthStatistics_Title'[0m,
        [32m'content_body'[0m: [32m'HealthStatistics_ContentBody'[0m
    [1m}[0m,
    [32m'live-healthy-articles'[0m: [1m{[0m
        [32m'content_title'[0m: [32m'LiveHealthyArticle_Title'[0m,
        [32m'content_body'[0m: [32m'LiveHealthyArticle_Content_Body'[0m
    [1m}[0m,
    [32m'medical-care-and-facilities'[0m: [1m{[0m
        [32m'content_title'[0m: [32m'MedicalCareFaci_Title'[0m,
        [32m'content_body'[0m: [32m'Medica

In [15]:
content_title = metadata[content_category]["content_title"]
content_body = metadata[content_category]["content_body"]

print(content_title)
print(content_body)

LiveHealthyArticle_Title
LiveHealthyArticle_Content_Body


In [16]:
tmp["related_sections"] = None
tmp["extracted_content_body"] = None

for index, row in tmp.iterrows():
    title = row[content_title]
    print(title)

    html_content = row[content_body]
    related_sections, extracted_content_body = extract_content(html_content)

    tmp.at[index, "related_sections"] = related_sections
    tmp.at[index, "extracted_content_body"] = extracted_content_body

How To Identify And Deal With Depression
Signs of Stress: Could Stress Be Good for You?
Coping with your two-year-old: when a firm hand is needed
Common Eye Conditions in Your Pre-Schooler
Managing Wandering Dementia Patients


In [17]:
tmp.iloc[0]["related_sections"]


[1m[[0m
    [32m'Myths and Misconceptions About Depression'[0m,
    [32m'Coping with Depression'[0m,
    [32m'Be a Master of Stress'[0m,
    [32m'Feeling Depressed? You Are Not Alone'[0m,
    [32m'Helping Youth Deal With Depression'[0m,
    [32m'Myths and Misconceptions About Depression'[0m
[1m][0m

In [18]:
tmp.iloc[0]["extracted_content_body"]

[32m'Signs of Depression\nIt is normal to feel sad when we lose a loved one, fail an exam, or even end a relationship. However, when feelings of sadness is prolonged and affects our daily lives, it is symptomatic of depression.\nOther signs of depression include:\n- A loss of interest in activities previously enjoyed\n- Weight loss or weight gain; or decrease or increase in appetite\n- Difficulty falling asleep or staying asleep; or sleeping excessively\n- Feeling agitated or restless\n- Feeling tired and lacking the energy\n- Feelings of worthlessness or excessive guilt\n- Difficulty concentrating or having trouble thinking\n- Frequent thoughts of death or suicide\nRelated: Myths and Misconceptions About Depression\nGetting Help\nIf you have been constantly feeling sad and experiencing some or all of the symptoms above, it is appropriate to seek professional medical help, especially so if thoughts of death or suicide are present. Do not self-diagnose or self-manage without speaking t

In [19]:
tmp

Unnamed: 0,id,Content.Name,LiveHealthyArticle_Title,LiveHealthyArticle_ArticleCateName,LiveHealthyArticle_CoverImgUrl,LiveHealthyArticle_FullUrl,LiveHealthyArticle_FullUrl2,LiveHealthyArticle_FriendlyUrl,LiveHealthyArticle_CategoryDes,LiveHealthyArticle_Content_Body,...,LiveHealthyArticle_LastTwoMonthsView,Page Views,Engagement Rate,Bounce Rate,Exit Rate,Scroll %,% of Total Views,Cumulative % of Total Views,related_sections,extracted_content_body
0,1443022,How To Identify And Deal With Depression,How To Identify And Deal With Depression,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/how-to-i...,www.healthhub.sg/live-healthy/how-to-identify-...,how-to-identify-and-deal-with-depression,"Learn how to spot the signs of depression, get...","<div class=""ExternalClass46FF4530AAF64858A79C3...",...,1947.0,1622,0.863296,0.136704,0.549711,0.364365,0.001371,0.575815,"[Myths and Misconceptions About Depression, Co...",Signs of Depression\nIt is normal to feel sad ...
1,1444928,Signs of Stress: Could Stress Be Good for You?,Signs of Stress: Could Stress Be Good for You?,"Mind and Balance,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/stress_c...,www.healthhub.sg/live-healthy/stress_can_be_go...,stress_can_be_good_for_you,"Stressful situations, whether at school or at ...","<div class=""ExternalClassA58275AB1DD8443085927...",...,159.0,331,0.883784,0.116216,0.0,0.306647,0.00028,0.938865,"[_x000D_ Exam Stress Busting Tips, _x000D_ 6 M...",Stress Can Be Your Friend\nWhen examinations a...
2,1442901,Coping with your two-year-old: when a firm han...,Coping with your two-year-old: when a firm han...,,https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/baby-cop...,www.healthhub.sg/live-healthy/baby-coping-with...,baby-coping-with-your-2-year-old-when-a-firm-h...,Your two-year-old can be a handful. But here a...,"<div class=""ExternalClass5E6AB2968188425D9CEDB...",...,81.0,650,0.808219,0.191781,0.583815,0.366923,0.000549,0.825824,[],How to Deal with Two Year Old Tantrums?\nAs ba...
3,1445194,Common Eye Conditions in Your Pre-Schooler,Common Eye Conditions in Your Pre-Schooler,"Body Care,Child and Teen Health,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/eye-care...,www.healthhub.sg/live-healthy/eye-care-for-pre...,eye-care-for-pre-schooler,Learn about the common eye conditions your chi...,"<div class=""ExternalClass4D426A5EDDCF40BA8C223...",...,0.0,1244,0.831234,0.168766,0.531587,0.398513,0.001051,0.655277,"[Healthy Eyes, Clear Vision, Different Spectac...","In Singapore, children are also becoming myopi..."
4,1445895,Managing Wandering Dementia Patients,Managing Wandering Dementia Patients,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/managing...,www.healthhub.sg/live-healthy/managing-wanderi...,managing-wandering-behaviour-in-dementia,One of your family members has just been diagn...,"<div class=""ExternalClassB401DB0539BD4790885E5...",...,32.0,413,0.891648,0.108352,0.199074,0.336562,0.000349,0.912451,[My Loved One Has Dementia. What Do I Do?],Wandering is common among people with dementia...
