## Import Packages

In [1]:
# Install required packages

# !pip install openpyxl beautifulsoup4 lxml html5lib
# !pip install htmlmin

In [2]:
# Import relevant packages

import json
import os
import re
import zipfile
from io import StringIO
from unicodedata import normalize

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# import htmlmin

## Get .xlsx files from ZIP archive

### Utility Functions

In [3]:
# Create zip archive


def create_archive(zip_path):
    archive = zipfile.ZipFile(zip_path, "r")
    return archive

In [4]:
# Get xlsx files from archive


def get_xlsx_from_archive(archive):
    files_and_dirs = archive.namelist()
    xlsx_files = list(filter(lambda k: k.split(".")[-1] == "xlsx", files_and_dirs))
    return xlsx_files

In [5]:
# Get file name


def get_fname(fpath):
    fname = fpath.split(".")[0].split("/")[-1]
    return fname

### Procedure

In [6]:
# Open zip file and extract directories and files


archive = create_archive("../data/GenAI - Full Content Export.zip")
files_and_dirs = archive.namelist()

print(files_and_dirs)

['Content Export with Data/', 'Content Export with Data/export-published-cost-and-financing_14062024_data.xlsx', 'Content Export with Data/export-published-diseases-and-conditions_13062024_data.xlsx', 'Content Export with Data/export-published-health-statistics_14062024_data.xlsx', 'Content Export with Data/export-published-live-healthy-articles_14062024_data.xlsx', 'Content Export with Data/export-published-medical-care-and-facilities_14062024_data.xlsx', 'Content Export with Data/export-published-medications_14062024_data.xlsx', 'Content Export with Data/export-published-program-sub-pages_14062024_data.xlsx', 'Content Export with Data/export-published-programs_14062024_data.xlsx', 'Content Export with Data/export-published-support-group-and-others_14062024_data.xlsx']


In [7]:
# Get xlsx files from archive

archive = create_archive("../data/GenAI - Full Content Export.zip")
xlsx_files = get_xlsx_from_archive(archive)

print(xlsx_files)

['Content Export with Data/export-published-cost-and-financing_14062024_data.xlsx', 'Content Export with Data/export-published-diseases-and-conditions_13062024_data.xlsx', 'Content Export with Data/export-published-health-statistics_14062024_data.xlsx', 'Content Export with Data/export-published-live-healthy-articles_14062024_data.xlsx', 'Content Export with Data/export-published-medical-care-and-facilities_14062024_data.xlsx', 'Content Export with Data/export-published-medications_14062024_data.xlsx', 'Content Export with Data/export-published-program-sub-pages_14062024_data.xlsx', 'Content Export with Data/export-published-programs_14062024_data.xlsx', 'Content Export with Data/export-published-support-group-and-others_14062024_data.xlsx']


In [8]:
# Get file names from archive

for xlsx in xlsx_files:
    fname = get_fname(xlsx)
    print(fname)

export-published-cost-and-financing_14062024_data
export-published-diseases-and-conditions_13062024_data
export-published-health-statistics_14062024_data
export-published-live-healthy-articles_14062024_data
export-published-medical-care-and-facilities_14062024_data
export-published-medications_14062024_data
export-published-program-sub-pages_14062024_data
export-published-programs_14062024_data
export-published-support-group-and-others_14062024_data


## Naive Exploratory Data Analysis (Single .xlsx file)

In [9]:
file = archive.open(xlsx_files[0])

### Display .xlsx as Pandas Dataframe

In [10]:
# Load excel file to dataframe

df = pd.read_excel(file)

display(df)

Unnamed: 0,id,identifier,Content.Name,MasterAssetIsContent,Content.ExpirationDate,Content.PublishedOn,Content.Impact,User_Title,Content.PublicationDate,Content.Brief,...,CreatedOn,ModifiedBy,ModifiedOn,Page Views,Engagement Rate,Bounce Rate,Exit Rate,Scroll %,% of Total Views,Cumulative % of Total Views
0,1435040,SP.c7132f4d-43c3-4375-89e7-b51648d45eab,Breast Screening Subsidies in Singapore,,,,,,,,...,2023-09-15T04:58:34.7716668Z,Administrator,2024-05-08T07:02:56.4661990Z,10855,0.800405,0.199595,0.69895,0.418885,0.216244,0.216244
1,1435071,SP.9f2d6820-7fa0-458d-bbe5-a1867805a23a,Marriage and Parenthood Schemes,,,,,,,,...,2023-09-15T04:58:54.0787819Z,Administrator,2024-05-30T12:02:40.3814940Z,5581,0.719767,0.280233,0.859635,0.397554,0.11118,0.327423
2,1434993,SP.3c70a3cb-1385-43f7-9cb5-87db2f0ded2b,MediSave,,,,,,,,...,2023-09-15T04:58:00.2628841Z,Administrator,2023-11-22T10:50:14.0229116Z,3205,0.744887,0.255113,0.703683,0.356708,0.063847,0.391271
3,1435031,SP.574ef1be-bf23-4f67-9601-ca3ffd6eb586,Hospital Bills Financial Assistance in Singapore,,,,,,,,...,2023-09-15T04:58:31.8558644Z,Administrator,2023-11-22T10:50:09.3172146Z,3077,0.777619,0.222381,0.678503,0.427771,0.061297,0.452568
4,1435043,SP.ad7b6bc7-5c90-4df8-89ea-c4ad2d40e580,Community Health Assist Scheme (CHAS) Singapore,,,,,,,,...,2023-09-15T04:58:36.0510149Z,Administrator,2024-01-17T02:36:39.2981943Z,3026,0.775265,0.224735,0.666667,0.413004,0.060281,0.512849
5,1435005,SP.7f9e13b7-0490-452a-ab59-162c143a8e2b,Enhancement for Active Seniors (EASE) by HDB,,,,,,,,...,2023-09-15T04:58:04.4448310Z,Administrator,2023-11-22T10:50:08.3334137Z,2602,0.780768,0.219232,0.814561,0.431783,0.051835,0.564684
6,1434994,SP.420e9f6a-0063-4e0a-99b9-47262e66f944,Intermediate and Long-Term Care Services Subsi...,,,,,,,,...,2023-09-15T04:58:00.3178889Z,Administrator,2023-11-22T10:50:13.2538939Z,2447,0.755546,0.244454,0.780296,0.394565,0.048747,0.613431
7,1435035,SP.293a6123-8b43-4e45-81a1-ddadb2d1c348,Hospital Bills Estimates in Singapore,,,,,,,,...,2023-09-15T04:58:33.3136214Z,Administrator,2023-11-22T10:50:15.5919918Z,2382,0.648947,0.351053,0.681555,0.417506,0.047452,0.660883
8,1435064,SP.693e9147-a750-450d-80f0-c7a898288366,MediFund,,,,,,,,...,2023-09-15T04:58:52.5170658Z,Administrator,2023-11-22T10:50:16.2758193Z,2074,0.768704,0.231296,0.666667,0.399831,0.041316,0.702199
9,1435063,SP.f7db4617-ccc1-4f4d-aa44-e69c5c07a873,​Costs and financing,,,,,,,,...,2023-09-15T04:58:52.4137214Z,Administrator,2024-05-27T07:33:19.8379757Z,2028,0.707899,0.292101,0.70508,0.431829,0.0404,0.742599


In [11]:
# Display information on dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Columns: 533 entries, id to Cumulative % of Total Views
dtypes: float64(485), int64(3), object(45)
memory usage: 112.6+ KB


### Filter Columns where all values are NaN

In [12]:
# Drop columns where all values are NaN (irrelevant columns)

df_filtered = df.dropna(axis="columns", how="all")

display(df_filtered)

Unnamed: 0,id,identifier,Content.Name,Content.StrategyCompletenessStatus,Content.NumberOfCreatedVersions,Content.ApprovedForCreation,PublishStatus,CostAndFinancing_Title,CostAndFinancing_ArticleCatNames,CostAndFinancing_FullUrl,...,CreatedOn,ModifiedBy,ModifiedOn,Page Views,Engagement Rate,Bounce Rate,Exit Rate,Scroll %,% of Total Views,Cumulative % of Total Views
0,1435040,SP.c7132f4d-43c3-4375-89e7-b51648d45eab,Breast Screening Subsidies in Singapore,,1,,Published,Breast Screening Subsidies in Singapore,"Conditions and Illnesses,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:34.7716668Z,Administrator,2024-05-08T07:02:56.4661990Z,10855,0.800405,0.199595,0.69895,0.418885,0.216244,0.216244
1,1435071,SP.9f2d6820-7fa0-458d-bbe5-a1867805a23a,Marriage and Parenthood Schemes,,2,,Published,Marriage and Parenthood Schemes,"Body Care,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:54.0787819Z,Administrator,2024-05-30T12:02:40.3814940Z,5581,0.719767,0.280233,0.859635,0.397554,0.11118,0.327423
2,1434993,SP.3c70a3cb-1385-43f7-9cb5-87db2f0ded2b,MediSave,,1,,Published,MediSave,"Alerts & Advisories,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:00.2628841Z,Administrator,2023-11-22T10:50:14.0229116Z,3205,0.744887,0.255113,0.703683,0.356708,0.063847,0.391271
3,1435031,SP.574ef1be-bf23-4f67-9601-ca3ffd6eb586,Hospital Bills Financial Assistance in Singapore,,1,,Published,Hospital Bills Financial Assistance in Singapore,"Body Care,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:31.8558644Z,Administrator,2023-11-22T10:50:09.3172146Z,3077,0.777619,0.222381,0.678503,0.427771,0.061297,0.452568
4,1435043,SP.ad7b6bc7-5c90-4df8-89ea-c4ad2d40e580,Community Health Assist Scheme (CHAS) Singapore,M.CompletenessStatus.12,6,,Published,Community Health Assist Scheme (CHAS) Singapore,"Alerts and Advisories,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:36.0510149Z,Administrator,2024-01-17T02:36:39.2981943Z,3026,0.775265,0.224735,0.666667,0.413004,0.060281,0.512849
5,1435005,SP.7f9e13b7-0490-452a-ab59-162c143a8e2b,Enhancement for Active Seniors (EASE) by HDB,,1,,Published,Enhancement for Active Seniors (EASE) by HDB,"Senior Health & Caregiving,Alerts & Advisories,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:04.4448310Z,Administrator,2023-11-22T10:50:08.3334137Z,2602,0.780768,0.219232,0.814561,0.431783,0.051835,0.564684
6,1434994,SP.420e9f6a-0063-4e0a-99b9-47262e66f944,Intermediate and Long-Term Care Services Subsi...,,1,,Published,Intermediate and Long-Term Care Services Subsi...,"Alerts and Advisories,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:00.3178889Z,Administrator,2023-11-22T10:50:13.2538939Z,2447,0.755546,0.244454,0.780296,0.394565,0.048747,0.613431
7,1435035,SP.293a6123-8b43-4e45-81a1-ddadb2d1c348,Hospital Bills Estimates in Singapore,,1,,Published,Hospital Bills Estimates in Singapore,"Conditions and Illnesses,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:33.3136214Z,Administrator,2023-11-22T10:50:15.5919918Z,2382,0.648947,0.351053,0.681555,0.417506,0.047452,0.660883
8,1435064,SP.693e9147-a750-450d-80f0-c7a898288366,MediFund,,1,,Published,MediFund,"Alerts & Advisories,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:52.5170658Z,Administrator,2023-11-22T10:50:16.2758193Z,2074,0.768704,0.231296,0.666667,0.399831,0.041316,0.702199
9,1435063,SP.f7db4617-ccc1-4f4d-aa44-e69c5c07a873,​Costs and financing,,2,,Published,​Costs and financing,"Chronic Illnesses,",https://www.healthhub.sg/a-z/costs-and-financi...,...,2023-09-15T04:58:52.4137214Z,Administrator,2024-05-27T07:33:19.8379757Z,2028,0.707899,0.292101,0.70508,0.431829,0.0404,0.742599


In [13]:
# df_filtered.to_parquet('./export-published-cost-and-financing_14062024_data.parquet')

In [14]:
# Get columns present

print(df_filtered.columns)

Index(['id', 'identifier', 'Content.Name',
       'Content.StrategyCompletenessStatus', 'Content.NumberOfCreatedVersions',
       'Content.ApprovedForCreation', 'PublishStatus',
       'CostAndFinancing_Title', 'CostAndFinancing_ArticleCatNames',
       'CostAndFinancing_FullUrl', 'CostAndFinancing_FullUrl2',
       'CostAndFinancing_FriendlyUrl', 'CostAndFinancing_ContentBody',
       'CostAndFinancing_CategoryDesc', 'CostAndFinancing_TaxKeywordTax',
       'CostAndFinancing_ENKeywords', 'CostAndFinancing_AlternateImageText',
       'CostAndFinancing_SortOrder', 'CostAndFinancing_NumberofViews',
       'CostAndFinancing_LastMonthViewCount',
       'CostAndFinancing_LastTwoMonthsView', 'CostAndFinancing_CoverImgUrl',
       'CostAndFinancing_FeatureTitle', 'CostAndFinancing_PRDescription',
       'CostAndFinancing_PRContactAddress', 'CostAndFinancing_PRLogo',
       'CostAndFinancing_PRName', 'CostAndFinancing_DocId',
       'CostAndFinancing_Featured', 'CostAndFinancing_DateModified',

### Find ContentBody Column in DataFrame

In [15]:
col = df_filtered.columns[df_filtered.columns.str.contains("ContentBody")][0]
print(col)

CostAndFinancing_ContentBody


In [16]:
raw_html = df_filtered[col]

display(raw_html.head(15))

0     <div class="ExternalClass07C58E0D957B4AA7B14FC...
1     <div class="ExternalClassE1D82270F17241E495537...
2     <div class="ExternalClass67AD25F1F8B64B349E515...
3     <div class="ExternalClassE335708125E743FDAA331...
4     <h2>What is the Community Health Assist Scheme...
5     <div class="ExternalClass26C3FCBE3D3D46728E80B...
6     <div class="ExternalClassCD5BE21C38D64C5BB4909...
7     <div class="ExternalClass3990B681758C432A8A905...
8     <div class="ExternalClass43A6934BC03041089AFFF...
9     <div class="ExternalClass7FFBD695D7A34E158E966...
10    <div class="ExternalClass99E003D691FF43D79DFF8...
11    <div class="ExternalClass6EF9A07BE0E347CDBB0B5...
12    <p>Seeking medical treatment and care is a wor...
13    <div class="ExternalClassB930DEC6C2154BB78B41C...
14    <div class="ExternalClassB2A0A390BD3F4501B4680...
Name: CostAndFinancing_ContentBody, dtype: object

## Text Extraction

In [17]:
# Extract HTML sample

sample = raw_html[0]

print(sample)

<div class="ExternalClass07C58E0D957B4AA7B14FCBA048386D06">
<p>Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking&nbsp;active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by&nbsp;going for regular health screenings.</p>
<h2>Breast Cancer Screening</h2>
<div>Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.</div>
<div>Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitatio

In [18]:
print(sample is np.nan)

False


### Method 1: Scraping ContentBody using BeautifulSoup

In [19]:
soup = BeautifulSoup(sample, "lxml")

# Some ContentBody values are wrapped with a div class HTML element
if soup.div is not None:
    soup.div.unwrap()

In [20]:
clean_text = soup.get_text()
print(clean_text)


Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.
Breast Cancer Screening
Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.
Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening.

Related:
Cancer 

In [21]:
clean_text = (
    normalize("NFKC", clean_text).replace("\n\n\n", "\n").replace("\n\n\n", "\n\n")
)

# clean_text = clean_text.split("\n")
texts = clean_text.split("\n")

for i in range(len(texts)):
    texts[i] = texts[i].strip()

print(texts)

['', 'Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.', 'Breast Cancer Screening', 'Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.', 'Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening.', '

In [22]:
clean_text = "\n".join(texts).strip()
print(clean_text)

Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.
Breast Cancer Screening
Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.
Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening.

Related:
Cancer F

### Method 2: Scraping ContentBody using Regular Expressions

In [23]:
CLEANR = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
# CLEANR = re.compile('<p.*?>|</p>|</div>|<div.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')


def clean_html(raw_html):
    cleantext = re.sub(CLEANR, " ", raw_html)
    return cleantext


clean_text = clean_html(sample)
print(clean_text)

 
 Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings. 
 Breast Cancer Screening 
 Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications. 
 Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening. 
 
 Relat

In [24]:
clean_text = (
    normalize("NFKC", clean_text).replace("\n\n\n", "\n").replace("\n\n\n", "\n\n")
)

# clean_text = clean_text.split("\n")
texts = clean_text.split("\n")

for i in range(len(texts)):
    texts[i] = texts[i].strip()

print(texts)

['', 'Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.', 'Breast Cancer Screening', 'Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.', 'Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening.', '

In [25]:
clean_text = "\n".join(texts).strip().replace("\n\n\n", "\n").replace("\n\n\n", "\n\n")
print(clean_text)

Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.
Breast Cancer Screening
Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.
Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening.

Related:
Cancer F

### Method 3: Scraping ContentBody using Custom Functions

In [26]:
soup = BeautifulSoup(sample, "lxml")

print(soup)

<html><body><div class="ExternalClass07C58E0D957B4AA7B14FCBA048386D06">
<p>Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.</p>
<h2>Breast Cancer Screening</h2>
<div>Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.</div>
<div>Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitat

#### Version 1 - Strip text + Introduce formatting

In [27]:
# Works very well but prone to duplicated text for nested tags. Look into resolving non-ascii representation of characters
# TODO: Implement unicode normalisation when parsing text from HTML fragments


def parse_text(soup):
    def process_links(tag):
        links = []

        for a in tag.find_all("a"):
            title = a.get("title") or a.text.strip()
            url = a.get("href")
            links.append(f"{title}: {url}")

        return links

    def process_table(table):
        if (
            table.find_all("tr") == []
        ):  # Empty table in All You Need to Know About Childhood Immunisations
            return ""

        headers = [header.get_text(strip=True) for header in table.find_all("tr")[0]]
        headers = list(filter(lambda k: " " in k, headers))
        rows = []

        for row in table.find_all("tr")[1:]:
            cols = row.find_all("td")
            cols = [ele.get_text(strip=True).replace("\xa0", " ") for ele in cols]
            rows.append(cols)

        table_text = []

        if headers:
            table_text.append(" | ".join(headers))

        for row in rows:
            table_text.append(" | ".join(row))

        return "\n".join(table_text)

    if soup.div is not None:
        soup.div.unwrap()

    # TODO: Implement unicode normalisation when parsing text from HTML fragments
    organized_text = []

    for elem in soup.find_all(
        ["p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "table", "ol"]
    ):
        if elem.name == "p":
            organized_text.append(elem.text.strip())

        elif elem.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            organized_text.append(f"\n\n{elem.text.strip()}\n")

        elif elem.name == "div":
            organized_text.append(elem.text.strip())

        elif elem.name == "ul":
            for li in elem.find_all("li"):
                organized_text.append(f"  - {li.text.strip()}")

        elif elem.name == "ol":
            for i, li in enumerate(elem.find_all("li"), 1):
                organized_text.append(f"  {i}. {li.text.strip()}")

        elif elem.name == "table":
            organized_text.append(process_table(elem))

        links = process_links(elem)
        if links:
            organized_text.append("\n")
            organized_text.extend(links)
            organized_text.append("\n")

    # print(organized_text)
    return (
        "\n".join(organized_text)
        .replace("\n\n", "\n")
        .replace("\n         ", "")
        .replace("\n      ", "")
        .replace("\u200b", "")
        .strip()
    )


print(parse_text(soup))

Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.

Breast Cancer Screening
Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.
Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening.
Related:
Cancer F

#### Version 2 - Introduce Unicode Normalisation + Encode to ASCII + Decode back to UTF-8

In [28]:
# Works very well but prone to duplicated text for nested tags. Look into resolving non-ascii representation of characters
# TODO: Implement unicode normalisation when parsing text from HTML fragments


def clean_text(text):
    clean_text = (
        normalize("NFKC", text).encode("ascii", "ignore").decode("utf8").strip()
    )

    return clean_text


def process_links(tag):
    links = []

    for a in tag.find_all("a"):
        title = a.get("title") or a.text.strip()
        url = a.get("href")
        links.append(f"{title}: {url}")

    return links


def process_table(table):
    if (
        table.find_all("tr") == []
    ):  # Empty table in All You Need to Know About Childhood Immunisations
        return ""

    headers = [clean_text(header.get_text()) for header in table.find_all("tr")[0]]
    headers = list(filter(lambda k: " " in k, headers))
    rows = []

    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        cols = [clean_text(ele.get_text()) for ele in cols]
        rows.append(cols)

    table_text = []

    if headers:
        table_text.append(" | ".join(headers))

    for row in rows:
        table_text.append(" | ".join(row))

    return "\n".join(table_text)


def parse_text(soup):
    if soup.div is not None:
        soup.div.unwrap()

    # TODO: Implement unicode normalisation when parsing text from HTML fragments
    organized_text = []

    for elem in soup.find_all(
        ["p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "table", "ol"]
    ):
        text = clean_text(elem.text)
        if elem.name in ["p", "div"]:
            organized_text.append(text)

        elif elem.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            organized_text.append(f"\n\n{text}\n")

        elif elem.name == "ul":
            for li in elem.find_all("li"):
                li_text = clean_text(li.text)
                organized_text.append(f"  - {li_text}")

        elif elem.name == "ol":
            for i, li in enumerate(elem.find_all("li"), 1):
                li_text = clean_text(li.text)
                organized_text.append(f"  {i}. {li_text}")

        elif elem.name == "table":
            organized_text.append(process_table(elem))

        links = process_links(elem)
        if links:
            organized_text.append("\n\n")
            organized_text.extend(links)
            organized_text.append("\n\n")

    # print(organized_text)
    # return "\n".join(organized_text).strip()
    return (
        "\n".join(organized_text)
        .replace("\n\n\n", "\n")
        .replace("\n            ", "")
        .replace("\n      ", "")
        .strip()
    )


print(parse_text(soup))

Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.
Breast Cancer Screening

Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for screening.
Related:
Cancer Facts You Cannot Ignore

Cancer Facts You Cannot Ignore: https://www.healthhub.sg/live-healthy/cancer-facts-you-cannot-ignore

Subsidy for Mammograms

Under the Health Promotion Boards (HPB)
Screen for Life (SFL)[1], women aged 50 years and above can benefit from subsidised mammogram screenings which cost $50 for Singapore citizens, $75 for Permanent Residents (PRs), $37.50 for M

In [29]:
# Extract tables - Experimental


# Notes: Need to check if table exists
def extract_table(sample):
    html_file = StringIO(sample)

    tables = pd.read_html(html_file, header=0)
    stores = []

    for i in range(len(tables)):
        json_str = tables[0].to_json(index=False, orient="records")
        store = json.loads(json_str)

        for i in range(len(store)):
            ele = store[i]
            for key, value in ele.items():
                ele[key] = normalize("NFKC", str(value).replace("\u200b", ""))
            store[i] = ele

        stores.append(store)

    if not stores:
        return None

    return stores


# print(extract_table(sample))

In [30]:
# minified = htmlmin.minify(sample, remove_empty_space=True)
# print(minified)

In [31]:
# Parse HTML via BeautifulSoup

soup = BeautifulSoup(sample, "lxml")

In [32]:
def extract_tags(soup):
    tags = set()

    for tag in soup.find_all(True):
        tags.add(tag.name)

    return list(tags)


print(extract_tags(soup))

['ul', 'li', 'tr', 'em', 'br', 'table', 'td', 'tbody', 'html', 'sup', 'a', 'strong', 'p', 'div', 'h2', 'hr', 'ol', 'h3', 'body', 'span']


In [33]:
# Display all headers from article}


def extract_headers(soup):
    titles = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])

    headers = []
    for title in titles:
        tag = title.name
        text = title.get_text()
        records = text, tag
        headers.append(records)

    # print('List all the header tags :', *titles, sep='\n\n', end="\n\n")
    return headers


print(extract_headers(soup))

[('Breast Cancer Screening', 'h2'), ('Subsidy for Mammograms', 'h3'), ('Tips on Breast Cancer Screening Subsidies', 'h3'), ('Where to Go for Screenings', 'h2'), ('Singapore Cancer Society Clinic @ Bishan', 'h3'), ('Polyclinics', 'h3'), ('Useful Links', 'h2'), ('Contact Information', 'h2'), ('\nScreen for Life', 'h3')]


In [34]:
def extract_urls_list(soup):
    url_records = []

    for link in soup.find_all("a"):
        url = link.get("href")
        title = link.get("title")
        text = link.get_text()

        if title is not None:
            records = title, url
        else:
            records = text, url

        url_records.append(records)

    if not url_records:
        return None

    return url_records


print(extract_urls_list(soup))

[('Cancer Facts You Cannot Ignore', 'https://www.healthhub.sg/live-healthy/cancer-facts-you-cannot-ignore'), ('Screen for Life', 'https://www.healthhub.sg/programmes/Screen_for_Life'), ('[1]', '#footnoteone'), ('Screen for Life.', 'https://www.healthhub.sg/programmes/Screen_for_Life'), ('free mammography services', 'https://www.singaporecancersociety.org.sg/get-screened/breast-cancer/mammogram.html'), ('Breast Cancer Screening Now Free for All Singaporeans', 'https://www.healthhub.sg/live-healthy/breast-cancer-screening-now-free-for-eligible-singaporeans'), ('online form', 'https://for.sg/nhgd-mmgappt'), ('HealthHub', 'https://eservices.healthhub.sg/Appointments'), ('online form', 'https://form.gov.sg/6201c1c93bc12500135730b3'), ('online form', 'https://form.gov.sg/5f50934b1e0c6e0011fad28e'), ('Understanding Breast Cancer', 'http://www.healthhub.sg/a-z/diseases-and-conditions/breastcancer'), ('Breast Cancer Foundation', 'http://www.bcf.org.sg/'), ('Singapore Cancer Society', 'http://ww

In [35]:
def extract_urls_dict(soup):
    urls_dict = dict()
    for link in soup.find_all("a"):
        url = link.get("href")
        title = link.get("title")
        text = link.get_text()

        if url not in urls_dict:
            if title is not None:
                urls_dict[url] = title
            else:
                urls_dict[url] = text

    res = dict((v, k) for k, v in urls_dict.items())

    if not res:
        return None

    return res


print(extract_urls_dict(soup))

{'Cancer Facts You Cannot Ignore': 'https://www.healthhub.sg/live-healthy/cancer-facts-you-cannot-ignore', 'Screen for Life': 'https://www.healthhub.sg/programmes/Screen_for_Life', '[1]': '#footnoteone', 'free mammography services': 'https://www.singaporecancersociety.org.sg/get-screened/breast-cancer/mammogram.html', 'Breast Cancer Screening Now Free for All Singaporeans': 'https://www.healthhub.sg/live-healthy/breast-cancer-screening-now-free-for-eligible-singaporeans', 'online form': 'https://form.gov.sg/5f50934b1e0c6e0011fad28e', 'HealthHub': 'https://eservices.healthhub.sg/Appointments', 'Understanding Breast Cancer': 'http://www.healthhub.sg/a-z/diseases-and-conditions/breastcancer', 'Breast Cancer Foundation': 'http://www.bcf.org.sg/', 'Singapore Cancer Society': 'http://www.singaporecancersociety.org.sg/', 'HPB_Mailbox@hpb.gov.sg': 'mailto:ScreenForLife@hpb.gov.sg', '5 Ways To Psych Yourself For A Mammogram': 'https://www.healthhub.sg/live-healthy/5-ways-to-psych-yourself-for-a

In [36]:
def extract_info(sample):
    if sample is np.nan:
        return

    soup = BeautifulSoup(sample, "lxml")

    tags = extract_tags(soup)
    headers = extract_headers(soup)
    urls = extract_urls_list(soup)

    tables = None
    if soup.find_all("table") != []:
        try:
            tables = extract_table(sample)
            # tables_str = json.dumps(tables)
        except Exception as e:
            print(e)

    text = parse_text(soup)
    return tags, headers, urls, tables, text


print(extract_info(sample))

(['ul', 'li', 'tr', 'em', 'br', 'table', 'td', 'tbody', 'html', 'sup', 'a', 'strong', 'p', 'div', 'h2', 'hr', 'ol', 'h3', 'body', 'span'], [('Breast Cancer Screening', 'h2'), ('Subsidy for Mammograms', 'h3'), ('Tips on Breast Cancer Screening Subsidies', 'h3'), ('Where to Go for Screenings', 'h2'), ('Singapore Cancer Society Clinic @ Bishan', 'h3'), ('Polyclinics', 'h3'), ('Useful Links', 'h2'), ('Contact Information', 'h2'), ('\nScreen for Life', 'h3')], [('Cancer Facts You Cannot Ignore', 'https://www.healthhub.sg/live-healthy/cancer-facts-you-cannot-ignore'), ('Screen for Life', 'https://www.healthhub.sg/programmes/Screen_for_Life'), ('[1]', '#footnoteone'), ('Screen for Life.', 'https://www.healthhub.sg/programmes/Screen_for_Life'), ('free mammography services', 'https://www.singaporecancersociety.org.sg/get-screened/breast-cancer/mammogram.html'), ('Breast Cancer Screening Now Free for All Singaporeans', 'https://www.healthhub.sg/live-healthy/breast-cancer-screening-now-free-for-e

In [37]:
def write_to_txt(dir_path, fname, text):
    fpath = f"{dir_path}/{fname}.txt"

    try:
        os.makedirs(dir_path, exist_ok=True)
        with open(fpath, "w") as f:
            f.write(text)
    except OSError as error:
        print(error)

In [38]:
def process_xlsx(archive, xlsx_file, dir_path):
    dname = xlsx_file.split(".")[0].split("/")[-1]
    output_dir = f"{dir_path}/{dname}"
    print("\n\n", output_dir)

    file = archive.open(xlsx_file)
    df = pd.read_excel(file)
    df_processed = df.dropna(axis="columns", how="all")

    # "Content_Body" instead of "ContentBody" in export-published-live-healthy-articles_14062024_data.xlsx
    # "Content_x0020_Body" instead of "ContentBody" in export-published-programs_14062024_data.xlsx
    col = df_processed.columns[
        df_processed.columns.str.contains("ContentBody")
        | df_processed.columns.str.contains("Content_Body")
        | df_processed.columns.str.contains("Content_x0020_Body")
    ][0]
    article_names = df_processed["Content.Name"]
    raw_htmls = df_processed[col]

    for i in range(n := len(raw_htmls)):
        sample = raw_htmls[i]
        # Some articles uses slashes for medications
        fname = article_names[i].replace("/", "-")
        print(fname)

        if sample is not np.nan:
            tags, headers, urls, tables, text = extract_info(sample)
            write_to_txt(output_dir, fname, text)

In [39]:
# Function to extract xlsx files from archive


def create_archive(zip_path):
    archive = zipfile.ZipFile(zip_path, "r")
    return archive


def get_xlsx_from_archive(archive):
    files_and_dirs = archive.namelist()
    xlsx_files = list(filter(lambda k: k.split(".")[-1] == "xlsx", files_and_dirs))
    return xlsx_files


print(get_xlsx_from_archive(create_archive("../data/GenAI - Full Content Export.zip")))

['Content Export with Data/export-published-cost-and-financing_14062024_data.xlsx', 'Content Export with Data/export-published-diseases-and-conditions_13062024_data.xlsx', 'Content Export with Data/export-published-health-statistics_14062024_data.xlsx', 'Content Export with Data/export-published-live-healthy-articles_14062024_data.xlsx', 'Content Export with Data/export-published-medical-care-and-facilities_14062024_data.xlsx', 'Content Export with Data/export-published-medications_14062024_data.xlsx', 'Content Export with Data/export-published-program-sub-pages_14062024_data.xlsx', 'Content Export with Data/export-published-programs_14062024_data.xlsx', 'Content Export with Data/export-published-support-group-and-others_14062024_data.xlsx']


In [40]:
def main():
    archive_path = "../data/GenAI - Full Content Export.zip"
    output_dir_path = "../data/processed"
    archive = create_archive(archive_path)
    xlsx_files = get_xlsx_from_archive(archive)

    for i in range(len(xlsx_files)):
        process_xlsx(archive, xlsx_files[i], output_dir_path)

In [41]:
main()

# # Errors:
# invalid literal for int() with base 10: 'h2'
# [Errno 63] File name too long: '../data/processed/export-published-live-healthy-articles_14062024_data/                                                                                                                                                                                                         Books for your growing child (Toddler and Preschooler).txt'
# No tables found matching pattern '.+'
# /var/folders/3n/y5_h0fxs0bv2mhb7bf_fpmb80000gn/T/ipykernel_53004/2874476162.py:5: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
#   soup = BeautifulSoup(sample, 'lxml')



 ../data/processed/export-published-cost-and-financing_14062024_data
Breast Screening Subsidies in Singapore
Marriage and Parenthood Schemes
MediSave
Hospital Bills Financial Assistance in Singapore
Community Health Assist Scheme (CHAS) Singapore
Enhancement for Active Seniors (EASE) by HDB 
Intermediate and Long-Term Care Services Subsidies 
Hospital Bills Estimates in Singapore
MediFund
​Costs and financing
ElderShield
Seniors’ Mobility and Enabling Fund
Bill Presentment for Public Healthcare Institutions
MediShield Life
Foreign Domestic Worker Levy Concession for Persons with Disabilities
MediSave Claims for Polyclinics
Outpatient Costs for Chronic Disease (How to Reduce Cost)
Integrated Shield Plans (IPs)
Home Caregiving Grant (HCG)
Pioneer Generation Disability Assistance Scheme (PioneerDAS)
Caregivers Training Grant (CTG)
Medical Fee Exemption Card
Haze Subsidy Scheme
Interim Disability Assistance
1Cost and Financing
1test-2
1testcnf2


 ../data/processed/export-published-disea

  soup = BeautifulSoup(sample, 'lxml')


Vaccination Clinic
Subsidised Dental Treatments
​Women Health Services
Blood Glucose Meter to Monitor Blood Sugar Levels
Travel Vaccinations and Health Advice
Request for Medical Report
Admissions Buddy: A Health Portal for Patients
Advance Care Planning
Nursing Home Respite Care
Physiotherapy Treatment
Nutrition and Dietetics: Dietitian Services
Senior Activity Centres in Singapore
Podiatry 
Weight Management Programmes
Mobile Bone Mineral Densitometry 
Paediatric Dentistry: Child’s First Encounter
Baby Friendly Hospital Initiative
Diagnostics Services: Know Your Health Inside Out
Allergy Services: KKH
Stroke: Post-stroke Care
Home Health Care for Seniors 
Fall Risk Assessment Programme for Fall Prevention
Inpatient Hospice Facility Care
Respite Care for Caregivers Provided by Senior Care Centres
Diabetic Retinal Photography
A New Approach To Mental Health Treatments 
Community-Acquired Pneumonia (CAP) Treatment: The Airway Programme
Medication, Dispensing and Counselling (Pharmacy Se

### Method 4: Convert HTML to Markdown

In [42]:
# !pip install markdownify

In [43]:
from markdownify import markdownify

In [44]:
# Convert HTML to Markdown
markdown_text = markdownify(sample)

# Display converted text
print(markdown_text.replace("\n\n\n", "\n\n").strip())

Breast cancer is the number one cancer among women in Singapore. You can protect yourself by taking active preventive measures to protect yourself against them. Besides being well informed about these conditions, you can protect yourself by going for regular health screenings.

Breast Cancer Screening
-----------------------

Beyond the recommended monthly breast self-examinations, the best way to protect yourself from breast cancer is to go for regular mammograms. A mammogram can detect tiny lumps that cannot be felt by the hand. Early detection, followed by treatment and good control of the condition can result in better chances of surviving this cancer, lowering the risk of serious complications.
Women who are 50 years old and above are recommended to go for a mammogram once every two years. Women 40 to 49, can screen for breast cancer, annually, provided their doctor has discussed the benefits and limitations with them. It is important to make an informed choice about going for scr

# END

***