# Monitoring the quality of the extracted text


## Import relevant libraries


In [1]:
import numpy as np
import plotly.express as px
from bs4 import BeautifulSoup

## Functions to check for nested divs


In [2]:
def check_nested_divs_recursive(element):
    is_nested_div = False

    # Check for nested_divs by comparing child and parent name via recursion
    if element.name == "div" and element.parent.name == "div":
        is_nested_div = True

    # Find all elements
    direct_children = element.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "div", "span"],
        recursive=False,
    )
    for child in direct_children:
        check_nested_div = check_nested_divs_recursive(child)
        is_nested_div = is_nested_div or check_nested_div

    return is_nested_div

In [3]:
def check_for_nested_divs(html_content):
    if html_content is None:
        return False

    soup = BeautifulSoup(html_content, "html.parser")

    has_nested_div = False
    for tag in soup.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "div", "span"],
        recursive=False,
    ):
        check_nested_div = check_nested_divs_recursive(tag)
        has_nested_div = has_nested_div or check_nested_div

    # print(f"\nHas Nested div: {has_nested_div}")

    return has_nested_div

## Load Merged Data from Kedro Catalog


In [4]:
%load_ext kedro.ipython

In [5]:
# ruff: noqa: F821
catalog.list()


[1m[[0m
    [32m'all_contents'[0m,
    [32m'all_contents_standardized'[0m,
    [32m'all_contents_extracted'[0m,
    [32m'all_extracted_text'[0m,
    [32m'merged_data'[0m,
    [32m'raw_word_counts'[0m,
    [32m'log_word_counts'[0m,
    [32m'flag_for_removal_by_type'[0m,
    [32m'filtered_data'[0m,
    [32m'doc_embeddings'[0m,
    [32m'word_embeddings'[0m,
    [32m'filtered_data_with_keywords'[0m,
    [32m'parameters'[0m,
    [32m'params:columns_to_keep'[0m,
    [32m'params:columns_to_keep.cost-and-financing'[0m,
    [32m'params:columns_to_keep.diseases-and-conditions'[0m,
    [32m'params:columns_to_keep.health-statistics'[0m,
    [32m'params:columns_to_keep.live-healthy-articles'[0m,
    [32m'params:columns_to_keep.medical-care-and-facilities'[0m,
    [32m'params:columns_to_keep.medications'[0m,
    [32m'params:columns_to_keep.program-sub-pages'[0m,
    [32m'params:columns_to_keep.programs'[0m,
    [32m'params:columns_to_keep.support-group

In [6]:
# ruff: noqa: F821
merged_data = catalog.load("merged_data")

display(merged_data)

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,cumulative_percentage_total_views,content_category,to_remove,type,has_table,has_image,related_sections,extracted_links,extracted_headers,extracted_content_body
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/breas...,breast-cancer-screening-subsidies,Here’s all you need to know about breast cance...,"<div class=""ExternalClass07C58E0D957B4AA7B14FC...",...,0.216244,cost-and-financing,False,,True,False,[Cancer Facts You Cannot Ignore],"[[Cancer Facts You Cannot Ignore, https://www....","[[Breast Cancer Screening, h2], [Subsidy for M...",Breast cancer is the number one cancer among w...
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/marri...,marriage_parenthood_scheme,New parents and couples looking to conceive ca...,"<div class=""ExternalClassE1D82270F17241E495537...",...,0.327423,cost-and-financing,False,,True,False,"[MediSave, Baby Bonus What You Need to Know, I...","[[How to Submit Claims, https://crms.moh.gov.s...","[[MediSave Maternity Package, h2], [Examples o...",MediSave Maternity Package\nWith the MediSave ...
2,1434993,MediSave,MediSave,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/medisave,medisave,MediSave is the national medical savings schem...,"<div class=""ExternalClass67AD25F1F8B64B349E515...",...,0.391271,cost-and-financing,False,,False,False,[A Way to Reduce Outpatient Cost for Chronic D...,[[A Way to Reduce Outpatient Cost for Chronics...,"[[, h2], [What is MediSave?, h2], [Contributin...","What is MediSave?\nMediSave, introduced in Apr..."
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/finan...,financial-assistance-for-local-patients-in-sin...,Having trouble paying your medical bill? Here’...,"<div class=""ExternalClassE335708125E743FDAA331...",...,0.452568,cost-and-financing,False,,False,False,[],"[[Medifund, https://www.healthhub.sg/a-z/costs...","[[Medifund: Hospital Bill Payment Assistance, ...",Patients or family members who have difficulty...
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,"Alerts and Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/chas,chas,"With a CHAS card, all Singapore citizens can r...",<h2>What is the Community Health Assist Scheme...,...,0.512849,cost-and-financing,False,,True,True,[All Pioneer Generation (PG) and Merdeka Gener...,"[[www.chas.sg/apply, https://www.chas.sg/apply...",[[What is the Community Health Assist Scheme (...,What is the Community Health Assist Scheme (CH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,transitional-care-programme-for-heart-failure,The team from NUHCS gives support to heart fai...,"<div class=""ExternalClassFC126593610D4F0587A4B...",...,0.880618,support-group-and-others,False,,False,True,[],"[[charles_wu@nuhs.edu.sg, mailto: charles_wu@n...",[[Transitional Care for Heart Failure Patients...,Heart failure is the leading cause of rehospit...
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,2015-NNI-support-group,Read on for a list of brain injury support gro...,"<div class=""ExternalClass7C92735B78174928B287D...",...,0.914412,support-group-and-others,False,,False,False,[],"[[here, https://www.nni.com.sg/patient-care/br...","[[, h2], [Brain Tumour Society (Singapore), h2...",Brain Tumour Society (Singapore)\nThe Brain Tu...
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,ambulatory-nutrition-support,Read about the ambulatory support benefits one...,"<div class=""ExternalClass3FABAC9D59A64BCAB96D3...",...,0.944373,support-group-and-others,False,,False,False,[],"[[Nutrition and Dietetics, https://www.healthh...",[[The Importance of Ambulatory Nutrition Suppo...,The Importance of Ambulatory Nutrition Support...
2611,1440766,LapBandits Support Group (Singapore),LapBandits Support Group (Singapore),"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,singapore-lapbandits-support-group,Have you just undergone bariatric surgery for ...,"<div class=""ExternalClassA4C749C7DB7647FBB6DEB...",...,0.973058,support-group-and-others,True,Below Word Count,False,False,[],[],[[About Khoo Teck Puat Hospitals LapBandits Su...,About Khoo Teck Puat Hospitals LapBandits Supp...


### Keep only rows where `to_remove` is False


In [7]:
df_keep = merged_data[~merged_data["to_remove"]]

display(df_keep)

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,cumulative_percentage_total_views,content_category,to_remove,type,has_table,has_image,related_sections,extracted_links,extracted_headers,extracted_content_body
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/breas...,breast-cancer-screening-subsidies,Here’s all you need to know about breast cance...,"<div class=""ExternalClass07C58E0D957B4AA7B14FC...",...,0.216244,cost-and-financing,False,,True,False,[Cancer Facts You Cannot Ignore],"[[Cancer Facts You Cannot Ignore, https://www....","[[Breast Cancer Screening, h2], [Subsidy for M...",Breast cancer is the number one cancer among w...
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/marri...,marriage_parenthood_scheme,New parents and couples looking to conceive ca...,"<div class=""ExternalClassE1D82270F17241E495537...",...,0.327423,cost-and-financing,False,,True,False,"[MediSave, Baby Bonus What You Need to Know, I...","[[How to Submit Claims, https://crms.moh.gov.s...","[[MediSave Maternity Package, h2], [Examples o...",MediSave Maternity Package\nWith the MediSave ...
2,1434993,MediSave,MediSave,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/medisave,medisave,MediSave is the national medical savings schem...,"<div class=""ExternalClass67AD25F1F8B64B349E515...",...,0.391271,cost-and-financing,False,,False,False,[A Way to Reduce Outpatient Cost for Chronic D...,[[A Way to Reduce Outpatient Cost for Chronics...,"[[, h2], [What is MediSave?, h2], [Contributin...","What is MediSave?\nMediSave, introduced in Apr..."
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/finan...,financial-assistance-for-local-patients-in-sin...,Having trouble paying your medical bill? Here’...,"<div class=""ExternalClassE335708125E743FDAA331...",...,0.452568,cost-and-financing,False,,False,False,[],"[[Medifund, https://www.healthhub.sg/a-z/costs...","[[Medifund: Hospital Bill Payment Assistance, ...",Patients or family members who have difficulty...
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,"Alerts and Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/chas,chas,"With a CHAS card, all Singapore citizens can r...",<h2>What is the Community Health Assist Scheme...,...,0.512849,cost-and-financing,False,,True,True,[All Pioneer Generation (PG) and Merdeka Gener...,"[[www.chas.sg/apply, https://www.chas.sg/apply...",[[What is the Community Health Assist Scheme (...,What is the Community Health Assist Scheme (CH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,Breast Cancer Support Groups,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,Breast-cancer-support-group-singapore,Learn how breast cancer support groups help br...,"<div class=""ExternalClassE9019B25CD2A40F3948CD...",...,0.845546,support-group-and-others,False,,False,False,[],"[[here, https://www.ncis.com.sg/events/Pages/E...","[[Breast Cancer Support at NCIS, h2], [Our Mis...","NCIS Breast Support Group provides knowledge, ..."
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,transitional-care-programme-for-heart-failure,The team from NUHCS gives support to heart fai...,"<div class=""ExternalClassFC126593610D4F0587A4B...",...,0.880618,support-group-and-others,False,,False,True,[],"[[charles_wu@nuhs.edu.sg, mailto: charles_wu@n...",[[Transitional Care for Heart Failure Patients...,Heart failure is the leading cause of rehospit...
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,2015-NNI-support-group,Read on for a list of brain injury support gro...,"<div class=""ExternalClass7C92735B78174928B287D...",...,0.914412,support-group-and-others,False,,False,False,[],"[[here, https://www.nni.com.sg/patient-care/br...","[[, h2], [Brain Tumour Society (Singapore), h2...",Brain Tumour Society (Singapore)\nThe Brain Tu...
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,ambulatory-nutrition-support,Read about the ambulatory support benefits one...,"<div class=""ExternalClass3FABAC9D59A64BCAB96D3...",...,0.944373,support-group-and-others,False,,False,False,[],"[[Nutrition and Dietetics, https://www.healthh...",[[The Importance of Ambulatory Nutrition Suppo...,The Importance of Ambulatory Nutrition Support...


## Keep Relevant Columns


In [8]:
df_extracted = df_keep[
    [
        "id",
        "content_name",
        "title",
        "content_category",
        "content_body",
        "extracted_content_body",
    ]
]  # "title", "full_url",

display(df_extracted)

Unnamed: 0,id,content_name,title,content_category,content_body,extracted_content_body
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,cost-and-financing,"<div class=""ExternalClass07C58E0D957B4AA7B14FC...",Breast cancer is the number one cancer among w...
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,cost-and-financing,"<div class=""ExternalClassE1D82270F17241E495537...",MediSave Maternity Package\nWith the MediSave ...
2,1434993,MediSave,MediSave,cost-and-financing,"<div class=""ExternalClass67AD25F1F8B64B349E515...","What is MediSave?\nMediSave, introduced in Apr..."
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,cost-and-financing,"<div class=""ExternalClassE335708125E743FDAA331...",Patients or family members who have difficulty...
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,cost-and-financing,<h2>What is the Community Health Assist Scheme...,What is the Community Health Assist Scheme (CH...
...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,Breast Cancer Support Groups,support-group-and-others,"<div class=""ExternalClassE9019B25CD2A40F3948CD...","NCIS Breast Support Group provides knowledge, ..."
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,support-group-and-others,"<div class=""ExternalClassFC126593610D4F0587A4B...",Heart failure is the leading cause of rehospit...
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,support-group-and-others,"<div class=""ExternalClass7C92735B78174928B287D...",Brain Tumour Society (Singapore)\nThe Brain Tu...
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,support-group-and-others,"<div class=""ExternalClass3FABAC9D59A64BCAB96D3...",The Importance of Ambulatory Nutrition Support...


## Check for Nested Divs in Content Body


In [9]:
df_extracted["has_nested_div"] = df_extracted.apply(
    lambda row: check_for_nested_divs(row["content_body"]), axis=1
)

display(df_extracted[df_extracted["has_nested_div"]])

Unnamed: 0,id,content_name,title,content_category,content_body,extracted_content_body,has_nested_div
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,cost-and-financing,"<div class=""ExternalClass07C58E0D957B4AA7B14FC...",Breast cancer is the number one cancer among w...,True
2,1434993,MediSave,MediSave,cost-and-financing,"<div class=""ExternalClass67AD25F1F8B64B349E515...","What is MediSave?\nMediSave, introduced in Apr...",True
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,cost-and-financing,"<div class=""ExternalClassE335708125E743FDAA331...",Patients or family members who have difficulty...,True
5,1435005,Enhancement for Active Seniors (EASE) by HDB,Enhancement for Active Seniors (EASE) by HDB,cost-and-financing,"<div class=""ExternalClass26C3FCBE3D3D46728E80B...",Introduction to the Enhancement for Active Sen...,True
6,1434994,Intermediate and Long-Term Care Services Subsi...,Intermediate and Long-Term Care Services Subsi...,cost-and-financing,"<div class=""ExternalClassCD5BE21C38D64C5BB4909...",What are Intermediate and Long-Term Care Servi...,True
...,...,...,...,...,...,...,...
2602,1440797,Cancer (Supportive Care and Other Services),Cancer (Supportive Care and Other Services),support-group-and-others,"<div class=""ExternalClass3EFDC48BF0184F1294A48...",Palliative Care\nWhat is the meaning of pallia...,True
2605,1440759,Cancer Patient Support Groups,Cancer Patient Support Groups,support-group-and-others,"<div class=""ExternalClass97D8B36FEAD24A8A8CA16...",Finding Support\nBeing diagnosed with cancer i...,True
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,support-group-and-others,"<div class=""ExternalClass7C92735B78174928B287D...",Brain Tumour Society (Singapore)\nThe Brain Tu...,True
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,support-group-and-others,"<div class=""ExternalClass3FABAC9D59A64BCAB96D3...",The Importance of Ambulatory Nutrition Support...,True


In [10]:
df_extracted["has_nested_div"].value_counts()


has_nested_div
[3;92mTrue[0m     [1;36m1094[0m
[3;91mFalse[0m    [1;36m1090[0m
Name: count, dtype: int64

## Inspecting for Poor Text Extraction


### Using the Word Count of the Largest Paragraph in each Article as a Heuristic


In [11]:
def max_paragraph_size(row):
    article = row["extracted_content_body"]
    if article is None:
        return 0

    # print(type(article))
    paragraphs = article.split("\n")
    if paragraphs is None:
        return 0

    max_paragraph_size = 0
    for paragraph in paragraphs:
        paragraph_size = len(paragraph.split())
        max_paragraph_size = max(max_paragraph_size, paragraph_size)

    return max_paragraph_size

In [12]:
df_extracted["max_paragraph_size"] = df_extracted.apply(max_paragraph_size, axis=1)

In [13]:
display(df_extracted)

Unnamed: 0,id,content_name,title,content_category,content_body,extracted_content_body,has_nested_div,max_paragraph_size
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,cost-and-financing,"<div class=""ExternalClass07C58E0D957B4AA7B14FC...",Breast cancer is the number one cancer among w...,True,60
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,cost-and-financing,"<div class=""ExternalClassE1D82270F17241E495537...",MediSave Maternity Package\nWith the MediSave ...,False,102
2,1434993,MediSave,MediSave,cost-and-financing,"<div class=""ExternalClass67AD25F1F8B64B349E515...","What is MediSave?\nMediSave, introduced in Apr...",True,43
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,cost-and-financing,"<div class=""ExternalClassE335708125E743FDAA331...",Patients or family members who have difficulty...,True,65
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,cost-and-financing,<h2>What is the Community Health Assist Scheme...,What is the Community Health Assist Scheme (CH...,False,107
...,...,...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,Breast Cancer Support Groups,support-group-and-others,"<div class=""ExternalClassE9019B25CD2A40F3948CD...","NCIS Breast Support Group provides knowledge, ...",False,51
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,support-group-and-others,"<div class=""ExternalClassFC126593610D4F0587A4B...",Heart failure is the leading cause of rehospit...,False,79
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,support-group-and-others,"<div class=""ExternalClass7C92735B78174928B287D...",Brain Tumour Society (Singapore)\nThe Brain Tu...,True,39
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,support-group-and-others,"<div class=""ExternalClass3FABAC9D59A64BCAB96D3...",The Importance of Ambulatory Nutrition Support...,True,53


### Visualisng the `Max Paragraph Size` of each article (Log-normal)


In [14]:
fig = px.histogram(df_extracted, x="max_paragraph_size", nbins=100)
fig.update_layout(
    title_text="Paragraph word count distribution",
    xaxis_title_text="Word count",
    yaxis_title_text="Count",
    bargap=0.1,
)

fig.show()

#### Inspecting the tails of the distribution


In [15]:
print(
    df_extracted["max_paragraph_size"].quantile(
        [0.001, 0.021, 0.136, 0.5, 0.841, 0.977, 0.999]
    )
)

0.001     11.000
0.021     27.000
0.136     45.000
0.500     66.000
0.841     93.000
0.977    136.000
0.999    363.634
Name: max_paragraph_size, dtype: float64


### Visualising the Normal Distribution using `log_max_paragraph_size`


In [16]:
df_extracted["log_max_paragraph_size"] = np.log(df_extracted["max_paragraph_size"])

#### Setting the Lower & Upper Threshold


In [17]:
import math

In [18]:
lower_threshold = np.log(
    math.ceil(df_extracted["max_paragraph_size"].quantile([0.025]).values[0] / 10) * 10
)
print(lower_threshold)

3.4011973816621555


In [19]:
upper_threshold = np.log(
    math.floor(df_extracted["max_paragraph_size"].quantile([0.975]).values[0] / 10) * 10
)
print(upper_threshold)

4.867534450455582


In [20]:
fig = px.histogram(df_extracted, x="log_max_paragraph_size", nbins=100)
fig.update_layout(
    title_text=f"log(word_count) distribution for Paragraph: {upper_threshold}",
    xaxis_title_text="log(word_count)",
    yaxis_title_text="Count",
    bargap=0.1,
)
fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [21]:
# Presentation of Percentile for 0, 1, 2, 3 std dev from mean
print(
    df_extracted["log_max_paragraph_size"].quantile(
        [0.001, 0.021, 0.136, 0.5, 0.841, 0.977, 0.999]
    )
)

0.001    2.397895
0.021    3.295837
0.136    3.806662
0.500    4.189655
0.841    4.532599
0.977    4.912655
0.999    5.896146
Name: log_max_paragraph_size, dtype: float64


#### Adding the `percentile` rank for each record


In [22]:
df_extracted["percentile_rank"] = df_extracted["log_max_paragraph_size"].rank(
    pct=True, ascending=True
)

In [23]:
display(df_extracted)

Unnamed: 0,id,content_name,title,content_category,content_body,extracted_content_body,has_nested_div,max_paragraph_size,log_max_paragraph_size,percentile_rank
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,cost-and-financing,"<div class=""ExternalClass07C58E0D957B4AA7B14FC...",Breast cancer is the number one cancer among w...,True,60,4.094345,0.339973
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,cost-and-financing,"<div class=""ExternalClassE1D82270F17241E495537...",MediSave Maternity Package\nWith the MediSave ...,False,102,4.624973,0.907967
2,1434993,MediSave,MediSave,cost-and-financing,"<div class=""ExternalClass67AD25F1F8B64B349E515...","What is MediSave?\nMediSave, introduced in Apr...",True,43,3.761200,0.110806
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,cost-and-financing,"<div class=""ExternalClassE335708125E743FDAA331...",Patients or family members who have difficulty...,True,65,4.174387,0.413004
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,cost-and-financing,<h2>What is the Community Health Assist Scheme...,What is the Community Health Assist Scheme (CH...,False,107,4.672829,0.931090
...,...,...,...,...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,Breast Cancer Support Groups,support-group-and-others,"<div class=""ExternalClassE9019B25CD2A40F3948CD...","NCIS Breast Support Group provides knowledge, ...",False,51,3.931826,0.221383
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,support-group-and-others,"<div class=""ExternalClassFC126593610D4F0587A4B...",Heart failure is the leading cause of rehospit...,False,79,4.369448,0.709478
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,support-group-and-others,"<div class=""ExternalClass7C92735B78174928B287D...",Brain Tumour Society (Singapore)\nThe Brain Tu...,True,39,3.663562,0.080815
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,support-group-and-others,"<div class=""ExternalClass3FABAC9D59A64BCAB96D3...",The Importance of Ambulatory Nutrition Support...,True,53,3.970292,0.244734


### Inspection articles higher than the upper threshold


In [24]:
df_inspect_higher = df_extracted[
    df_extracted["log_max_paragraph_size"] > upper_threshold
]
df_inspect_higher.sort_values(by="max_paragraph_size", ascending=False, inplace=True)

print(df_inspect_higher.shape)

(59, 10)


In [25]:
display(df_inspect_higher)

Unnamed: 0,id,content_name,title,content_category,content_body,extracted_content_body,has_nested_div,max_paragraph_size,log_max_paragraph_size,percentile_rank
537,1445442,Understanding Health Supplements,Understanding Health Supplements,live-healthy-articles,"<div class=""ExternalClassF94BD7633A8B46E2A3436...",What Are Health Supplements?\nHealth or dietar...,False,624,6.43615,1.0
218,1437557,Uterine Cancer,Uterine Cancer,diseases-and-conditions,"<div class=""ExternalClass25D7983FE7294530A526F...",Cancer of the uterus is the 3rd most common ca...,False,510,6.234411,0.999542
1478,1442965,Saving a limb,Saving a limb,live-healthy-articles,"<div class=""ExternalClass9622EDC3AD6D4B2DA9D80...",Dr Sadhana Chandrasekar is a senior consultant...,False,364,5.897154,0.999084
546,1444636,Why Is Sleep Important for Kids?,Why Is Sleep Important for Kids?,live-healthy-articles,"<div class=""ExternalClass75413B19A5234323A00E7...",Importance of Sleep for Children\nChildren nee...,True,362,5.891644,0.998626
872,1443482,"Caregivers Must Take Care, Too","Caregivers Must Take Care, Too",live-healthy-articles,"<div class=""ExternalClassC9738886EC5E4F0E92DC7...",Caregiving for the elderly with dementia is a ...,False,258,5.55296,0.998168
915,1443348,Healthy Ageing: Eating Well for Our Age,Healthy Ageing: Eating Well for Our Age,live-healthy-articles,"<div class=""ExternalClassF7D948846E3345389C990...","The silver years can be immensely fulfilling, ...",False,230,5.438079,0.997711
1896,1440521,Homatropine Eye Drop,Homatropine Eye Drop,medications,"<div class=""ExternalClassF75B08D3AB8A4BA6BAE9C...",What is this medication for?\nHomatropine belo...,False,215,5.370638,0.997253
1110,1442770,Eating for Healthy Ageing,Eating for Healthy Ageing,live-healthy-articles,"<div class=""ExternalClass9690BD1F77AC441D862B3...",Eat Well to Live Young\nDo you yearn to be you...,False,209,5.342334,0.996795
2091,1470769,Doxorubicin,Doxorubicin,medications,<h2>What is this medication for?</h2><p>Doxoru...,What is this medication for?\nDoxorubicin is a...,False,207,5.332719,0.996337
2070,1470786,Paclitaxel,Paclitaxel,medications,<h2>What is this medication for?</h2><p>Paclit...,What is this medication for?\nPaclitaxel is a ...,False,206,5.327876,0.99565


In [26]:
fig = px.histogram(df_inspect_higher, x="max_paragraph_size", nbins=50)
fig.update_layout(
    title_text="Paragraph word count distribution",
    xaxis_title_text="Word count",
    yaxis_title_text="Count",
    bargap=0.1,
)

fig.show()

In [27]:
# Export file to Excel

# df_inspect_higher.to_excel("inspection.xlsx")

#### Remove articles with nested div containers


In [28]:
df_inspect_higher["has_nested_div"] = df_inspect_higher.apply(
    lambda row: check_for_nested_divs(row["content_body"]), axis=1
)

df_no_nested_divs = df_inspect_higher[~df_inspect_higher["has_nested_div"]]
display(
    df_no_nested_divs.sort_values(
        by=["max_paragraph_size"], ascending=False, inplace=False
    )
)

Unnamed: 0,id,content_name,title,content_category,content_body,extracted_content_body,has_nested_div,max_paragraph_size,log_max_paragraph_size,percentile_rank
537,1445442,Understanding Health Supplements,Understanding Health Supplements,live-healthy-articles,"<div class=""ExternalClassF94BD7633A8B46E2A3436...",What Are Health Supplements?\nHealth or dietar...,False,624,6.43615,1.0
218,1437557,Uterine Cancer,Uterine Cancer,diseases-and-conditions,"<div class=""ExternalClass25D7983FE7294530A526F...",Cancer of the uterus is the 3rd most common ca...,False,510,6.234411,0.999542
1478,1442965,Saving a limb,Saving a limb,live-healthy-articles,"<div class=""ExternalClass9622EDC3AD6D4B2DA9D80...",Dr Sadhana Chandrasekar is a senior consultant...,False,364,5.897154,0.999084
872,1443482,"Caregivers Must Take Care, Too","Caregivers Must Take Care, Too",live-healthy-articles,"<div class=""ExternalClassC9738886EC5E4F0E92DC7...",Caregiving for the elderly with dementia is a ...,False,258,5.55296,0.998168
915,1443348,Healthy Ageing: Eating Well for Our Age,Healthy Ageing: Eating Well for Our Age,live-healthy-articles,"<div class=""ExternalClassF7D948846E3345389C990...","The silver years can be immensely fulfilling, ...",False,230,5.438079,0.997711
1896,1440521,Homatropine Eye Drop,Homatropine Eye Drop,medications,"<div class=""ExternalClassF75B08D3AB8A4BA6BAE9C...",What is this medication for?\nHomatropine belo...,False,215,5.370638,0.997253
1110,1442770,Eating for Healthy Ageing,Eating for Healthy Ageing,live-healthy-articles,"<div class=""ExternalClass9690BD1F77AC441D862B3...",Eat Well to Live Young\nDo you yearn to be you...,False,209,5.342334,0.996795
2091,1470769,Doxorubicin,Doxorubicin,medications,<h2>What is this medication for?</h2><p>Doxoru...,What is this medication for?\nDoxorubicin is a...,False,207,5.332719,0.996337
2090,1470790,Nab-Paclitaxel,Nab-Paclitaxel,medications,<h2>What is this medication for?</h2><p>Nab-Pa...,What is this medication for?\nNab-Paclitaxel i...,False,206,5.327876,0.99565
2070,1470786,Paclitaxel,Paclitaxel,medications,<h2>What is this medication for?</h2><p>Paclit...,What is this medication for?\nPaclitaxel is a ...,False,206,5.327876,0.99565


In [29]:
df_inspect_higher["has_nested_div"].value_counts()


has_nested_div
[3;91mFalse[0m    [1;36m37[0m
[3;92mTrue[0m     [1;36m22[0m
Name: count, dtype: int64

In [30]:
query = df_no_nested_divs
extracted_text = query.iloc[0]["extracted_content_body"]

print(extracted_text)

What Are Health Supplements?
Health or dietary supplements refer to a diverse group of products commonly consumed for the purpose of supplementing the diet and enhancing health. These products typically contain ingredients from natural sources and are not meant to prevent, treat, cure or alleviate the symptoms of medical diseases or conditions.
They usually come in dosage forms such as capsules, soft gels and tablets. Some examples of health supplements include vitamins, minerals (e.g. calcium, iron, magnesium) and herbal supplements (e.g. Echinacea, Guarana).

How Are Health Supplements Regulated in Singapore?
As ingredients used in health supplements are generally well-established through the experience of safe use and are not intended for medicinal purposes, health supplements do not require approval and are not evaluated by the Health Sciences Authority (HSA) before they can be sold locally. This approach is similar to that found in regulatory systems of developed countries such as

In [31]:
raw_html = query.iloc[0]["content_body"]

print(raw_html)

<div class="ExternalClassF94BD7633A8B46E2A34361B3E5158140"><h2>What Are Health Supplements?</h2><p>Health or dietary supplements refer to a diverse group of products commonly consumed for the purpose of supplementing the diet and enhancing health. These products typically contain ingredients from natural sources and are not meant to prevent, treat, cure or alleviate the symptoms of medical diseases or conditions. </p><p>They usually come in dosage forms such as capsules, soft gels and tablets. Some examples of health supplements include vitamins, minerals (e.g. calcium, iron, magnesium) and herbal supplements (e.g. Echinacea, Guarana).</p><h2>How Are Health Supplements Regulated in Singapore?</h2><p>As ingredients used in health supplements are generally well-established through the experience of safe use and are not intended for medicinal purposes, health supplements do not require approval and are not evaluated by the Health Sciences Authority (HSA) before they can be sold locally. T

### Inspection articles below the lower threshold


In [32]:
df_inspect_lower = df_extracted[
    df_extracted["log_max_paragraph_size"] < lower_threshold
]
df_inspect_lower.sort_values(by="max_paragraph_size", ascending=True, inplace=True)

print(df_inspect_lower.shape)

(57, 10)


In [33]:
display(df_inspect_lower)

Unnamed: 0,id,content_name,title,content_category,content_body,extracted_content_body,has_nested_div,max_paragraph_size,log_max_paragraph_size,percentile_rank
51,1437716,Diabetes (Pocket Guide),Diabetes (Pocket Guide),diseases-and-conditions,"<div class=""ExternalClass86B763FC75F942DCB290A...",What is Type 2 Diabetes?\n\n\nInsulin and Diab...,False,8,2.079442,0.000458
2529,1434636,Korang OK?,Korang OK?,programs,"<div class=""ExternalClassD7E0A54AFAB34654A78FF...",- Home\n- Healthier Choices for Your Family\n-...,True,11,2.397895,0.001374
2288,1434785,Parent Hub: We're Expecting - Resources_resour...,Parent Hub: We're Expecting - Resources,program-sub-pages,"<div class=""ExternalClassF8FB45994C7A41C9B531C...",Resources and Activities\n\nWE'RE EXPECTING\n\...,True,11,2.397895,0.001374
2290,1434869,Parent Hub: Stay Positive - Resources_resource...,Parent Hub: Stay Positive - Resources,program-sub-pages,"<div class=""ExternalClass3ACB5E80C4274896A0956...",Resources and Activities\n\nSTAY POSITIVE\n\nC...,True,11,2.397895,0.001374
2424,1434983,Take the first step with your loved ones_menta...,Take the first step with your loved ones,program-sub-pages,"<div class=""ExternalClass569899BD28494292B8F34...",- Menu\n- Home\n- Healthy Eating\n- Physical A...,True,13,2.564949,0.002289
1416,1445659,Fried Beehoon Chicken Bolognaise,Fried Beehoon Chicken Bolognaise,live-healthy-articles,"<div class=""ExternalClass06D6ACDDCD3947569B774...",Healthy ways with Bee Hoon and Pasta\n(4 Servi...,True,16,2.772589,0.002747
355,1437942,Healthcare Workforce Statistics,Healthcare Workforce Statistics,health-statistics,<p>An overview of Singapore’s healthcare workf...,An overview of Singapores healthcare workforce...,False,17,2.833213,0.003434
2434,1435175,Travelling overseas | Diabetes Hub_travelling-...,Travelling overseas | Diabetes Hub,program-sub-pages,"<div class=""ExternalClass7076CE98F3104E748C6B2...",3 BES TO BEAT DIABETES\nOn this page\n1. How t...,True,17,2.833213,0.003434
123,1437730,Stroke: Areas of the Brain,Stroke: Areas of the Brain,diseases-and-conditions,"<div class=""ExternalClassFACA9B6C97914D2BB5BD8...",About the Brain\nThe brain has two sides (righ...,True,18,2.890372,0.004121
918,1443235,Signs and Symptoms of Colorectal Cancer,Signs and Symptoms of Colorectal Cancer,live-healthy-articles,"<div class=""ExternalClass879B4905FCAE42A88B4DD...",Are You at Risk of Colorectal Cancer?\n- Infla...,False,19,2.944439,0.005723


In [34]:
extracted_text = df_inspect_lower.iloc[0]["extracted_content_body"]

print(extracted_text)

What is Type 2 Diabetes?


Insulin and Diabetes

What Causes Insulin Resistance?

What Causes Insulin Shortage?

Risk Factors for Diabetes

Common Signs of Diabetes

Diabetes Complications

Microvascular Complications

Macrovascular Complications

Complications of Diabetes Management


Hypoglycemia

Hyperglycemia and Sick Day Management

Balancing Your Diet With My Healthy Plate

Eating Tips

What Carbs to Go For

High-Fibre Food

How Much Fruit Can I Eat?

What's 1 Serving of Fruit?

How Much Sugar is in Your Drink?

Healthy Hawker Hacks

Healthier Snacks

Know Your Calories: Chinese New Year (CNY) Goodies

Simple Weight Management Tips

Simple Exercises for Beginners

Principles of Exercise

Risky Lifestyle Choices to Avoid

Diabetes Foot Care

Foot Problems

Foot Care: Good Practices

Foot Care: Harmful Practices

Proper Foot Care

Role of a Podiatrist

Annual Foot and Eye Screening

Treating Diabetes

Things to Note About Medication and Insulin

Suitable Sites for Insulin Injection

In [35]:
raw_html = df_inspect_lower.iloc[0]["content_body"]

print(raw_html)

<div class="ExternalClass86B763FC75F942DCB290A3AFDA0724D0"><h2>What is Type 2 Diabetes?</h2><h3 id="how-insulin-works"></h3><p style="text-align:center;">
      <a href="https://ch-api.healthhub.sg/api/public/content/ab6bc1b564764346b2a865c79c23efa0?v=7e08169d" title="How Insulin Works"> 
         <img src="https://ch-api.healthhub.sg/api/public/content/ab6bc1b564764346b2a865c79c23efa0?v=7e08169d" alt="How Insulin Works" /></a> </p><h3 id="insulin-and-diabetes">Insulin and Diabetes</h3><p style="text-align:center;">
      <a href="https://ch-api.healthhub.sg/api/public/content/992a0d177cdf4eac8d7b5e7f89efad83?v=9203a638" title="Insulin and Diabetes"> 
         <img src="https://ch-api.healthhub.sg/api/public/content/992a0d177cdf4eac8d7b5e7f89efad83?v=9203a638" alt="Insulin and Diabetes" /></a> </p><h3 id="what-causes-insulin-resistance">What Causes Insulin Resistance?</h3><p style="text-align:center;">
      <a href="https://ch-api.healthhub.sg/api/public/content/9a0024fdc2774e0dbbbf06