# Monitoring the quality of the extracted text


## Import relevant libraries


In [1]:
import math

import numpy as np
import plotly.express as px
from bs4 import BeautifulSoup

## Functions to check for nested divs


In [2]:
def check_nested_divs_recursive(element):
    """Report whether ``element`` or a descendant is a div nested in a div.

    A "nested div" is a ``div`` whose direct parent is also a ``div``. The
    search only descends through direct children that are headings,
    paragraphs, lists, divs or spans; other tags (and everything below them)
    are not visited.

    Args:
        element: A parsed node exposing ``name``, ``parent`` and
            ``find_all(names, recursive=False)``, e.g. a BeautifulSoup tag.

    Returns:
        bool: True as soon as a nested div is found, otherwise False.
    """
    content_tags = [
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "ul",
        "ol",
        "div",
        "span",
    ]

    # A detached or root node has no parent, so guard before reading its name.
    parent = element.parent
    if element.name == "div" and parent is not None and parent.name == "div":
        return True

    # any() stops at the first positive child instead of walking the whole tree.
    return any(
        check_nested_divs_recursive(child)
        for child in element.find_all(content_tags, recursive=False)
    )

In [3]:
def check_for_nested_divs(html_content):
    """Tell whether an HTML fragment contains a div directly inside a div.

    The fragment is parsed with ``html.parser`` and every top-level heading,
    paragraph, list, div or span is handed to
    ``check_nested_divs_recursive``.

    Args:
        html_content: Raw HTML of one article, or a missing value.

    Returns:
        bool: True if any inspected tag reports a nested div; False
        otherwise, including when the content is missing.
    """
    # Missing cells may be None or a float NaN (the usual pandas marker for
    # missing data); neither can be parsed, so both mean "no nested div".
    if html_content is None or isinstance(html_content, float):
        return False

    soup = BeautifulSoup(html_content, "html.parser")
    top_level_tags = soup.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "div", "span"],
        recursive=False,
    )

    # Stop at the first tag that reports a nested div.
    return any(check_nested_divs_recursive(tag) for tag in top_level_tags)

In [4]:
def count_divs(html_content):
    """Count every ``<div>`` tag in an HTML fragment, at any depth.

    Args:
        html_content: Raw HTML of one article, or a missing value.

    Returns:
        int: Number of div tags found; 0 when the content is missing
        (``None`` or a float NaN).
    """
    # Return an integer (not False) for missing content so the resulting
    # column holds counts only.
    if html_content is None or isinstance(html_content, float):
        return 0

    soup = BeautifulSoup(html_content, "html.parser")

    # find_all searches the whole tree by default.
    return len(soup.find_all("div"))

## Load Merged Data from Kedro Catalog


In [5]:
%load_ext kedro.ipython

In [6]:
# ruff: noqa: F821
catalog.list()


[1m[[0m
    [32m'all_contents'[0m,
    [32m'missing_contents'[0m,
    [32m'all_contents_standardized'[0m,
    [32m'all_contents_added'[0m,
    [32m'all_contents_extracted'[0m,
    [32m'all_extracted_text'[0m,
    [32m'merged_data'[0m,
    [32m'raw_word_counts'[0m,
    [32m'log_word_counts'[0m,
    [32m'flag_for_removal_by_type'[0m,
    [32m'recipes_data'[0m,
    [32m'filtered_data_with_keywords'[0m,
    [32m'parameters'[0m,
    [32m'params:columns_to_keep'[0m,
    [32m'params:columns_to_keep.cost-and-financing'[0m,
    [32m'params:columns_to_keep.diseases-and-conditions'[0m,
    [32m'params:columns_to_keep.health-statistics'[0m,
    [32m'params:columns_to_keep.live-healthy-articles'[0m,
    [32m'params:columns_to_keep.medical-care-and-facilities'[0m,
    [32m'params:columns_to_keep.medications'[0m,
    [32m'params:columns_to_keep.program-sub-pages'[0m,
    [32m'params:columns_to_keep.programs'[0m,
    [32m'params:columns_to_keep.support-g

In [7]:
merged_data = catalog.load("merged_data")

display(merged_data)

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,to_remove,remove_type,has_table,has_image,related_sections,extracted_tables,extracted_links,extracted_headers,extracted_images,extracted_content_body
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/breas...,breast-cancer-screening-subsidies,Here’s all you need to know about breast cance...,"b'<div class=""ExternalClass07C58E0D957B4AA7B14...",...,False,,True,False,[Cancer Facts You Cannot Ignore],[[[National Healthcare Group (NHG) Polyclinics...,"[[Cancer Facts You Cannot Ignore, https://www....","[[Breast Cancer Screening, h2], [Subsidy for M...",[],Breast cancer is the number one cancer among w...
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/marri...,marriage_parenthood_scheme,New parents and couples looking to conceive ca...,"b'<div class=""ExternalClassE1D82270F17241E4955...",...,False,,True,False,"[MediSave, Baby Bonus - What You Need to Know,...","[[[Delivery Procedure, No. of Days of Hospital...","[[How to Submit Claims, https://crms.moh.gov.s...","[[MediSave Maternity Package, h2], [Examples o...",[],MediSave Maternity Package\nWith the MediSave ...
2,1434993,MediSave,MediSave,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/medisave,medisave,MediSave is the national medical savings schem...,"b'<div class=""ExternalClass67AD25F1F8B64B349E5...",...,False,,False,False,[A Way to Reduce Outpatient Cost for Chronic D...,,[[A Way to Reduce Outpatient Cost for Chronics...,"[[, h2], [What is MediSave?, h2], [Contributin...",[],"What is MediSave?\nMediSave, introduced in Apr..."
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/finan...,financial-assistance-for-local-patients-in-sin...,Having trouble paying your medical bill? Here’...,"b'<div class=""ExternalClassE335708125E743FDAA3...",...,False,,False,False,[],,"[[Medifund, https://www.healthhub.sg/a-z/costs...","[[Medifund: Hospital Bill Payment Assistance, ...",[],Patients or family members who have difficulty...
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,"Alerts and Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/chas,chas,"With a CHAS card, all Singapore citizens can r...",b'<h2>What is the Community Health Assist Sche...,...,False,,True,True,[All Pioneer Generation (PG) and Merdeka Gener...,"[[[CHAS Criteria, CHAS Green, CHAS Orange, CHA...","[[www.chas.sg/apply, https://www.chas.sg/apply...",[[What is the Community Health Assist Scheme (...,"[[chas blue card, https://ch-api.healthhub.sg/...",What is the Community Health Assist Scheme (CH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,transitional-care-programme-for-heart-failure,The team from NUHCS gives support to heart fai...,"b'<div class=""ExternalClassFC126593610D4F0587A...",...,False,,False,True,[],,"[[charles_wu@nuhs.edu.sg, mailto: charles_wu@n...",[[Transitional Care for Heart Failure Patients...,"[[Transitional care, amongst other things, pro...",Heart failure is the leading cause of rehospit...
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,2015-NNI-support-group,Read on for a list of brain injury support gro...,"b'<div class=""ExternalClass7C92735B78174928B28...",...,False,,False,False,[],,"[[here, https://www.nni.com.sg/patient-care/br...","[[, h2], [Brain Tumour Society (Singapore), h2...",[],Brain Tumour Society (Singapore)\nThe Brain Tu...
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,ambulatory-nutrition-support,Read about the ambulatory support benefits one...,"b'<div class=""ExternalClass3FABAC9D59A64BCAB96...",...,False,,False,False,[],,"[[Nutrition and Dietetics, https://www.healthh...",[[The Importance of Ambulatory Nutrition Suppo...,[],The Importance of Ambulatory Nutrition Support...
2611,1440766,LapBandits Support Group (Singapore),LapBandits Support Group (Singapore),"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,singapore-lapbandits-support-group,Have you just undergone bariatric surgery for ...,"b'<div class=""ExternalClassA4C749C7DB7647FBB6D...",...,True,Below Word Count,False,False,[],,[],[[About Khoo Teck Puat Hospitals LapBandits Su...,[],About Khoo Teck Puat Hospitals LapBandits Supp...


## Keep only rows where `to_remove` is False


In [8]:
df_keep = merged_data[~merged_data["to_remove"]]

display(df_keep)

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,to_remove,remove_type,has_table,has_image,related_sections,extracted_tables,extracted_links,extracted_headers,extracted_images,extracted_content_body
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/breas...,breast-cancer-screening-subsidies,Here’s all you need to know about breast cance...,"b'<div class=""ExternalClass07C58E0D957B4AA7B14...",...,False,,True,False,[Cancer Facts You Cannot Ignore],[[[National Healthcare Group (NHG) Polyclinics...,"[[Cancer Facts You Cannot Ignore, https://www....","[[Breast Cancer Screening, h2], [Subsidy for M...",[],Breast cancer is the number one cancer among w...
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/marri...,marriage_parenthood_scheme,New parents and couples looking to conceive ca...,"b'<div class=""ExternalClassE1D82270F17241E4955...",...,False,,True,False,"[MediSave, Baby Bonus - What You Need to Know,...","[[[Delivery Procedure, No. of Days of Hospital...","[[How to Submit Claims, https://crms.moh.gov.s...","[[MediSave Maternity Package, h2], [Examples o...",[],MediSave Maternity Package\nWith the MediSave ...
2,1434993,MediSave,MediSave,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/medisave,medisave,MediSave is the national medical savings schem...,"b'<div class=""ExternalClass67AD25F1F8B64B349E5...",...,False,,False,False,[A Way to Reduce Outpatient Cost for Chronic D...,,[[A Way to Reduce Outpatient Cost for Chronics...,"[[, h2], [What is MediSave?, h2], [Contributin...",[],"What is MediSave?\nMediSave, introduced in Apr..."
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/finan...,financial-assistance-for-local-patients-in-sin...,Having trouble paying your medical bill? Here’...,"b'<div class=""ExternalClassE335708125E743FDAA3...",...,False,,False,False,[],,"[[Medifund, https://www.healthhub.sg/a-z/costs...","[[Medifund: Hospital Bill Payment Assistance, ...",[],Patients or family members who have difficulty...
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,"Alerts and Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/chas,chas,"With a CHAS card, all Singapore citizens can r...",b'<h2>What is the Community Health Assist Sche...,...,False,,True,True,[All Pioneer Generation (PG) and Merdeka Gener...,"[[[CHAS Criteria, CHAS Green, CHAS Orange, CHA...","[[www.chas.sg/apply, https://www.chas.sg/apply...",[[What is the Community Health Assist Scheme (...,"[[chas blue card, https://ch-api.healthhub.sg/...",What is the Community Health Assist Scheme (CH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,Breast Cancer Support Groups,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,Breast-cancer-support-group-singapore,Learn how breast cancer support groups help br...,"b'<div class=""ExternalClassE9019B25CD2A40F3948...",...,False,,False,False,[],,"[[here, https://www.ncis.com.sg/events/Pages/E...","[[Breast Cancer Support at NCIS, h2], [Our Mis...",[],"NCIS Breast Support Group provides knowledge, ..."
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,transitional-care-programme-for-heart-failure,The team from NUHCS gives support to heart fai...,"b'<div class=""ExternalClassFC126593610D4F0587A...",...,False,,False,True,[],,"[[charles_wu@nuhs.edu.sg, mailto: charles_wu@n...",[[Transitional Care for Heart Failure Patients...,"[[Transitional care, amongst other things, pro...",Heart failure is the leading cause of rehospit...
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,2015-NNI-support-group,Read on for a list of brain injury support gro...,"b'<div class=""ExternalClass7C92735B78174928B28...",...,False,,False,False,[],,"[[here, https://www.nni.com.sg/patient-care/br...","[[, h2], [Brain Tumour Society (Singapore), h2...",[],Brain Tumour Society (Singapore)\nThe Brain Tu...
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,ambulatory-nutrition-support,Read about the ambulatory support benefits one...,"b'<div class=""ExternalClass3FABAC9D59A64BCAB96...",...,False,,False,False,[],,"[[Nutrition and Dietetics, https://www.healthh...",[[The Importance of Ambulatory Nutrition Suppo...,[],The Importance of Ambulatory Nutrition Support...


## Keep only relevant Content Categories


In [9]:
relevant_categories = [
    "cost-and-financing",
    "live-healthy-articles",
    "diseases-and-conditions",
    "medical-care-and-facilities",
    "support-group-and-others",
]

df_keep = df_keep[df_keep["content_category"].isin(relevant_categories)]

display(df_keep)

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,to_remove,remove_type,has_table,has_image,related_sections,extracted_tables,extracted_links,extracted_headers,extracted_images,extracted_content_body
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/breas...,breast-cancer-screening-subsidies,Here’s all you need to know about breast cance...,"b'<div class=""ExternalClass07C58E0D957B4AA7B14...",...,False,,True,False,[Cancer Facts You Cannot Ignore],[[[National Healthcare Group (NHG) Polyclinics...,"[[Cancer Facts You Cannot Ignore, https://www....","[[Breast Cancer Screening, h2], [Subsidy for M...",[],Breast cancer is the number one cancer among w...
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/marri...,marriage_parenthood_scheme,New parents and couples looking to conceive ca...,"b'<div class=""ExternalClassE1D82270F17241E4955...",...,False,,True,False,"[MediSave, Baby Bonus - What You Need to Know,...","[[[Delivery Procedure, No. of Days of Hospital...","[[How to Submit Claims, https://crms.moh.gov.s...","[[MediSave Maternity Package, h2], [Examples o...",[],MediSave Maternity Package\nWith the MediSave ...
2,1434993,MediSave,MediSave,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/medisave,medisave,MediSave is the national medical savings schem...,"b'<div class=""ExternalClass67AD25F1F8B64B349E5...",...,False,,False,False,[A Way to Reduce Outpatient Cost for Chronic D...,,[[A Way to Reduce Outpatient Cost for Chronics...,"[[, h2], [What is MediSave?, h2], [Contributin...",[],"What is MediSave?\nMediSave, introduced in Apr..."
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/finan...,financial-assistance-for-local-patients-in-sin...,Having trouble paying your medical bill? Here’...,"b'<div class=""ExternalClassE335708125E743FDAA3...",...,False,,False,False,[],,"[[Medifund, https://www.healthhub.sg/a-z/costs...","[[Medifund: Hospital Bill Payment Assistance, ...",[],Patients or family members who have difficulty...
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,"Alerts and Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/costs-and-financi...,www.healthhub.sg/a-z/costs-and-financing/chas,chas,"With a CHAS card, all Singapore citizens can r...",b'<h2>What is the Community Health Assist Sche...,...,False,,True,True,[All Pioneer Generation (PG) and Merdeka Gener...,"[[[CHAS Criteria, CHAS Green, CHAS Orange, CHA...","[[www.chas.sg/apply, https://www.chas.sg/apply...",[[What is the Community Health Assist Scheme (...,"[[chas blue card, https://ch-api.healthhub.sg/...",What is the Community Health Assist Scheme (CH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,Breast Cancer Support Groups,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,Breast-cancer-support-group-singapore,Learn how breast cancer support groups help br...,"b'<div class=""ExternalClassE9019B25CD2A40F3948...",...,False,,False,False,[],,"[[here, https://www.ncis.com.sg/events/Pages/E...","[[Breast Cancer Support at NCIS, h2], [Our Mis...",[],"NCIS Breast Support Group provides knowledge, ..."
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,transitional-care-programme-for-heart-failure,The team from NUHCS gives support to heart fai...,"b'<div class=""ExternalClassFC126593610D4F0587A...",...,False,,False,True,[],,"[[charles_wu@nuhs.edu.sg, mailto: charles_wu@n...",[[Transitional Care for Heart Failure Patients...,"[[Transitional care, amongst other things, pro...",Heart failure is the leading cause of rehospit...
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,"Alerts & Advisories,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,2015-NNI-support-group,Read on for a list of brain injury support gro...,"b'<div class=""ExternalClass7C92735B78174928B28...",...,False,,False,False,[],,"[[here, https://www.nni.com.sg/patient-care/br...","[[, h2], [Brain Tumour Society (Singapore), h2...",[],Brain Tumour Society (Singapore)\nThe Brain Tu...
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,"Conditions and Illnesses,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/a-z/support-groups-an...,www.healthhub.sg/a-z/support-groups-and-others...,ambulatory-nutrition-support,Read about the ambulatory support benefits one...,"b'<div class=""ExternalClass3FABAC9D59A64BCAB96...",...,False,,False,False,[],,"[[Nutrition and Dietetics, https://www.healthh...",[[The Importance of Ambulatory Nutrition Suppo...,[],The Importance of Ambulatory Nutrition Support...


## Keep Relevant Columns


In [10]:
# Take an explicit copy: later cells add derived columns to df_extracted, and
# writing them onto a slice of df_keep triggers pandas' chained-assignment
# warning.
df_extracted = df_keep[
    [
        "id",
        "content_name",
        "full_url",
        "title",
        "content_category",
        "content_body",
        "extracted_content_body",
    ]
].copy()

display(df_extracted)

Unnamed: 0,id,content_name,full_url,title,content_category,content_body,extracted_content_body
0,1435040,Breast Screening Subsidies in Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,Breast Screening Subsidies in Singapore,cost-and-financing,"b'<div class=""ExternalClass07C58E0D957B4AA7B14...",Breast cancer is the number one cancer among w...
1,1435071,Marriage and Parenthood Schemes,https://www.healthhub.sg/a-z/costs-and-financi...,Marriage and Parenthood Schemes,cost-and-financing,"b'<div class=""ExternalClassE1D82270F17241E4955...",MediSave Maternity Package\nWith the MediSave ...
2,1434993,MediSave,https://www.healthhub.sg/a-z/costs-and-financi...,MediSave,cost-and-financing,"b'<div class=""ExternalClass67AD25F1F8B64B349E5...","What is MediSave?\nMediSave, introduced in Apr..."
3,1435031,Hospital Bills Financial Assistance in Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,Hospital Bills Financial Assistance in Singapore,cost-and-financing,"b'<div class=""ExternalClassE335708125E743FDAA3...",Patients or family members who have difficulty...
4,1435043,Community Health Assist Scheme (CHAS) Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,Community Health Assist Scheme (CHAS) Singapore,cost-and-financing,b'<h2>What is the Community Health Assist Sche...,What is the Community Health Assist Scheme (CH...
...,...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,https://www.healthhub.sg/a-z/support-groups-an...,Breast Cancer Support Groups,support-group-and-others,"b'<div class=""ExternalClassE9019B25CD2A40F3948...","NCIS Breast Support Group provides knowledge, ..."
2608,1440763,Heart Failure Transitional Care Programme,https://www.healthhub.sg/a-z/support-groups-an...,Heart Failure Transitional Care Programme,support-group-and-others,"b'<div class=""ExternalClassFC126593610D4F0587A...",Heart failure is the leading cause of rehospit...
2609,1440791,Brain and Head Injury Support Groups,https://www.healthhub.sg/a-z/support-groups-an...,Brain and Head Injury Support Groups,support-group-and-others,"b'<div class=""ExternalClass7C92735B78174928B28...",Brain Tumour Society (Singapore)\nThe Brain Tu...
2610,1440768,Ambulatory Nutrition Support,https://www.healthhub.sg/a-z/support-groups-an...,Ambulatory Nutrition Support,support-group-and-others,"b'<div class=""ExternalClass3FABAC9D59A64BCAB96...",The Importance of Ambulatory Nutrition Support...


In [11]:
print(df_extracted.groupby(["content_category"])["id"].count())

content_category
cost-and-financing               23
diseases-and-conditions         310
live-healthy-articles          1107
medical-care-and-facilities      57
support-group-and-others         12
Name: id, dtype: int64


## Count the number of div tags in article


In [12]:
# Only the raw HTML column is needed, so map over that Series directly rather
# than building a row object for every article.
df_extracted["div_count"] = df_extracted["content_body"].apply(count_divs)

In [13]:
display(df_extracted)

Unnamed: 0,id,content_name,full_url,title,content_category,content_body,extracted_content_body,div_count
0,1435040,Breast Screening Subsidies in Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,Breast Screening Subsidies in Singapore,cost-and-financing,"b'<div class=""ExternalClass07C58E0D957B4AA7B14...",Breast cancer is the number one cancer among w...,11
1,1435071,Marriage and Parenthood Schemes,https://www.healthhub.sg/a-z/costs-and-financi...,Marriage and Parenthood Schemes,cost-and-financing,"b'<div class=""ExternalClassE1D82270F17241E4955...",MediSave Maternity Package\nWith the MediSave ...,1
2,1434993,MediSave,https://www.healthhub.sg/a-z/costs-and-financi...,MediSave,cost-and-financing,"b'<div class=""ExternalClass67AD25F1F8B64B349E5...","What is MediSave?\nMediSave, introduced in Apr...",19
3,1435031,Hospital Bills Financial Assistance in Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,Hospital Bills Financial Assistance in Singapore,cost-and-financing,"b'<div class=""ExternalClassE335708125E743FDAA3...",Patients or family members who have difficulty...,11
4,1435043,Community Health Assist Scheme (CHAS) Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,Community Health Assist Scheme (CHAS) Singapore,cost-and-financing,b'<h2>What is the Community Health Assist Sche...,What is the Community Health Assist Scheme (CH...,0
...,...,...,...,...,...,...,...,...
2607,1440774,Breast Cancer Support Groups,https://www.healthhub.sg/a-z/support-groups-an...,Breast Cancer Support Groups,support-group-and-others,"b'<div class=""ExternalClassE9019B25CD2A40F3948...","NCIS Breast Support Group provides knowledge, ...",1
2608,1440763,Heart Failure Transitional Care Programme,https://www.healthhub.sg/a-z/support-groups-an...,Heart Failure Transitional Care Programme,support-group-and-others,"b'<div class=""ExternalClassFC126593610D4F0587A...",Heart failure is the leading cause of rehospit...,1
2609,1440791,Brain and Head Injury Support Groups,https://www.healthhub.sg/a-z/support-groups-an...,Brain and Head Injury Support Groups,support-group-and-others,"b'<div class=""ExternalClass7C92735B78174928B28...",Brain Tumour Society (Singapore)\nThe Brain Tu...,136
2610,1440768,Ambulatory Nutrition Support,https://www.healthhub.sg/a-z/support-groups-an...,Ambulatory Nutrition Support,support-group-and-others,"b'<div class=""ExternalClass3FABAC9D59A64BCAB96...",The Importance of Ambulatory Nutrition Support...,28


In [None]:
# Articles with at least this many <div> tags are treated as div-heavy in the
# cells below.
div_count_threshold = 5

# Distribution of div counts, with the threshold drawn as a dashed line.
fig = (
    px.histogram(df_extracted, x="div_count", nbins=40)
    .update_layout(
        title_text="Article div count distribution",
        xaxis_title_text="Div count",
        yaxis_title_text="Count",
        bargap=0.1,
    )
    .add_vline(x=div_count_threshold, line_dash="dash", line_color="firebrick")
)
fig.show()

In [None]:
div_counts_by_category = (
    df_extracted[df_extracted["div_count"] >= div_count_threshold]
    .groupby(["content_category"])["div_count"]
    .count()
)

print(div_counts_by_category)

## Check for Nested divs in article


In [None]:
# The check only reads the raw HTML column, so map over that Series directly
# instead of applying a lambda to every row.
df_extracted["has_nested_div"] = df_extracted["content_body"].apply(
    check_for_nested_divs
)

display(df_extracted)

In [None]:
print(df_extracted["has_nested_div"].value_counts())

In [None]:
# Articles below the div-count threshold that still contain nested divs.
# Strictly "<": counts equal to the threshold are already treated as
# exceeding it (">=") elsewhere in this notebook.
df_extracted[
    (df_extracted["div_count"] < div_count_threshold)
    & (df_extracted["has_nested_div"])
]

In [None]:
# Per-category number of articles that are below the div-count threshold but
# still contain nested divs. Strictly "<": counts equal to the threshold are
# already treated as exceeding it (">=") elsewhere in this notebook.
nested_divs_by_category = (
    df_extracted[
        (df_extracted["div_count"] < div_count_threshold)
        & (df_extracted["has_nested_div"])
    ]
    .groupby(["content_category"])["has_nested_div"]
    .count()
)

print(nested_divs_by_category)

## Flag articles by div_count and has_nested_divs


In [None]:
def flag_articles(df, threshold):
    """Flag articles with many divs and/or nested divs.

    Writes two columns onto ``df`` (in place) and returns the same frame:

    * ``flagged`` - True when ``div_count >= threshold`` or
      ``has_nested_div`` is truthy.
    * ``type`` - the reasons joined by ", ", or a missing value when the
      article is not flagged.

    Both columns are recomputed from scratch on every call, so re-running
    the cell neither repeats reason text nor keeps stale flags.

    Args:
        df: Frame with ``div_count`` and ``has_nested_div`` columns.
        threshold: Minimum div count that gets an article flagged.

    Returns:
        The same ``df`` object, with ``flagged`` and ``type`` set.
    """
    flags = []
    reasons_per_article = []

    for div_count, has_nested_div in zip(df["div_count"], df["has_nested_div"]):
        reasons = []
        if div_count >= threshold:
            reasons.append("div count exceeds threshold")
        if has_nested_div:
            reasons.append("has nested div")

        flags.append(bool(reasons))
        reasons_per_article.append(", ".join(reasons) if reasons else None)

    # Assign whole columns at once instead of cell-by-cell writes.
    df["flagged"] = flags
    df["type"] = reasons_per_article

    return df

In [None]:
df_flagged = flag_articles(df_extracted, div_count_threshold).sort_values(
    by="div_count", ascending=False
)

In [None]:
display(df_flagged[df_flagged["flagged"]])

In [None]:
flagged_by_category = (
    df_flagged[df_flagged["flagged"]].groupby(["content_category"])["flagged"].count()
)

print(flagged_by_category)

In [None]:
# df_flagged.to_excel("flagged_articles.xlsx")

## Inspecting for Poor Text Extraction


### Using the Word Count of the Largest Paragraph in each Article as a Heuristic


In [None]:
def max_paragraph_size(row):
    """Return the word count of the longest paragraph in an article.

    Paragraphs are the newline-separated pieces of
    ``row["extracted_content_body"]``; words are whitespace-separated tokens.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the
            ``extracted_content_body`` text.

    Returns:
        int: Largest paragraph word count, or 0 when the text is missing
        (``None``, NaN or any other non-string) or empty.
    """
    article = row["extracted_content_body"]

    # Missing text may be None or a float NaN; neither can be split.
    if not isinstance(article, str):
        return 0

    return max((len(paragraph.split()) for paragraph in article.split("\n")), default=0)

In [None]:
df_extracted["max_paragraph_size"] = df_extracted.apply(max_paragraph_size, axis=1)

In [None]:
display(df_extracted)

### Visualising the `Max Paragraph Size` of each article (Log-normal)


In [None]:
fig = px.histogram(df_extracted, x="max_paragraph_size", nbins=100)
fig.update_layout(
    title_text="Paragraph word count distribution",
    xaxis_title_text="Word count",
    yaxis_title_text="Count",
    bargap=0.1,
)

fig.show()

#### Inspecting the tails of the distribution


In [None]:
# Percentiles matching -3, -2, -1, 0, +1, +2, +3 standard deviations of a
# normal distribution (cumulative probabilities, rounded to three decimals).
print(
    df_extracted["max_paragraph_size"].quantile(
        [0.001, 0.023, 0.159, 0.5, 0.841, 0.977, 0.999]
    )
)

### Visualising the Normal Distribution using `log_max_paragraph_size`


In [None]:
# Natural log of the largest-paragraph word count. max_paragraph_size is 0 for
# articles without text, and np.log(0) evaluates to -inf (NumPy emits a
# divide-by-zero RuntimeWarning), so those rows carry -inf in this column.
df_extracted["log_max_paragraph_size"] = np.log(df_extracted["max_paragraph_size"])

#### Setting the Lower & Upper Threshold

In [None]:
# 2.5th percentile of the raw word counts, rounded up to the next multiple of
# 10, then moved onto the log scale used by log_max_paragraph_size.
lower_threshold = np.log(
    10 * math.ceil(df_extracted["max_paragraph_size"].quantile(0.025) / 10)
)
print(lower_threshold)

In [None]:
# 97.5th percentile of the raw word counts, rounded down to a multiple of 10,
# then moved onto the log scale used by log_max_paragraph_size.
upper_threshold = np.log(
    10 * math.floor(df_extracted["max_paragraph_size"].quantile(0.975) / 10)
)
print(upper_threshold)

In [None]:
fig = px.histogram(df_extracted, x="log_max_paragraph_size", nbins=100)
fig.update_layout(
    title_text=f"log(word_count) distribution for Paragraph: {upper_threshold}",
    xaxis_title_text="log(word_count)",
    yaxis_title_text="Count",
    bargap=0.1,
)
fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Percentiles at -3, -2, -1, 0, +1, +2, +3 standard deviations from the mean
# of a normal distribution (cumulative probabilities, rounded to three
# decimals), so the lower and upper tails are symmetric.
print(
    df_extracted["log_max_paragraph_size"].quantile(
        [0.001, 0.023, 0.159, 0.5, 0.841, 0.977, 0.999]
    )
)

#### Adding the `percentile` rank for each record


In [None]:
df_extracted["percentile_rank"] = df_extracted["log_max_paragraph_size"].rank(
    pct=True, ascending=True
)

In [None]:
display(df_extracted)

### Inspecting articles above the upper threshold


In [None]:
# Articles whose largest paragraph is above the upper threshold, longest
# first. sort_values returns a new frame, which avoids sorting a filtered
# slice of df_extracted in place.
df_inspect_higher = df_extracted[
    df_extracted["log_max_paragraph_size"] > upper_threshold
].sort_values(by="max_paragraph_size", ascending=False)

print(df_inspect_higher.shape)

In [None]:
display(df_inspect_higher)

In [None]:
fig = px.histogram(df_inspect_higher, x="max_paragraph_size", nbins=50)
fig.update_layout(
    title_text="Paragraph word count distribution",
    xaxis_title_text="Word count",
    yaxis_title_text="Count",
    bargap=0.1,
)

fig.show()

In [None]:
# Export file to Excel

# df_inspect_higher.to_excel("inspection.xlsx")

#### Remove articles with nested div containers


In [None]:
df_inspect_higher["has_nested_div"].value_counts()

In [None]:
# has_nested_div was computed on df_extracted and is already present on
# df_inspect_higher (the previous cell reads it), so reuse it instead of
# re-parsing every article body.
df_no_nested_divs = df_inspect_higher[~df_inspect_higher["has_nested_div"]]
display(df_no_nested_divs.sort_values(by="max_paragraph_size", ascending=False))

In [None]:
query = df_no_nested_divs
extracted_text = query.iloc[0]["extracted_content_body"]

print(extracted_text)

In [None]:
raw_html = query.iloc[0]["content_body"]

print(raw_html)

### Inspecting articles below the lower threshold


In [None]:
# Articles whose largest paragraph is below the lower threshold, shortest
# first. sort_values returns a new frame, which avoids sorting a filtered
# slice of df_extracted in place.
df_inspect_lower = df_extracted[
    df_extracted["log_max_paragraph_size"] < lower_threshold
].sort_values(by="max_paragraph_size", ascending=True)

print(df_inspect_lower.shape)

In [None]:
display(df_inspect_lower)

In [None]:
extracted_text = df_inspect_lower.iloc[0]["extracted_content_body"]

print(extracted_text)

In [None]:
raw_html = df_inspect_lower.iloc[0]["content_body"]

print(raw_html)