In [1]:
%load_ext kedro.ipython

In [2]:
import re
import unicodedata

from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
class HTMLExtractor:
    """
    A class to extract and process various elements from HTML content
    using BeautifulSoup.

    Attributes:
        soup (BeautifulSoup): A BeautifulSoup object.
    """

    def __init__(self, html_content: str):
        """
        Initializes the HTMLExtractor with the given HTML content.

        Args:
            html_content (str): The HTML content to be processed.
        """
        self.soup = self.preprocess_html(html_content)

    @classmethod
    def clean_text(cls, text: str) -> str:
        """
        Cleans the given text by normalizing Unicode characters,
        handling special symbols, replacing problematic characters,
        and removing multiple whitespace.

        Args:
            text (str): The input text to be cleaned.

        Returns:
            str: The cleaned text.
        """
        # Normalize Unicode characters
        text = unicodedata.normalize("NFKD", text)
        # Use ASCII encoding to handle special symbols e.g. copyright \xa9
        text = text.encode("ascii", "ignore").decode("utf-8")

        # Replace common problematic characters
        text = text.replace("\xa0", " ")  # non-breaking space
        text = text.replace("\u200b", "")  # zero-width space
        text = text.replace("\u2028", "\n")  # line separator
        text = text.replace("\u2029", "\n")  # paragraph separator

        # Replace multiple whitespace with single space
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    @classmethod
    def preprocess_html(cls, html_content: str) -> BeautifulSoup:
        """
        Preprocesses the given HTML content by replacing all <br>
        tags with newline characters.

        Args:
            html_content (str): The HTML content to be preprocessed.

        Returns:
            BeautifulSoup: The preprocessed HTML content as a BeautifulSoup object.
        """
        soup = BeautifulSoup(html_content, "html.parser")

        # Find all <br> tags and replace them with newline
        for br in soup.find_all("br"):
            br.replace_with("\n")

        return soup

    def extract_related_sections(self) -> list[str]:
        """
        Extracts "Related:" sections and "Read these next:" items from the HTML content.

        Returns:
            list[str]: A list of related sections and "Read these next:" items.
        """
        related_sections = []
        read_these_next_ul = None
        # Extract "Related:" sections and "Read these next:" items
        for tag in self.soup.find_all(["p", "ul"]):
            if tag.name == "p" and tag.find("strong"):
                if "Related:" in tag.text:
                    related_sections.append(
                        re.sub(r"Related: ", "", self.clean_text(tag.text))
                    )
                elif "Read these next:" in tag.text:
                    read_these_next_ul = tag.find_next_sibling("ul")
            elif tag == read_these_next_ul:
                for li in tag.find_all("li"):
                    related_sections.append(self.clean_text(li.text))

        return related_sections

    def extract_text(self) -> str:
        """
        Extracts the main content from the HTML content.

        Returns:
            str: The main content extracted from the HTML content.

        Note:
            This function unwraps the HTML content if it is contained in a <div>. It then extracts the
            main content by iterating over the tags in the soup. The following tags are considered:

                - h1, h2, h3, h4, h5, h6: These tags are treated as key headers and are paragraphed between them.
                - p: This tag is treated as a paragraph. <em> tags are removed from the text.
                    * If the text does not contain sentences about HealthHub app, Google Play, or Apple Store,
                    and it contains a strong tag, it is treated differently based on the text content.
                - ul: This tag is treated as an unordered list. If it is the child of a <div>, it is treated as a list.
                - ol: This tag is treated as an ordered list.
                - div: This tag is treated as a text within a div.

            The extracted content is stored in a list and then processed. Double newlines are replaced with single
            newlines and whitespace is stripped. If the processed text is empty, the function attempts to extract the
            content from the <div> tags.
        """
        # Unwrap if the HTML content is contained in a div
        if self.soup.div is not None:
            self.soup.div.unwrap()

        # Extract the main content
        content = []
        for tag in self.soup.find_all(
            ["h1", "h2", "h3", "h4", "h5", "h6", "div", "p", "ul", "ol"]
        ):
            if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
                # Provide paragraphing between key headers
                content.append("\n")
                content.append(self.clean_text(tag.text))

            elif tag.name == "p":
                # Remove all em tags
                for em in tag.find_all("em"):
                    em.extract()
                # Get the remaining text
                text = tag.get_text()
                # Remove sentences about HealthHub app, Google Play, and Apple Store
                if not re.search(
                    r"(HealthHub app|Google Play|Apple Store|Parent Hub)", text
                ):
                    if tag.find("strong"):
                        if "Related:" in tag.text:
                            text = self.clean_text(tag.text)
                            content.append(re.sub(r"\n", " ", text))
                        elif "Read these next:" in tag.text:
                            content.append(self.clean_text(tag.text))
                    else:
                        content.append(self.clean_text(text))
            # For unordered lists
            elif (
                tag.name == "ul" and tag.parent.name == "div"
            ):  # not "ul" so we avoid duplicates
                for li in tag.find_all("li"):
                    content.append("- " + self.clean_text(li.text))
            # For ordered lists
            elif tag.name == "ol":
                for i, li in enumerate(tag.find_all("li")):
                    content.append(f"{i + 1}. " + self.clean_text(li.text))
            # For texts within div
            elif tag.name == "div":
                content.append(self.clean_text(tag.text))

            content.append("")  # Add a blank line after each element

        # Remove empty strings from content
        content = [c for c in content if c]

        # Replace double newlines with single newlines and strip whitespace
        processed_text = "\n".join(content).replace("\n\n", "\n").strip()

        # # Edge case - HTML content contained in div tags
        # if processed_text.strip() == "":
        #     content = []
        #     # Unwrap if the HTML content is contained in a div
        #     if self.soup.div is not None:
        #         self.soup.div.unwrap()
        #         # For texts within div
        #         for tag in self.soup.find_all("div"):
        #             if tag.name == "div":
        #                 content.append(self.clean_text(tag.text))

        #         # Replace double newlines with single newlines and strip whitespace
        #         processed_text = "\n".join(content).replace("\n\n", "\n").strip()

        return processed_text

    def extract_links(self) -> list[tuple[str, str]]:
        """
        Extracts the title and URL from all the anchor tags in the HTML content.

        Returns:
            list[tuple[str, str]]:
                A list of tuples containing the title and URL of each anchor tag.

        Note:
            Footnotes to references sections are ignored.
        """
        url_records = []

        # Extract title/text and links from anchor tags
        for link in self.soup.find_all("a"):
            url = link.get("href")
            # Ignore footnotes
            if url != "#footnotes":
                text = link.get("title") or link.get_text()
                record = text, url
                url_records.append(record)

        return url_records

    def extract_headers(self) -> list[tuple[str, str]]:
        """
        Extracts the headers from the HTML content.

        Returns:
            list[tuple[str, str]]:
                A list of tuples containing the text and tag name of
                each header found in the HTML content.

        Note:
            References are ignored.
        """
        headers = []

        for title in self.soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
            tag = title.name
            text = title.get_text()
            # Ignore References
            if text != "References":
                record = text, tag
                headers.append(record)

        return headers

In [4]:
# ruff: noqa: F821
catalog.list()


[1m[[0m
    [32m'all_contents'[0m,
    [32m'all_contents_standardized'[0m,
    [32m'all_contents_extracted'[0m,
    [32m'all_extracted_text'[0m,
    [32m'parameters'[0m,
    [32m'params:columns_to_keep'[0m,
    [32m'params:columns_to_keep.cost-and-financing'[0m,
    [32m'params:columns_to_keep.diseases-and-conditions'[0m,
    [32m'params:columns_to_keep.health-statistics'[0m,
    [32m'params:columns_to_keep.live-healthy-articles'[0m,
    [32m'params:columns_to_keep.medical-care-and-facilities'[0m,
    [32m'params:columns_to_keep.medications'[0m,
    [32m'params:columns_to_keep.program-sub-pages'[0m,
    [32m'params:columns_to_keep.programs'[0m,
    [32m'params:columns_to_keep.support-group-and-others'[0m,
    [32m'params:columns_to_add'[0m,
    [32m'params:columns_to_add.health-statistics'[0m,
    [32m'params:columns_to_add.medications'[0m,
    [32m'params:columns_to_add.program-sub-pages'[0m,
    [32m'params:columns_to_add.programs'[0m,
    

In [5]:
# ruff: noqa: F821
all_contents_standardized = catalog.load("all_contents_standardized")
all_contents_standardized


[1m{[0m
    [32m'cost-and-financing'[0m: [1m<[0m[1;95mbound[0m[39m method AbstractVersionedDataset.load of <kedro_datasets.pandas.parquet_dataset.ParquetDataset object at [0m[1;36m0x1581828a0[0m[39m>>,[0m
[39m    [0m[32m'diseases-and-conditions'[0m[39m: <bound method AbstractVersionedDataset.load of <kedro_datasets.pandas.parquet_dataset.ParquetDataset object at [0m[1;36m0x158424320[0m[39m>>,[0m
[39m    [0m[32m'health-statistics'[0m[39m: <bound method AbstractVersionedDataset.load of <kedro_datasets.pandas.parquet_dataset.ParquetDataset object at [0m[1;36m0x158425640[0m[39m>>,[0m
[39m    [0m[32m'live-healthy-articles'[0m[39m: <bound method AbstractVersionedDataset.load of <kedro_datasets.pandas.parquet_dataset.ParquetDataset object at [0m[1;36m0x158425a90[0m[39m>>,[0m
[39m    [0m[32m'medical-care-and-facilities'[0m[39m: <bound method AbstractVersionedDataset.load of <kedro_datasets.pandas.parquet_dataset.ParquetDataset object at [0m[1

In [6]:
content_category = "live-healthy-articles"

df = all_contents_standardized[content_category]()
df

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,last_two_months_view,page_views,engagement_rate,bounce_rate,exit_rate,scroll_percentage,percentage_total_views,cumulative_percentage_total_views,content_category,to_remove
0,1444475,"Weight, BMI and Health Problems","Weight, BMI and Health Problems","Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/weight_p...,www.healthhub.sg/live-healthy/weight_putting_m...,weight_putting_me_at_risk_of_health_problems,What’s your Body Mass Index (BMI)? Learn how t...,"<div class=""ExternalClassE93BEC3784C545A286BB8...",...,132.0,19977,0.690791,0.309209,0.617163,0.362504,0.016884,0.016884,live-healthy-articles,False
1,1445137,7-month-baby Diet: An Authoritative Guide by O...,7-month-baby Diet: An Authoritative Guide by O...,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/meal-ide...,www.healthhub.sg/live-healthy/meal-ideas-month-7,meal-ideas-month-7,Your little one is now 7 months of age. Should...,"<div class=""ExternalClass46E64333542C4D8CBEA23...",...,0.0,18876,0.688392,0.311608,0.894948,0.383635,0.015953,0.032837,live-healthy-articles,False
2,1445282,Older Adults Need More Protein,Older Adults Need More Protein,,https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/seniors-...,www.healthhub.sg/live-healthy/seniors-need-mor...,seniors-need-more-protein,Did you know that one in two older adults do n...,"<div class=""ExternalClass6C0CAB7C67934853B19AD...",...,0.0,17431,0.609422,0.390578,0.892413,0.326444,0.014732,0.047569,live-healthy-articles,False
3,1445982,Sexual positions and timing of conception,Sexual positions and timing of conception,,https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/pregnanc...,www.healthhub.sg/live-healthy/pregnancy-sexual...,pregnancy-sexual-positions-and-timing-of-conce...,Is there a best sexual position to adopt for c...,"<div class=""ExternalClass5742A264309B479E84E97...",...,47.0,16309,0.659301,0.340699,0.925131,0.361488,0.013784,0.061353,live-healthy-articles,False
4,1443111,How Much Calories Do I Need A Day?,How Much Calories Do I Need A Day?,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/how-much...,www.healthhub.sg/live-healthy/how-much-to-eat-...,how-much-to-eat-at-each-meal,If you're not already aware of your daily calo...,"<div class=""ExternalClass2A6938FBCD87406FAF535...",...,4338.0,12748,0.732515,0.267485,0.774573,0.410123,0.010774,0.072127,live-healthy-articles,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150,1445661,Kashmiri Pulao,Kashmiri Pulao,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/kashmiri...,www.healthhub.sg/live-healthy/kashmiri-pulao,kashmiri-pulao,A healthy brown rice dish with raisins seasone...,"<div class=""ExternalClassDAFE89B7DB53403DBC470...",...,88.0,0,0.000000,1.000000,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,False
1151,1468602,1test-1,Test Entry-1-2,,,https://www.healthhub.sg/live-healthy/1test-1,www.healthhub.sg/live-healthy/1test-1,1test-1,Et malesuada fames ac turpis egestas. v1.2.1 T...,"Lorem ipsum dolor sit amet, consectetur adipis...",...,,0,0.000000,1.000000,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,True
1152,1445673,Healthy Food for Kids and Teens,Healthy Food for Kids and Teens,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/A Health...,www.healthhub.sg/live-healthy/A Healthy Food F...,A Healthy Food Foundation - for Kids and Teens,Good nutrition for kids plays an important rol...,"<div class=""ExternalClass7D45F77CD3864AB0BEEA7...",...,1511.0,0,0.000000,1.000000,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,False
1153,1445358,How to Study Difficult Subjects,How to Study Difficult Subjects,"Mind and Balance,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/how_to_s...,www.healthhub.sg/live-healthy/how_to_study_dif...,how_to_study_difficult_subjects,Your brain is a muscle. You can train it too!,"<div class=""ExternalClass7A329CBEFD0645DE957E4...",...,71.0,0,0.000000,1.000000,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,False


### 1. Extract content from HTML


In [7]:
# Initialise new columns in dataframe to store extracted data
df["related_sections"] = None
df["extracted_content_body"] = None
df["extracted_links"] = None
df["extracted_headers"] = None


pbar = tqdm(df.iterrows())

for index, row in pbar:
    # Skip extraction for those articles flagged for removal
    if row["to_remove"]:
        continue

    # Replace all forward slashes with hyphens to avoid saving as folders
    title = re.sub(r"\/", "-", row["title"]).strip()

    # Get the HTML content
    html_content = row["content_body"]

    # Extract text from HTML using the HTMLExtractor Class
    extractor = HTMLExtractor(html_content)
    related_sections = extractor.extract_related_sections()
    extracted_content_body = extractor.extract_text()
    extracted_links = extractor.extract_links()
    extracted_headers = extractor.extract_headers()

    # Store extracted data into the dataframe
    df.at[index, "related_sections"] = related_sections
    df.at[index, "extracted_content_body"] = extracted_content_body
    df.at[index, "extracted_links"] = extracted_links
    df.at[index, "extracted_headers"] = extracted_headers

    # If `extracted_content_body` is empty, we update flag to remove
    if extracted_content_body == "":
        df.at[index, "to_remove"] = True

Extracting HTML content |████████████████████████████████████████| 1155/1155 [10


In [8]:
df

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,exit_rate,scroll_percentage,percentage_total_views,cumulative_percentage_total_views,content_category,to_remove,related_sections,extracted_content_body,extracted_links,extracted_headers
0,1444475,"Weight, BMI and Health Problems","Weight, BMI and Health Problems","Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/weight_p...,www.healthhub.sg/live-healthy/weight_putting_m...,weight_putting_me_at_risk_of_health_problems,What’s your Body Mass Index (BMI)? Learn how t...,"<div class=""ExternalClassE93BEC3784C545A286BB8...",...,0.617163,0.362504,0.016884,0.016884,live-healthy-articles,False,"[BMI Calculator, What is a Healthy Weight?, An...",What's a Healthy Body Mass Index?\nWe have all...,"[(BMI Calculator, https://www.healthhub.sg/pro...","[(What's a Healthy Body Mass Index?, h2), (Why..."
1,1445137,7-month-baby Diet: An Authoritative Guide by O...,7-month-baby Diet: An Authoritative Guide by O...,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/meal-ide...,www.healthhub.sg/live-healthy/meal-ideas-month-7,meal-ideas-month-7,Your little one is now 7 months of age. Should...,"<div class=""ExternalClass46E64333542C4D8CBEA23...",...,0.894948,0.383635,0.015953,0.032837,live-healthy-articles,False,"[Nutrition for Your Toddler, No Wholegrain, No...",By Health Promotion Board in collaboration wit...,"[(Nutrition for Your Toddler, https://www.heal...","[(Recommended Number of Servings (7 months), h..."
2,1445282,Older Adults Need More Protein,Older Adults Need More Protein,,https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/seniors-...,www.healthhub.sg/live-healthy/seniors-need-mor...,seniors-need-more-protein,Did you know that one in two older adults do n...,"<div class=""ExternalClass6C0CAB7C67934853B19AD...",...,0.892413,0.326444,0.014732,0.047569,live-healthy-articles,False,[],"As you age, your body requires roughly 50% mor...","[(processed meats could increase your risk, ht...","[(How Much Protein do you Need?, h2), (Key Sou..."
3,1445982,Sexual positions and timing of conception,Sexual positions and timing of conception,,https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/pregnanc...,www.healthhub.sg/live-healthy/pregnancy-sexual...,pregnancy-sexual-positions-and-timing-of-conce...,Is there a best sexual position to adopt for c...,"<div class=""ExternalClass5742A264309B479E84E97...",...,0.925131,0.361488,0.013784,0.061353,live-healthy-articles,False,[],Sexual relation is an integral aspect of any h...,"[(Visit Parent Hub, for more useful tips and g...","[(Missionary position , h3), (​Rear entry, h3)..."
4,1443111,How Much Calories Do I Need A Day?,How Much Calories Do I Need A Day?,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/how-much...,www.healthhub.sg/live-healthy/how-much-to-eat-...,how-much-to-eat-at-each-meal,If you're not already aware of your daily calo...,"<div class=""ExternalClass2A6938FBCD87406FAF535...",...,0.774573,0.410123,0.010774,0.072127,live-healthy-articles,False,"[Cut 100 Calories From Your Breakfast, Lunch a...",Figuring out Your Daily Calorie Intake Isnt Ro...,"[(, https://www.healthhub.sg/programmes/health...",[(Figuring out Your Daily Calorie Intake Isn’t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150,1445661,Kashmiri Pulao,Kashmiri Pulao,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/kashmiri...,www.healthhub.sg/live-healthy/kashmiri-pulao,kashmiri-pulao,A healthy brown rice dish with raisins seasone...,"<div class=""ExternalClassDAFE89B7DB53403DBC470...",...,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,False,[],A fragrant brown rice dish with raisins\n\n\nI...,[],"[(​A fragrant brown rice dish with raisins​, h..."
1151,1468602,1test-1,Test Entry-1-2,,,https://www.healthhub.sg/live-healthy/1test-1,www.healthhub.sg/live-healthy/1test-1,1test-1,Et malesuada fames ac turpis egestas. v1.2.1 T...,"Lorem ipsum dolor sit amet, consectetur adipis...",...,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,True,,,,
1152,1445673,Healthy Food for Kids and Teens,Healthy Food for Kids and Teens,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/A Health...,www.healthhub.sg/live-healthy/A Healthy Food F...,A Healthy Food Foundation - for Kids and Teens,Good nutrition for kids plays an important rol...,"<div class=""ExternalClass7D45F77CD3864AB0BEEA7...",...,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,False,[Guilt-Free Treats to Meet Your Child's Nutrit...,Meeting nutritional needs of kids and teenager...,"[(My Healthy Plate, http://www.healthhub.sg/pr...",[(Meeting nutritional needs of kids and teenag...
1153,1445358,How to Study Difficult Subjects,How to Study Difficult Subjects,"Mind and Balance,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/how_to_s...,www.healthhub.sg/live-healthy/how_to_study_dif...,how_to_study_difficult_subjects,Your brain is a muscle. You can train it too!,"<div class=""ExternalClass7A329CBEFD0645DE957E4...",...,0.000000,0.250000,0.000000,1.000000,live-healthy-articles,False,[],So you're pretty happy with the way you've bee...,"[(Visit Parent Hub, for more useful tips and g...","[(Focus on the big picture, h2), (Find what wo..."


In [9]:
df["to_remove"].value_counts()


to_remove
[3;91mFalse[0m    [1;36m1148[0m
[3;92mTrue[0m        [1;36m7[0m
Name: count, dtype: int64

In [10]:
random_state = 42

tmp = df.sample(5, random_state=random_state).reset_index(drop=True)
tmp

Unnamed: 0,id,content_name,title,article_category_names,cover_image_url,full_url,full_url2,friendly_url,category_description,content_body,...,exit_rate,scroll_percentage,percentage_total_views,cumulative_percentage_total_views,content_category,to_remove,related_sections,extracted_content_body,extracted_links,extracted_headers
0,1445429,Know Your Alcohol Limit: Don’t Be a Party Pooper!,Know Your Alcohol Limit: Don’t Be a Party Pooper!,"Sexual Health and Relationships,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/dont_be_...,www.healthhub.sg/live-healthy/dont_be_a_party_...,dont_be_a_party_pooper,Everyone enjoys a night of partying. But what ...,"<div class=""ExternalClassFF709E64E1BD479883D08...",...,0.166023,0.266741,0.000189,0.986617,live-healthy-articles,False,[],The Negative Effects of Alcohol on Friends\nWe...,[],"[(The Negative Effects of Alcohol on Friends, ..."
1,1442458,Help Your Child Untangle From The Web,Help Your Child Untangle From The Web,"Mind and Balance,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/help-you...,www.healthhub.sg/live-healthy/help-your-child-...,help-your-child-untangle-from-the-web,Does your child spend long hours in front of t...,"<div class=""ExternalClassF5D63B3B546A42419922B...",...,0.318612,0.3375,0.000963,0.677177,live-healthy-articles,False,"[Screen Time, Disconnect To Reconnect Why A So...",Singapore teens are among the most Internet-sa...,"[(Screen Time, http://www.babybonus.msf.gov.sg...","[(1. Practise What You Preach, h2), (2. Have C..."
2,1444585,Makan Matters: What‘s a Balanced Diet?,Makan Matters: What‘s a Balanced Diet?,"Food and Nutrition,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/makan-ma...,www.healthhub.sg/live-healthy/makan-matters-wh...,makan-matters-whats-a-healthy-diet,Trying to eat better? Find out what makes a he...,"<div class=""ExternalClassB17BB24F34014C7AAEC3F...",...,0.701918,0.368735,0.002057,0.462499,live-healthy-articles,False,[A Guys Guide to Healthy Eating and Looking Go...,What Makes a Healthy Diet?\nWhen we hear the w...,"[(A Guide to Carbs, /programmes/a-guide-to-car...","[(What Makes a Healthy Diet?, h2), (Find Our B..."
3,1443659,Preparing for Pregnancy: 3 Things to Do Now Th...,Preparing for Pregnancy: 3 Things to Do Now Th...,"Body Care,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/3-things...,www.healthhub.sg/live-healthy/3-things-to-do-n...,3-things-to-do-now-that-youre-pregnant,You've stopped eating raw food and you've also...,"<div class=""ExternalClassA116BA2F83BB43CE9D25E...",...,0.706122,0.346641,0.000868,0.707952,live-healthy-articles,False,[Before The First Antenatal Visit (Choosing Yo...,You've probably started to stay away from raw ...,[(\nBefore The First Antenatal Visit (Choosing...,"[(1. Decide on Your Doctor\n, h2), (2. Choose ..."
4,1444177,Time to Exercise: Aeroplane (in Four-Point Kne...,Time to Exercise: Aeroplane (in Four-Point Kne...,"Exercise & Fitness,",https://ch-api.healthhub.sg/api/public/content...,https://www.healthhub.sg/live-healthy/time-to-...,www.healthhub.sg/live-healthy/time-to-exercise...,time-to-exercise-aeroplane-in-four-point-kneeling,"Hey there mum-to-be, let's do some exercise! G...","<div class=""ExternalClass5A08DDBEC9704AB088788...",...,0.298611,0.268051,0.000234,0.961391,live-healthy-articles,False,"[Mother's Guide to Working Out While Pregnant,...",Exercising during pregnancy comes with many be...,"[(Visit Parent Hub, for more useful tips and g...",[]


In [11]:
index = 1

tmp.iloc[index]["full_url"]

[32m'https://www.healthhub.sg/live-healthy/help-your-child-untangle-from-the-web'[0m

In [12]:
tmp.iloc[index]["related_sections"]


[1m[[0m
    [32m'Screen Time'[0m,
    [32m'Disconnect To Reconnect Why A Social Media Detox Might Be Good For You'[0m,
    [32m'9 Health Hazards of Electronic Devices for Kids'[0m,
    [32m'Dealing with Cyber-Bullies'[0m,
    [32m'It Is Best Your Child Hears It From You'[0m,
    [32m'Dealing With Cyber Bullies'[0m,
    [32m'Parent Hub'[0m,
    [32m'Happy and Lasting Relationships with Children'[0m,
    [32m'Stuck in the Web'[0m
[1m][0m

In [13]:
tmp.iloc[index]["extracted_content_body"]

[32m'Singapore teens are among the most Internet-savvy in the world. Statistics from the Infocomm Development Authority of Singapore show that 99% of teenagers used the Internet in 2014. Their favourite mobile activities are social networking on platforms like Instagram, instant messaging via apps like Snapchat, and sending and receiving emails.\nOnline activities are enjoyable, so it is easy to spend long hours on the computer. It is normal for your child to want to spend more time on activities that fascinate them, and these interests are often good outlets for learning, creativity and self-expression.\nBut when any activity becomes the major focus of their life to the point where it becomes harmful physically, mentally or socially your child may have a cyber addiction problem. A 2010 study[0m[32m[[0m[32m1[0m[32m][0m[32m led by the National Institute of Education and the Media Development Authority found that Singaporean youth spend more time than American adolescents on vid

In [14]:
tmp.iloc[index]["extracted_links"]


[1m[[0m
    [1m([0m
        [32m'Screen Time'[0m,
        [32m'http://www.babybonus.msf.gov.sg/parentingresources/web/Young-Children/YoungChildrenPlay_and_Learning/Screen_Time/Young_Children_Screen_Time'[0m
    [1m)[0m,
    [1m([0m[32m'these ideas'[0m, [32m'https://www.healthhub.sg/live-healthy/ideas-for-an-active-weekend'[0m[1m)[0m,
    [1m([0m
        [32m'Disconnect To Reconnect — Why A Social Media Detox Might Be Good For You'[0m,
        [32m'https://www.healthhub.sg/live-healthy/disconnect-to-reconnect-why-a-social-media-detox-might-be-good-for-you'[0m
    [1m)[0m,
    [1m([0m
        [32m'9 Health Hazards of Electronic Devices for Kids'[0m,
        [32m'https://www.healthhub.sg/live-healthy/9-health-hazards-of-electronic-devices-for-kids'[0m
    [1m)[0m,
    [1m([0m
        [32m'Dealing with Cyber-Bullies'[0m,
        [32m'https://www.healthhub.sg/live-healthy/dealing_with_cyber-bullies'[0m
    [1m)[0m,
    [1m([0m
        [32m'It Is