In [1]:
%load_ext kedro.ipython

In [2]:
import pandas as pd

In [3]:
def flag_duplicated(
    df: pd.DataFrame, whitelist: list[int], column: str
) -> pd.DataFrame:
    """
    Flag every row whose `column` value occurs more than once in `df`.

    All members of a duplicate group are flagged (the first occurrence
    included), except rows that:

    * hold a null value in `column` (and, for `extracted_content_body` only,
      an empty string),
    * already have a truthy `to_remove` (their existing `remove_type` is left
      untouched), or
    * have an `id` listed in `whitelist`.

    The frame is modified **in place** and also returned; pass a copy if the
    caller's frame must stay unchanged. `df` is expected to carry the columns
    `id`, `to_remove` and `remove_type` in addition to `column`.

    Args:
        df (pd.DataFrame): The DataFrame to flag duplicated rows in.
        whitelist (list[int]): The list of article IDs to keep. See https://bitly.cx/IlwNV.
            `None` is treated like an empty list.
        column (str):
            The column to check for duplicated values. Must be either
            `extracted_content_body` or `full_url`.

    Returns:
        pd.DataFrame:
            The same DataFrame object, with `to_remove` set to True on newly
            flagged rows and their `remove_type` set to "Duplicated Content"
            (for `extracted_content_body`) or "Duplicated URL" (for `full_url`).

    Raises:
        AssertionError: If the `column` parameter is None or not valid.
    """
    assert column is not None, "`column` cannot be None"
    assert column in ["extracted_content_body", "full_url"], "Invalid column"

    values = df[column]

    # keep=False marks every member of a duplicate group, not only the repeats,
    # which is what we need since the first occurrence is flagged as well.
    is_duplicate = values.duplicated(keep=False) & values.notna()

    if column == "extracted_content_body":
        # Articles with empty extracted content are not treated as duplicates
        is_duplicate = is_duplicate & (values != "")
        value = "Duplicated Content"
    else:
        value = "Duplicated URL"

    # Nothing duplicated: leave the frame alone (and do not touch other columns)
    if not is_duplicate.any():
        return df

    # Rows flagged earlier keep their original `remove_type`
    already_flagged = df["to_remove"].astype(bool)
    whitelisted = df["id"].isin(list(whitelist) if whitelist is not None else [])

    to_flag = is_duplicate & ~already_flagged & ~whitelisted

    # Label-aligned boolean mask, so this works for any index, not only 0..n-1
    df.loc[to_flag, "to_remove"] = True
    df.loc[to_flag, "remove_type"] = value

    return df

In [4]:
# ruff: noqa: F821
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the `kedro.ipython` extension loaded in the first cell - confirm.
merged_data = catalog.load("merged_data")

# Narrow the frame down to the columns inspected in the duplicate checks below
df_keep = merged_data.loc[
    :,
    [
        # identifiers and text
        "id",
        "content_name",
        "title",
        "full_url",
        "content_body",
        "extracted_content_body",
        # metadata
        "pr_name",
        "date_modified",
        "content_category",
        "page_views",
        # removal flags
        "to_remove",
        "remove_type",
    ],
]

display(df_keep)

Unnamed: 0,id,content_name,title,full_url,content_body,extracted_content_body,pr_name,date_modified,content_category,page_views,to_remove,remove_type
0,1435040,Breast Screening Subsidies in Singapore,Breast Screening Subsidies in Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,"b'<div class=""ExternalClass07C58E0D957B4AA7B14...",Breast cancer is the number one cancer among w...,Health Promotion Board,2023-08-08T05:27:42.0000000Z,cost-and-financing,10855,False,
1,1435071,Marriage and Parenthood Schemes,Marriage and Parenthood Schemes,https://www.healthhub.sg/a-z/costs-and-financi...,"b'<div class=""ExternalClassE1D82270F17241E4955...",MediSave Maternity Package\nWith the MediSave ...,Ministry of Health,2021-11-02T05:46:52.0000000Z,cost-and-financing,5581,False,
2,1434993,MediSave,MediSave,https://www.healthhub.sg/a-z/costs-and-financi...,"b'<div class=""ExternalClass67AD25F1F8B64B349E5...","What is MediSave?\nMediSave, introduced in Apr...",Ministry of Health,2019-01-29T05:06:22.0000000Z,cost-and-financing,3205,False,
3,1435031,Hospital Bills Financial Assistance in Singapore,Hospital Bills Financial Assistance in Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,"b'<div class=""ExternalClassE335708125E743FDAA3...",Patients or family members who have difficulty...,Khoo Teck Puat Hospital,2019-09-13T03:08:41.0000000Z,cost-and-financing,3077,False,
4,1435043,Community Health Assist Scheme (CHAS) Singapore,Community Health Assist Scheme (CHAS) Singapore,https://www.healthhub.sg/a-z/costs-and-financi...,b'<h2>What is the Community Health Assist Sche...,What is the Community Health Assist Scheme (CH...,CHAS,2022-09-30T10:44:05.0000000Z,cost-and-financing,3026,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
2608,1440763,Heart Failure Transitional Care Programme,Heart Failure Transitional Care Programme,https://www.healthhub.sg/a-z/support-groups-an...,"b'<div class=""ExternalClassFC126593610D4F0587A...",Heart failure is the leading cause of rehospit...,"National University Heart Centre, Singapore",2021-12-21T03:07:24.0000000Z,support-group-and-others,302,False,
2609,1440791,Brain and Head Injury Support Groups,Brain and Head Injury Support Groups,https://www.healthhub.sg/a-z/support-groups-an...,"b'<div class=""ExternalClass7C92735B78174928B28...",Brain Tumour Society (Singapore)\nThe Brain Tu...,National Neuroscience Institute,2019-09-13T02:35:52.0000000Z,support-group-and-others,291,False,
2610,1440768,Ambulatory Nutrition Support,Ambulatory Nutrition Support,https://www.healthhub.sg/a-z/support-groups-an...,"b'<div class=""ExternalClass3FABAC9D59A64BCAB96...",The Importance of Ambulatory Nutrition Support...,National University Hospital,2021-05-25T02:27:00.0000000Z,support-group-and-others,258,False,
2611,1440766,LapBandits Support Group (Singapore),LapBandits Support Group (Singapore),https://www.healthhub.sg/a-z/support-groups-an...,"b'<div class=""ExternalClassA4C749C7DB7647FBB6D...",About Khoo Teck Puat Hospitals LapBandits Supp...,Khoo Teck Puat Hospital,2020-11-02T04:38:01.0000000Z,support-group-and-others,247,True,Below Word Count


In [5]:
# Rows already flagged as duplicates (by content or by URL) in the loaded data,
# ordered so that rows sharing a URL / body end up next to each other
df_flagged_all = df_keep[
    df_keep["remove_type"].isin(["Duplicated Content", "Duplicated URL"])
].sort_values(by=["full_url", "extracted_content_body"])

display(df_flagged_all)  # ruff: noqa: F821

Unnamed: 0,id,content_name,title,full_url,content_body,extracted_content_body,pr_name,date_modified,content_category,page_views,to_remove,remove_type
1446,1495949,conversations-about-vaping,Parenting Insights: Strategies for Conversatio...,https://www.healthhub.sg/live-healthy/conversa...,"b'<div class=""ExternalClassB1D1BA8198604AF5897...",Synopsis: Learn proactive parenting strategies...,,,live-healthy-articles,29,True,Duplicated Content
1067,1445629,Sliced Fish with Bee Hoon Soup,Sliced Fish with Bee Hoon Soup,https://www.healthhub.sg/live-healthy/fish-bee...,"b'<div class=""ExternalClassF5C1DD3FA7E84963A88...",Mouthwatering sliced fish with bee hoon soup\n...,Health Promotion Board,2022-11-15T08:35:26.0000000Z,live-healthy-articles,378,True,Duplicated URL
1215,1443608,Mee Goreng,Mee Goreng,https://www.healthhub.sg/live-healthy/mee-goreng,"b'<div class=""ExternalClass60865CF1F8FA4603ABA...",By KK Womens and Childrens Hospital and Ms Hen...,KK Women's and Children's Hospital,2021-12-21T08:10:47.0000000Z,live-healthy-articles,276,True,Duplicated URL
1515,1445972,"Eat Well, Mum","Eat Well, Mum",https://www.healthhub.sg/live-healthy/parents-...,"b'<h2><img alt=""PY-Marina-Bay-Selfie_and_entry...",Physical Activity Fun Both in and Out of the S...,Health Promotion Board,2022-11-15T08:46:52.0000000Z,live-healthy-articles,0,True,Duplicated Content
1370,1445829,Recipe : Sayur Lodeh,Recipe : Sayur Lodeh,https://www.healthhub.sg/live-healthy/sayur-lodeh,"b'<div class=""ExternalClass8FB2510AE8C541EEADF...",Looking to bring some wholesome goodness home?...,Health Promotion Board,2022-11-15T08:35:29.0000000Z,live-healthy-articles,187,True,Duplicated URL
...,...,...,...,...,...,...,...,...,...,...,...,...
2242,1434769,Parent Hub: 7-12 Years - Healthy Eating_health...,Parent Hub: 7-12 Years - Healthy Eating,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClass0A5C5A7F37E8484C914...",Wed love to hear from you!,Health Promotion Board,,program-sub-pages,1384,True,Duplicated Content
2232,1434774,Parent Hub: 7-12 Years - Sparkly Teeth and Eye...,Parent Hub: 7-12 Years - Sparkly Teeth and Eyes,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClass679540320FCF4DE5817...",Wed love to hear from you!,Health Promotion Board,,program-sub-pages,1644,True,Duplicated Content
2296,1434773,Parent Hub: 7-12 Years - Healthy at School_sch...,Parent Hub: 7-12 Years - Healthy at School,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClass2770EF056B0A4A02B2B...",Wed love to hear from you!,Health Promotion Board,,program-sub-pages,562,True,Duplicated Content
2332,1434838,Parent Hub: Teens - View All_all-items-adolesc...,Parent Hub: Teens - View All,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassCD0CD6B849DE4FF198A...",Wed love to hear from you!,Health Promotion Board,,program-sub-pages,303,True,Duplicated Content


In [6]:
# Tally how many of the already-flagged rows fall under each duplicate `remove_type`
print(df_flagged_all["remove_type"].value_counts())

remove_type
Duplicated Content    61
Duplicated URL         5
Name: count, dtype: int64


In [7]:
# Load the `whitelist` parameter from the catalog. It is passed as the
# `whitelist` argument of `flag_duplicated` in the cells further down, i.e. the
# article IDs that must not be flagged as duplicates.
whitelist = catalog.load("params:whitelist")  # ruff: noqa: F821

print(whitelist)

[1445216, 1444496, 1446090, 1442907, 1443325, 1445019, 1442928, 1445021, 1444996, 1442952, 1445017, 1445212, 1445958, 1444997, 1445027, 1445024, 1445002, 1444991, 1445000, 1445733, 1445704, 1445707, 1497409, 1469472, 1446081, 1445828, 1445798, 1435335, 1435183, 1434614]


In [8]:
# Load the `blacklist` parameter from the catalog and print it for reference.
# NOTE(review): the printed value looks like a mapping of article ID -> removal
# reason; it is not used by any other cell visible in this notebook - confirm
# whether it is still needed here.
blacklist = catalog.load("params:blacklist")  # ruff: noqa: F821

print(blacklist)

{1443526: 'Table of Contents', 1443534: 'Table of Contents', 1444997: 'Irrelevant Content', 1445002: 'Irrelevant Content', 1444991: 'Irrelevant Content', 1444996: 'Irrelevant Content', 1445000: 'Irrelevant Content'}


In [9]:
# Flag duplicated content WITHOUT a whitelist, to see every candidate.
# The result is bound to `df_duplicates` instead of being written back onto
# `df_keep`: overwriting `df_keep` here made this cell non-idempotent and left
# every duplicate already flagged, so the whitelist run in the next cell could
# no longer spare any article.
df_duplicates = flag_duplicated(df_keep.copy(), [], column="extracted_content_body")
df_duplicated_content = df_duplicates[
    (df_duplicates["remove_type"] == "Duplicated Content")
].sort_values(["title", "date_modified", "extracted_content_body", "page_views"])

display(df_duplicated_content)  # ruff: noqa: F821

Unnamed: 0,id,content_name,title,full_url,content_body,extracted_content_body,pr_name,date_modified,content_category,page_views,to_remove,remove_type
2365,1473807,sleep-test-2,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2367,1473810,sleep-test-2-change-css,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2370,1473813,sleep-test-3-css-change-again,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2371,1473816,sleep-test-dup,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2520,1434652,3 Be's To Beat Diabetes | Diabetes Hub,3 Be's To Beat Diabetes | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-hub,"b'<div class=""ExternalClassFAC31D10071B445C93D...",3 BES TO BEAT DIABETES \n1 Be Aware\nThe more ...,Ministry of Health,,programs,20626,True,Duplicated Content
...,...,...,...,...,...,...,...,...,...,...,...,...
2458,1435245,Persona B_persona-b_Level1,Persona B,https://www.healthhub.sg/programmes/ga-testing...,"b'<div class=""ExternalClassEC86420A61A64D089B7...",a-,Health Promotion Board,,program-sub-pages,0,True,Duplicated Content
2481,1435160,Setting goals | Diabetes Hub_setting-goals_Level1,Setting goals | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,"b'<div class=""ExternalClass1EE8A09B31F6418C97F...",3 BES TO BEAT DIABETES \nOn this page\n1. Over...,Ministry of Health,,program-sub-pages,0,True,Duplicated Content
2423,1434979,Take the first step with your loved ones_healt...,Take the first step with your loved ones,https://www.healthhub.sg/programmes/indian_out...,"b'<div class=""ExternalClass25A0E1C4C944475AA3B...",- Menu\n- Home\n- Healthy Eating\n- Physical A...,Health Promotion Board,,program-sub-pages,0,True,Duplicated Content
2304,1434977,Take the first step with your loved ones_physi...,Take the first step with your loved ones,https://www.healthhub.sg/programmes/howareyoud...,"b'<div class=""ExternalClass37ECDF16831C42DFB97...",- Menu\n- Home\n- Healthy Eating\n- Physical A...,Health Promotion Board,,program-sub-pages,519,True,Duplicated Content


In [10]:
# Same content check as before, this time sparing the whitelisted article IDs
df_duplicates = flag_duplicated(
    df_keep.copy(),
    whitelist,
    column="extracted_content_body",
)

# Keep only the rows labelled as duplicated content, grouped by title for reading
df_duplicated_content = df_duplicates.loc[
    df_duplicates["remove_type"] == "Duplicated Content"
].sort_values(
    by=["title", "date_modified", "extracted_content_body", "page_views"]
)

display(df_duplicated_content)  # ruff: noqa: F821

Unnamed: 0,id,content_name,title,full_url,content_body,extracted_content_body,pr_name,date_modified,content_category,page_views,to_remove,remove_type
2365,1473807,sleep-test-2,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2367,1473810,sleep-test-2-change-css,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2370,1473813,sleep-test-3-css-change-again,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2371,1473816,sleep-test-dup,3 - 6 Years: Good Sleep Captain Sleep,https://www.healthhub.sg/programmes/parent-hub...,"b'<div class=""ExternalClassC9333E76130B4280BE4...",INSERT YOUR CHILDS NAME HERE\nMAX 15 CHAR\n\nP...,,,program-sub-pages,0,True,Duplicated Content
2520,1434652,3 Be's To Beat Diabetes | Diabetes Hub,3 Be's To Beat Diabetes | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-hub,"b'<div class=""ExternalClassFAC31D10071B445C93D...",3 BES TO BEAT DIABETES \n1 Be Aware\nThe more ...,Ministry of Health,,programs,20626,True,Duplicated Content
...,...,...,...,...,...,...,...,...,...,...,...,...
2458,1435245,Persona B_persona-b_Level1,Persona B,https://www.healthhub.sg/programmes/ga-testing...,"b'<div class=""ExternalClassEC86420A61A64D089B7...",a-,Health Promotion Board,,program-sub-pages,0,True,Duplicated Content
2481,1435160,Setting goals | Diabetes Hub_setting-goals_Level1,Setting goals | Diabetes Hub,https://www.healthhub.sg/programmes/diabetes-h...,"b'<div class=""ExternalClass1EE8A09B31F6418C97F...",3 BES TO BEAT DIABETES \nOn this page\n1. Over...,Ministry of Health,,program-sub-pages,0,True,Duplicated Content
2423,1434979,Take the first step with your loved ones_healt...,Take the first step with your loved ones,https://www.healthhub.sg/programmes/indian_out...,"b'<div class=""ExternalClass25A0E1C4C944475AA3B...",- Menu\n- Home\n- Healthy Eating\n- Physical A...,Health Promotion Board,,program-sub-pages,0,True,Duplicated Content
2304,1434977,Take the first step with your loved ones_physi...,Take the first step with your loved ones,https://www.healthhub.sg/programmes/howareyoud...,"b'<div class=""ExternalClass37ECDF16831C42DFB97...",- Menu\n- Home\n- Healthy Eating\n- Physical A...,Health Promotion Board,,program-sub-pages,519,True,Duplicated Content


In [11]:
# URL check WITHOUT a whitelist: every article sharing a URL gets flagged
df_duplicates = flag_duplicated(
    df_keep.copy(),
    [],
    column="full_url",
)

# Keep only the rows labelled as duplicated URLs; sorting puts articles that
# share a URL next to each other, oldest modification first
df_duplicated_url = df_duplicates.loc[
    df_duplicates["remove_type"] == "Duplicated URL"
].sort_values(by=["full_url", "date_modified"])

display(df_duplicated_url)

Unnamed: 0,id,content_name,title,full_url,content_body,extracted_content_body,pr_name,date_modified,content_category,page_views,to_remove,remove_type
1067,1445629,Sliced Fish with Bee Hoon Soup,Sliced Fish with Bee Hoon Soup,https://www.healthhub.sg/live-healthy/fish-bee...,"b'<div class=""ExternalClassF5C1DD3FA7E84963A88...",Mouthwatering sliced fish with bee hoon soup\n...,Health Promotion Board,2022-11-15T08:35:26.0000000Z,live-healthy-articles,378,True,Duplicated URL
1066,1445828,Recipe : Sliced Fish and Bee Hoon Soup,Recipe : Sliced Fish and Bee Hoon Soup,https://www.healthhub.sg/live-healthy/fish-bee...,"b'<div class=""ExternalClassB39C037E02BC47C6A3C...",Looking to bring some wholesome goodness home?...,Health Promotion Board,2022-11-15T08:35:31.0000000Z,live-healthy-articles,378,True,Duplicated URL
620,1446081,Outdoor Activities for Kids,Outdoor Activities for Kids,https://www.healthhub.sg/live-healthy/ideas-fo...,"b'<div class=""ExternalClass196D7C5AC7594C8E8BC...",The lack of outdoor activity among children co...,Health Promotion Board,2022-11-15T08:51:15.0000000Z,live-healthy-articles,1179,True,Duplicated URL
621,1444496,Weekend Activities: 5 Ideas for Families,Outdoor Activities for Your Children,https://www.healthhub.sg/live-healthy/ideas-fo...,"b'<div class=""ExternalClass196D7C5AC7594C8E8BC...",The lack of insufficient outdoor activity amon...,Health Promotion Board,2023-03-23T08:09:13.0000000Z,live-healthy-articles,1179,True,Duplicated URL
1215,1443608,Mee Goreng,Mee Goreng,https://www.healthhub.sg/live-healthy/mee-goreng,"b'<div class=""ExternalClass60865CF1F8FA4603ABA...",By KK Womens and Childrens Hospital and Ms Hen...,KK Women's and Children's Hospital,2021-12-21T08:10:47.0000000Z,live-healthy-articles,276,True,Duplicated URL
1212,1445798,Recipe : Mee Goreng,Recipe : Mee Goreng,https://www.healthhub.sg/live-healthy/mee-goreng,"b'<div class=""ExternalClass875BB3A3A2F84BE4A49...",Looking to bring some wholesome goodness home?...,Health Promotion Board,2022-11-15T08:35:25.0000000Z,live-healthy-articles,276,True,Duplicated URL
1370,1445829,Recipe : Sayur Lodeh,Recipe : Sayur Lodeh,https://www.healthhub.sg/live-healthy/sayur-lodeh,"b'<div class=""ExternalClass8FB2510AE8C541EEADF...",Looking to bring some wholesome goodness home?...,Health Promotion Board,2022-11-15T08:35:29.0000000Z,live-healthy-articles,187,True,Duplicated URL
1369,1444751,Cooking with Chef Mel Dean... Sayur Lodeh,Cooking with Chef Mel Dean... Sayur Lodeh,https://www.healthhub.sg/live-healthy/sayur-lodeh,"b'<div class=""ExternalClassF0D563DE92F04D3C9F6...","Sayur Lodeh is everyone's favourite dish, espe...",Health Promotion Board,2022-11-15T08:35:36.0000000Z,live-healthy-articles,187,True,Duplicated URL
657,1444417,The A-Z Guide to Healthy Lifestyle Activities,The A-Z Guide to Healthy Lifestyle Activities,https://www.healthhub.sg/live-healthy/the-a-z-...,"b'<div class=""ExternalClassBA1FDDD033444100B98...","Physical activity is key to staying healthy, b...",Health Promotion Board,2022-11-15T08:41:41.0000000Z,live-healthy-articles,1047,True,Duplicated URL
656,1446090,The A-Z Guide To A Healthy And Active Lifestyle,The A-Z Guide To A Healthy And Active Lifestyle,https://www.healthhub.sg/live-healthy/the-a-z-...,"b'<div class=""ExternalClassBA1FDDD033444100B98...","Physical activity is key to staying healthy, b...",Health Promotion Board,2023-04-04T03:01:38.0000000Z,live-healthy-articles,1047,True,Duplicated URL


In [12]:
# URL check again, this time sparing the whitelisted article IDs
df_duplicates = flag_duplicated(
    df_keep.copy(),
    whitelist,
    column="full_url",
)

# Keep only the rows labelled as duplicated URLs; sorting puts articles that
# share a URL next to each other, oldest modification first
df_duplicated_url = df_duplicates.loc[
    df_duplicates["remove_type"] == "Duplicated URL"
].sort_values(by=["full_url", "date_modified"])

display(df_duplicated_url)  # ruff: noqa: F821

Unnamed: 0,id,content_name,title,full_url,content_body,extracted_content_body,pr_name,date_modified,content_category,page_views,to_remove,remove_type
1067,1445629,Sliced Fish with Bee Hoon Soup,Sliced Fish with Bee Hoon Soup,https://www.healthhub.sg/live-healthy/fish-bee...,"b'<div class=""ExternalClassF5C1DD3FA7E84963A88...",Mouthwatering sliced fish with bee hoon soup\n...,Health Promotion Board,2022-11-15T08:35:26.0000000Z,live-healthy-articles,378,True,Duplicated URL
1215,1443608,Mee Goreng,Mee Goreng,https://www.healthhub.sg/live-healthy/mee-goreng,"b'<div class=""ExternalClass60865CF1F8FA4603ABA...",By KK Womens and Childrens Hospital and Ms Hen...,KK Women's and Children's Hospital,2021-12-21T08:10:47.0000000Z,live-healthy-articles,276,True,Duplicated URL
1370,1445829,Recipe : Sayur Lodeh,Recipe : Sayur Lodeh,https://www.healthhub.sg/live-healthy/sayur-lodeh,"b'<div class=""ExternalClass8FB2510AE8C541EEADF...",Looking to bring some wholesome goodness home?...,Health Promotion Board,2022-11-15T08:35:29.0000000Z,live-healthy-articles,187,True,Duplicated URL
1369,1444751,Cooking with Chef Mel Dean... Sayur Lodeh,Cooking with Chef Mel Dean... Sayur Lodeh,https://www.healthhub.sg/live-healthy/sayur-lodeh,"b'<div class=""ExternalClassF0D563DE92F04D3C9F6...","Sayur Lodeh is everyone's favourite dish, espe...",Health Promotion Board,2022-11-15T08:35:36.0000000Z,live-healthy-articles,187,True,Duplicated URL
657,1444417,The A-Z Guide to Healthy Lifestyle Activities,The A-Z Guide to Healthy Lifestyle Activities,https://www.healthhub.sg/live-healthy/the-a-z-...,"b'<div class=""ExternalClassBA1FDDD033444100B98...","Physical activity is key to staying healthy, b...",Health Promotion Board,2022-11-15T08:41:41.0000000Z,live-healthy-articles,1047,True,Duplicated URL


In [13]:
# Show the untruncated URLs of the rows still flagged after whitelisting
df_duplicated_url["full_url"].tolist()


[1m[[0m
    [32m'https://www.healthhub.sg/live-healthy/fish-bee-hoon-soup'[0m,
    [32m'https://www.healthhub.sg/live-healthy/mee-goreng'[0m,
    [32m'https://www.healthhub.sg/live-healthy/sayur-lodeh'[0m,
    [32m'https://www.healthhub.sg/live-healthy/sayur-lodeh'[0m,
    [32m'https://www.healthhub.sg/live-healthy/the-a-z-guide-to-an-active-lifestyle'[0m
[1m][0m