In [None]:
import re

import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
# Read the merged dataset (parquet) from the sibling `data` directory
merged_data = pd.read_parquet(path="../data/merged_data.parquet")

display(merged_data)  # noqa: F821

In [None]:
# Drop the rows flagged in `to_remove` (used here as a boolean mask)
df_keep = merged_data.loc[~merged_data["to_remove"]]

display(df_keep)  # noqa: F821

In [None]:
# Content categories retained for the rest of the analysis
relevant_categories = [
    "cost-and-financing",
    "live-healthy-articles",
    "diseases-and-conditions",
    "medical-care-and-facilities",
    "support-group-and-others",
]

# Restrict the kept rows to those categories
df_keep = df_keep.loc[df_keep["content_category"].isin(relevant_categories)]

display(df_keep)  # noqa: F821

In [None]:
# List the available columns of the filtered frame
print(df_keep.columns)

In [None]:
# Keep only the columns needed for the word-count analysis. The commented-out
# entries are the remaining columns, left in place so they can be toggled back on.
# `.copy()` makes `df_extracted` an independent frame: new columns are added to it
# further down, and writing into a slice of `df_keep` triggers pandas'
# SettingWithCopyWarning.
df_extracted = df_keep[
    [
        "id",
        "content_name",
        "title",
        "article_category_names",
        "full_url",
        # "friendly_url",
        # "category_description",
        "content_category",
        # "content_body",
        "pr_name",
        # "has_table",
        # "has_image",
        # "related_sections",
        # "extracted_tables",
        # "extracted_raw_html_tables",
        # "extracted_links",
        # "extracted_headers",
        # "extracted_images",
        "extracted_content_body",
        "l1_mappings",
        "l2_mappings",
    ]
].copy()

display(df_extracted)  # noqa: F821

In [None]:
def calculate_word_count(text: str) -> int:
    r"""Count the words in ``text``.

    A word is a maximal run of ``\w`` characters (letters, digits, underscore),
    so punctuation and hyphens act as separators: ``"well-being"`` is two words.

    Args:
        text: The text to count words in.

    Returns:
        The number of words found. Non-string input (e.g. ``None`` or ``NaN``
        for a missing body) counts as 0 words instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(r"\w+", text))


# Add the per-article word count of the extracted body. `assign` returns a new
# frame instead of writing into `df_extracted` in place, which avoids pandas'
# SettingWithCopyWarning when `df_extracted` is a slice of another frame.
df_extracted = df_extracted.assign(
    word_count=df_extracted["extracted_content_body"].apply(calculate_word_count)
)

display(df_extracted)  # noqa: F821

In [None]:
# Add the natural log of the word count. Articles with zero words are masked to
# NaN first: np.log(0) would yield -inf and emit a RuntimeWarning.
# `assign` returns a new frame rather than writing into a possible slice in place.
df_extracted = df_extracted.assign(
    log_word_count=lambda frame: np.log(
        frame["word_count"].where(frame["word_count"] > 0)
    )
)

display(df_extracted)  # noqa: F821

In [None]:
# Horizontal box plot of word count, one box per content category.
fig = px.box(
    df_extracted,
    x="word_count",
    color="content_category",
    # Show every column on hover except the full article text, which would
    # otherwise be embedded in the figure for each point and flood the tooltip.
    hover_data=[
        column
        for column in df_extracted.columns
        if column != "extracted_content_body"
    ],
)
fig.update_layout(
    title_text="Word Count Box Plot for Extracted Article Content",
    xaxis_title_text="Word Count",
    # The boxes are separated along the y-axis by category; this axis is not a
    # count, and `bargap` does not apply to box traces.
    yaxis_title_text="Content Category",
)

fig.show()

In [None]:
# Histogram of word count, one facet row per content category.
fig = px.histogram(
    df_extracted,
    x="word_count",
    color="content_category",
    facet_row="content_category",
    nbins=40,
    width=1000,
    height=1280,
)
# Facet annotations read "content_category=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(
    title_text="Word Count distribution for Extracted Article Content",
    xaxis_title_text="Word Count",
    bargap=0.1,
)
# Each facet row has its own y-axis; `yaxis_title_text` in update_layout only
# targets the first one, so set the title on all of them.
fig.update_yaxes(title_text="Count")
fig.show()

In [None]:
# Histogram of log word count, one facet row per content category.
fig = px.histogram(
    df_extracted,
    x="log_word_count",
    color="content_category",
    facet_row="content_category",
    nbins=40,
    width=1000,
    height=1280,
)
# Facet annotations read "content_category=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(
    title_text="Log Word Count distribution for Extracted Article Content",
    xaxis_title_text="Log Word Count",
    bargap=0.1,
)
# Each facet row has its own y-axis; `yaxis_title_text` in update_layout only
# targets the first one, so set the title on all of them.
fig.update_yaxes(title_text="Count")
fig.show()

In [None]:
# Print the word-count quantiles for each content category.
# The quantile levels mirror around the median (0.001/0.999, 0.159/0.841,
# 0.25/0.75), matching the normal CDF at -3, -2, -1, +1, +2, +3 standard
# deviations plus the quartiles. The counterpart of 0.977 is 0.023; the previous
# value 0.021 broke that symmetry.
for category in relevant_categories:
    df_content = df_extracted[df_extracted["content_category"] == category]
    print(
        f"Distribution of Word Count for {category}: {len(df_content)} articles",
        df_content["word_count"].quantile(
            [0.001, 0.023, 0.159, 0.25, 0.5, 0.75, 0.841, 0.977, 0.999]
        ),
        sep="\n",
        end="\n\n",
    )

In [None]:
# Articles whose `pr_name` is "Health Promotion Board". `.copy()` makes `df_hpb`
# an independent frame: a flag column is added to it further down, and writing
# into a slice of `df_extracted` triggers pandas' SettingWithCopyWarning.
df_hpb = df_extracted[df_extracted["pr_name"] == "Health Promotion Board"].copy()

display(df_hpb)  # noqa: F821

In [None]:
# Print the word-count quantiles for each content category, restricted to the
# Health Promotion Board articles in `df_hpb`.
# The quantile levels mirror around the median (0.001/0.999, 0.159/0.841,
# 0.25/0.75), matching the normal CDF at -3, -2, -1, +1, +2, +3 standard
# deviations plus the quartiles. The counterpart of 0.977 is 0.023; the previous
# value 0.021 broke that symmetry.
for category in relevant_categories:
    df_content = df_hpb[df_hpb["content_category"] == category]
    print(
        f"Distribution of Word Count for {category}: {len(df_content)} articles",
        df_content["word_count"].quantile(
            [0.001, 0.023, 0.159, 0.25, 0.5, 0.75, 0.841, 0.977, 0.999]
        ),
        sep="\n",
        end="\n\n",
    )

In [None]:
def flag_below_word_count(content_category: str, text: str) -> bool:
    r"""Tell whether an article body is shorter than its category's threshold.

    Words are counted as maximal runs of ``\w`` characters, the same rule
    ``calculate_word_count`` uses, so the count is comparable with the
    ``word_count`` column that the quantile summaries are computed on.
    (Whitespace splitting gives a different count, e.g. for hyphenated words.)

    Args:
        content_category: Category of the article; selects the threshold.
        text: The article body. Non-string input (e.g. ``None``/``NaN`` for a
            missing body) is treated as having 0 words.

    Returns:
        True when the word count is strictly below the category threshold
        (300 words for categories without a dedicated threshold), else False.
    """
    word_count_threshold_dict = {
        "cost-and-financing": 267,
        "live-healthy-articles": 413,
        "diseases-and-conditions": 368,
        "medical-care-and-facilities": 202,
        "support-group-and-others": 213,
    }
    word_count = len(re.findall(r"\w+", text)) if isinstance(text, str) else 0
    # Categories without a dedicated threshold fall back to 300 words.
    threshold = word_count_threshold_dict.get(content_category, 300)
    return word_count < threshold


# Flag every article whose body falls below its category's word-count threshold.
# `assign` returns a new frame instead of writing into `df_hpb` in place, which
# avoids pandas' SettingWithCopyWarning when `df_hpb` is a slice of another frame.
# The flags are built as an explicit boolean Series so an empty frame still
# yields a boolean column.
df_hpb = df_hpb.assign(
    flag_below_word_count=pd.Series(
        [
            flag_below_word_count(content_category, body)
            for content_category, body in zip(
                df_hpb["content_category"], df_hpb["extracted_content_body"]
            )
        ],
        index=df_hpb.index,
        dtype=bool,
    )
)

display(df_hpb)  # noqa: F821

In [None]:
# Subset of rows whose `flag_below_word_count` is True
df_hpb_flagged = df_hpb.loc[df_hpb["flag_below_word_count"]]

display(df_hpb_flagged)  # noqa: F821

In [None]:
# Number of flagged articles per content category
print(df_hpb_flagged["content_category"].value_counts())