# Monitoring the quality of the extracted text


## Import relevant libraries


In [None]:
import math

import numpy as np
import plotly.express as px
from bs4 import BeautifulSoup

## Functions to check for nested divs


In [None]:
def check_nested_divs_recursive(element):
    """Return True if ``element`` or a visited descendant is a div directly inside a div.

    The walk only descends into *direct* children whose tag is one of the
    heading, paragraph, list, div or span tags listed below; a div nested
    beneath any other tag (e.g. a table) is never visited.

    Args:
        element: A parsed HTML node exposing ``name``, ``parent`` and
            ``find_all(names, recursive=False)``.

    Returns:
        bool: True as soon as a div whose parent is also a div is found,
        otherwise False.
    """
    parent = element.parent
    # A detached element has no parent, so it cannot be a nested div itself.
    if element.name == "div" and parent is not None and parent.name == "div":
        return True

    # Only direct children of the listed tag types are followed.
    direct_children = element.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "div", "span"],
        recursive=False,
    )
    # any() stops at the first nested div instead of walking the whole subtree.
    return any(check_nested_divs_recursive(child) for child in direct_children)

In [None]:
def check_for_nested_divs(html_content):
    """Report whether the given HTML contains a div nested directly in a div.

    The HTML is parsed with BeautifulSoup's ``html.parser`` and every
    top-level heading, paragraph, list, div or span tag is handed to
    ``check_nested_divs_recursive``.

    Args:
        html_content: Raw HTML markup, or None when the article has no body.

    Returns:
        bool: False for a None input; otherwise True if any top-level tag
        reports a nested div.
    """
    if html_content is None:
        return False

    top_level_tags = BeautifulSoup(html_content, "html.parser").find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "div", "span"],
        recursive=False,
    )

    # Every top-level tag is checked before the results are combined.
    results = [check_nested_divs_recursive(tag) for tag in top_level_tags]
    return any(results)

In [None]:
def count_divs(html_content):
    """Count every ``<div>`` tag, at any depth, in the given HTML.

    Args:
        html_content: Raw HTML markup, or None when the article has no body.

    Returns:
        int: Number of div tags found; 0 when ``html_content`` is None, so
        the result is always an integer rather than a bool/int mix.
    """
    if html_content is None:
        return 0

    soup = BeautifulSoup(html_content, "html.parser")

    return len(soup.find_all("div", recursive=True))

## Load Merged Data from Kedro Catalog


In [None]:
%load_ext kedro.ipython

In [None]:
# ruff: noqa: F821
# `catalog` is not defined in this notebook; presumably it is injected by the
# kedro.ipython extension loaded above (hence the F821 suppression) -- confirm.
catalog.list()

In [None]:
# Load the "merged_data" dataset through the catalog and preview it.
merged_data = catalog.load("merged_data")

display(merged_data)

## Keep only rows where `to_remove` is False


In [None]:
# Keep only the rows that are not marked for removal, then preview them.
is_kept = ~merged_data["to_remove"]
df_keep = merged_data[is_kept]

display(df_keep)

## Keep only relevant Content Categories


In [None]:
# Content categories retained for the rest of the analysis.
relevant_categories = [
    "cost-and-financing",
    "live-healthy-articles",
    "diseases-and-conditions",
    "medical-care-and-facilities",
    "support-group-and-others",
]

# Restrict the working frame to rows whose category is in the list above.
in_relevant_category = df_keep["content_category"].isin(relevant_categories)
df_keep = df_keep[in_relevant_category]

display(df_keep)

## Keep Relevant Columns


In [None]:
# Select the columns needed for inspection. An explicit copy is taken because
# later cells add new columns to df_extracted; assigning into a selection of
# df_keep would otherwise trigger pandas' SettingWithCopyWarning.
df_extracted = df_keep[
    [
        "id",
        "content_name",
        "full_url",
        "title",
        "content_category",
        "content_body",
        "extracted_content_body",
    ]
].copy()

display(df_extracted)

In [None]:
# Number of articles (non-null ids) in each content category.
articles_per_category = df_extracted.groupby(["content_category"])["id"].count()
print(articles_per_category)

## Count the number of div tags in article


In [None]:
# Count the div tags in each article's raw HTML body, row by row.
div_counts = df_extracted.apply(lambda row: count_divs(row["content_body"]), axis=1)
df_extracted["div_count"] = div_counts

In [None]:
# Preview the frame with the new div_count column.
display(df_extracted)

In [None]:
# Cut-off for the div count; drawn on the histogram here and reused by later
# cells when filtering and flagging articles.
div_count_threshold = 5

# Distribution of div counts across all articles.
fig = px.histogram(df_extracted, x="div_count", nbins=40)
fig.update_layout(
    title_text="Article div count distribution",
    xaxis_title_text="Div count",
    yaxis_title_text="Count",
    bargap=0.1,
)

# Dashed vertical line marks the threshold.
fig.add_vline(x=div_count_threshold, line_dash="dash", line_color="firebrick")
# fig.add_hline(y=100, line_dash="dash", line_color="red")
fig.show()

In [None]:
# Per category, count the articles whose div count is at or above the threshold.
at_or_above_threshold = df_extracted["div_count"] >= div_count_threshold
div_counts_by_category = (
    df_extracted[at_or_above_threshold].groupby(["content_category"])["div_count"].count()
)

print(div_counts_by_category)

## Check for Nested divs in article


In [None]:
# Check each article's raw HTML body for a div nested directly inside a div.
nested_div_flags = df_extracted.apply(
    lambda row: check_for_nested_divs(row["content_body"]), axis=1
)
df_extracted["has_nested_div"] = nested_div_flags

display(df_extracted)

In [None]:
# How many articles do / do not contain nested divs.
print(df_extracted["has_nested_div"].value_counts())

In [None]:
# Articles at or below the div-count threshold that still contain nested divs.
# NOTE(review): `<=` overlaps with the `>=` comparison used elsewhere in this
# notebook when div_count equals the threshold -- confirm whether `<` is intended.
df_extracted[
    (df_extracted["div_count"] <= div_count_threshold)
    & (df_extracted["has_nested_div"])
]

In [None]:
# Per category, count the articles at or below the div-count threshold that
# contain nested divs.
# NOTE(review): `<=` overlaps with the `>=` comparison used elsewhere in this
# notebook when div_count equals the threshold -- confirm whether `<` is intended.
nested_divs_by_category = (
    df_extracted[
        (df_extracted["div_count"] <= div_count_threshold)
        & (df_extracted["has_nested_div"])
    ]
    .groupby(["content_category"])["has_nested_div"]
    .count()
)

print(nested_divs_by_category)

## Flag articles by `div_count` and `has_nested_div`


In [None]:
def flag_articles(df, threshold):
    """Flag articles with a high div count and/or nested divs.

    Two columns are written to ``df`` in place and recomputed from scratch
    on every call, so running the cell again (or with another threshold)
    never leaves stale flags or duplicated reasons behind:

    * ``flagged`` -- True when ``div_count >= threshold`` or
      ``has_nested_div`` is truthy.
    * ``type`` -- the reason(s) for the flag, joined with ``", "``; None
      for rows that are not flagged.

    Args:
        df: DataFrame with ``div_count`` and ``has_nested_div`` columns.
        threshold: Minimum div count at which an article is flagged.

    Returns:
        The same DataFrame object, with both columns added or overwritten.
    """
    exceeds_threshold = (df["div_count"] >= threshold).to_numpy(dtype=bool)
    has_nested_div = df["has_nested_div"].astype(bool).to_numpy()

    # Object array so unflagged rows keep None as their reason.
    reasons = np.full(len(df), None, dtype=object)
    reasons[exceeds_threshold & ~has_nested_div] = "div count exceeds threshold"
    reasons[~exceeds_threshold & has_nested_div] = "has nested div"
    reasons[exceeds_threshold & has_nested_div] = (
        "div count exceeds threshold, has nested div"
    )

    # Positional assignment keeps this correct even with a non-unique index.
    df["flagged"] = exceeds_threshold | has_nested_div
    df["type"] = reasons

    return df

In [None]:
# Flag the articles, then order them from the highest div count to the lowest.
df_flagged = flag_articles(df_extracted, div_count_threshold)
df_flagged = df_flagged.sort_values(by="div_count", ascending=False)

In [None]:
# Show only the flagged articles.
display(df_flagged[df_flagged["flagged"]])

In [None]:
# Number of flagged articles in each content category.
flagged_rows = df_flagged[df_flagged["flagged"]]
flagged_by_category = flagged_rows.groupby(["content_category"])["flagged"].count()

print(flagged_by_category)

In [None]:
# df_flagged.to_excel("flagged_articles.xlsx")

## Inspecting for Poor Text Extraction


### Using the Word Count of the Largest Paragraph in each Article as a Heuristic


In [None]:
def max_paragraph_size(row):
    """Return the word count of the longest paragraph in an article.

    Paragraphs are the newline-separated segments of
    ``row["extracted_content_body"]``; words are whitespace-separated tokens.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an
            ``extracted_content_body`` entry.

    Returns:
        int: The largest per-paragraph word count, or 0 when the body is
        missing (None or NaN), is not a string, or contains no words.
    """
    article = row["extracted_content_body"]
    # Missing bodies can surface as None or as a float NaN; neither can be split.
    if not isinstance(article, str):
        return 0

    return max(
        (len(paragraph.split()) for paragraph in article.split("\n")),
        default=0,
    )

In [None]:
# Word count of the longest paragraph in each article's extracted text.
df_extracted["max_paragraph_size"] = df_extracted.apply(max_paragraph_size, axis=1)

In [None]:
# Preview the frame with the new max_paragraph_size column.
display(df_extracted)

### Visualising the `Max Paragraph Size` of each article (Log-normal)


In [None]:
# Histogram of the longest-paragraph word count across all articles.
fig = px.histogram(df_extracted, x="max_paragraph_size", nbins=100)
fig.update_layout(
    title_text="Paragraph word count distribution",
    xaxis_title_text="Word count",
    yaxis_title_text="Count",
    bargap=0.1,
)

fig.show()

#### Inspecting the tails of the distribution


In [None]:
# Quantile levels are the standard normal CDF at -3, -2, -1, 0, +1, +2 and +3
# standard deviations, so the lower levels mirror the upper ones
# (0.023 = 1 - 0.977, 0.159 = 1 - 0.841).
print(
    df_extracted["max_paragraph_size"].quantile(
        [0.001, 0.023, 0.159, 0.5, 0.841, 0.977, 0.999]
    )
)

### Visualising the Normal Distribution using `log_max_paragraph_size`


In [None]:
# Natural log of the longest-paragraph word count.
# NOTE: max_paragraph_size returns 0 for articles without extracted text, and
# np.log(0) evaluates to -inf (with a RuntimeWarning), so such rows get -inf here.
df_extracted["log_max_paragraph_size"] = np.log(df_extracted["max_paragraph_size"])

#### Setting the Lower & Upper Threshold

In [None]:
import math

In [None]:
# Lower cut-off: the 2.5th percentile of the word counts, rounded up to the
# next multiple of 10 and moved to the log scale.
lower_quantile = df_extracted["max_paragraph_size"].quantile(0.025)
lower_threshold = np.log(math.ceil(lower_quantile / 10) * 10)
print(lower_threshold)

In [None]:
# Upper cut-off: the 97.5th percentile of the word counts, rounded down to a
# multiple of 10 and moved to the log scale.
upper_quantile = df_extracted["max_paragraph_size"].quantile(0.975)
upper_threshold = np.log(math.floor(upper_quantile / 10) * 10)
print(upper_threshold)

In [None]:
# Histogram of the log word counts; the title also prints the upper threshold.
fig = px.histogram(df_extracted, x="log_max_paragraph_size", nbins=100)
fig.update_layout(
    title_text=f"log(word_count) distribution for Paragraph: {upper_threshold}",
    xaxis_title_text="log(word_count)",
    yaxis_title_text="Count",
    bargap=0.1,
)
# Dashed vertical lines mark the lower and upper thresholds.
fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Percentiles at -3, -2, -1, 0, +1, +2 and +3 standard deviations from the mean.
# The levels are the standard normal CDF at those points, so the lower levels
# mirror the upper ones (0.023 = 1 - 0.977, 0.159 = 1 - 0.841).
print(
    df_extracted["log_max_paragraph_size"].quantile(
        [0.001, 0.023, 0.159, 0.5, 0.841, 0.977, 0.999]
    )
)

#### Adding the `percentile` rank for each record


In [None]:
# Percentile rank (as a fraction, via pct=True) of each article's log word
# count, ranked in ascending order.
df_extracted["percentile_rank"] = df_extracted["log_max_paragraph_size"].rank(
    pct=True, ascending=True
)

In [None]:
# Preview the frame with the new percentile_rank column.
display(df_extracted)

### Inspecting articles above the upper threshold


In [None]:
# Articles whose log word count exceeds the upper threshold, largest first.
# sort_values returns a new frame, which avoids sorting a filtered selection
# of df_extracted in place (a source of SettingWithCopyWarning).
df_inspect_higher = df_extracted[
    df_extracted["log_max_paragraph_size"] > upper_threshold
].sort_values(by="max_paragraph_size", ascending=False)

print(df_inspect_higher.shape)

In [None]:
# Preview the articles above the upper threshold.
display(df_inspect_higher)

In [None]:
# Word-count histogram restricted to the articles above the upper threshold.
fig = px.histogram(df_inspect_higher, x="max_paragraph_size", nbins=50)
fig.update_layout(
    title_text="Paragraph word count distribution",
    xaxis_title_text="Word count",
    yaxis_title_text="Count",
    bargap=0.1,
)

fig.show()

In [None]:
# Export file to Excel

# df_inspect_higher.to_excel("inspection.xlsx")

#### Remove articles with nested div containers


In [None]:
# Split of nested-div vs. non-nested-div articles above the upper threshold.
df_inspect_higher["has_nested_div"].value_counts()

In [None]:
# Recompute the nested-div check from the raw HTML body for the long-paragraph
# articles.
df_inspect_higher["has_nested_div"] = df_inspect_higher.apply(
    lambda row: check_for_nested_divs(row["content_body"]), axis=1
)

# Keep only the articles without nested divs and show them, largest paragraph
# first. The sort applies to the displayed frame only (inplace=False).
df_no_nested_divs = df_inspect_higher[~df_inspect_higher["has_nested_div"]]
display(
    df_no_nested_divs.sort_values(
        by=["max_paragraph_size"], ascending=False, inplace=False
    )
)

In [None]:
# Print the extracted text of the first article without nested divs.
# `query` is reused by the next cell to print the same article's raw HTML.
query = df_no_nested_divs
extracted_text = query.iloc[0]["extracted_content_body"]

print(extracted_text)

In [None]:
# Print the raw HTML of the first row of `query` for side-by-side comparison.
raw_html = query.iloc[0]["content_body"]

print(raw_html)

### Inspecting articles below the lower threshold


In [None]:
# Articles whose log word count falls below the lower threshold, smallest first.
# sort_values returns a new frame, which avoids sorting a filtered selection
# of df_extracted in place (a source of SettingWithCopyWarning).
df_inspect_lower = df_extracted[
    df_extracted["log_max_paragraph_size"] < lower_threshold
].sort_values(by="max_paragraph_size", ascending=True)

print(df_inspect_lower.shape)

In [None]:
# Preview the articles below the lower threshold.
display(df_inspect_lower)

In [None]:
# Print the extracted text of the first article below the lower threshold.
extracted_text = df_inspect_lower.iloc[0]["extracted_content_body"]

print(extracted_text)

In [None]:
# Print the raw HTML of the same article for comparison with its extracted text.
raw_html = df_inspect_lower.iloc[0]["content_body"]

print(raw_html)