In [None]:
import numpy as np
import plotly.express as px
from bs4 import BeautifulSoup

In [None]:
def check_nested_divs_recursive(element):
    """Tell whether `element` or one of its div/span descendants is a nested div.

    A tag counts as a nested div when it is a ``div`` whose direct parent is
    also a ``div``. The search only walks down through direct children that
    are ``div`` or ``span``; children of any other tag type are not descended
    into.

    Args:
        element: A parsed tag exposing ``name``, ``parent`` and
            ``find_all(names, recursive=False)``.

    Returns:
        bool: ``True`` if a nested div is found on the walked path, else ``False``.
    """
    block_tags = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "div", "span"]
    container_tags = ("div", "span")

    # Base case: this tag is itself a div sitting directly inside another div.
    if element.name == "div" and element.parent.name == "div":
        return True

    # Otherwise look one level down and recurse into the div/span containers only.
    return any(
        check_nested_divs_recursive(node)
        for node in element.find_all(block_tags, recursive=False)
        if node.name in container_tags
    )

In [None]:
def check_for_nested_divs(html_content, verbose=True):
    """Report whether an HTML fragment contains a ``<div>`` nested in a ``<div>``.

    The fragment is parsed with BeautifulSoup's ``html.parser`` and every
    top-level block tag is handed to ``check_nested_divs_recursive``.

    Args:
        html_content: HTML markup to inspect. ``None`` and float ``NaN`` (how
            pandas commonly stores a missing value) are treated as "no content".
        verbose: When true (the default, matching the previous behaviour), print
            the verdict. Pass ``False`` to stay quiet, e.g. when the function is
            applied to a whole DataFrame column.

    Returns:
        bool: ``True`` if any inspected top-level tag reports a nested div,
        otherwise ``False``. Missing content always yields ``False``.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids handing a float to BeautifulSoup, which would raise.
    is_nan = isinstance(html_content, float) and html_content != html_content
    if html_content is None or is_nan:
        return False

    soup = BeautifulSoup(html_content, "html.parser")

    # Only the direct children of the document are used as starting points.
    top_level_tags = soup.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "div", "span"],
        recursive=False,
    )
    has_nested_div = any(check_nested_divs_recursive(tag) for tag in top_level_tags)

    if verbose:
        print(f"\nHas Nested div: {has_nested_div}")

    return has_nested_div

In [None]:
# Load the Kedro IPython extension; per the Kedro docs this makes project
# objects such as `catalog` (used in the following cells) available here.
%load_ext kedro.ipython

In [None]:
# ruff: noqa: F821
# `catalog` is never assigned in this notebook (hence the F821 suppression);
# it is expected to come from the Kedro extension loaded earlier.
# Show what the catalog contains.
catalog.list()

In [None]:
# Load the "merged_data" dataset from the catalog and show it.
merged_data = catalog.load("merged_data")

display(merged_data)

In [None]:
# Keep only the identifying columns plus the raw and extracted bodies.
# `.copy()` makes this an independent frame, so the columns added to
# `df_extracted` in later cells neither trigger pandas'
# SettingWithCopyWarning nor depend on `merged_data`.
# NOTE(review): "full_url" was mentioned as an optional extra column; it is
# presumably available in `merged_data` - add it to the list if needed.
df_extracted = merged_data[
    [
        "id",
        "content_name",
        "title",
        "content_category",
        "content_body",
        "extracted_content_body",
    ]
].copy()

display(df_extracted)

In [None]:
def max_paragraph_size(row):
    """Return the word count of the longest paragraph in a row's extracted body.

    Paragraphs are the newline-separated pieces of
    ``row["extracted_content_body"]``; words are whitespace-separated tokens.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that provides the
            ``"extracted_content_body"`` key.

    Returns:
        int: The largest per-paragraph word count, or ``0`` when the body is
        missing (``None``, ``NaN`` or any other non-string value) or empty.
    """
    article = row["extracted_content_body"]

    # Missing bodies can show up as None or as a float NaN; neither can be
    # split, so anything that is not a string counts as zero words.
    if not isinstance(article, str):
        return 0

    # str.split always yields at least one piece, so max() never sees an
    # empty sequence; `default` is kept purely as a safety net.
    return max((len(paragraph.split()) for paragraph in article.split("\n")), default=0)


# Row-wise apply: store each row's longest-paragraph word count in a new column.
df_extracted["max_paragraph_size"] = df_extracted.apply(max_paragraph_size, axis=1)

In [None]:
# Show the frame now that `max_paragraph_size` has been added.
display(df_extracted)

In [None]:
# Distribution of each row's longest-paragraph word count.
# `histnorm="percent"` turns every bar into a share of all rows (in %), so the
# y-axis is labelled as a percentage rather than a raw count.
fig = px.histogram(df_extracted, x="max_paragraph_size", nbins=100, histnorm="percent")
fig.update_layout(
    title_text="Paragraph word count distribution",
    xaxis_title_text="Word count",
    yaxis_title_text="Percent",
    bargap=0.1,
)

fig.show()

In [None]:
# Median and upper-tail percentiles of the longest-paragraph word count.
# Under a normal distribution 0.841 / 0.977 / 0.999 sit at roughly +1 / +2 / +3
# standard deviations above the mean.
print(df_extracted["max_paragraph_size"].quantile([0.5, 0.841, 0.95, 0.977, 0.999]))

In [None]:
# Natural log of the word counts (presumably to tame the right-skewed distribution).
# NOTE(review): rows whose max_paragraph_size is 0 (e.g. a missing body) become
# -inf here because np.log(0) is -inf - confirm the later quantiles and plots
# are meant to include them.
df_extracted["log_max_paragraph_size"] = np.log(df_extracted["max_paragraph_size"])

In [None]:
# Inspection cut-off: the log of the 95th-percentile word count, i.e. on the
# same scale as `log_max_paragraph_size`.
threshold = np.log(df_extracted["max_paragraph_size"].quantile(0.95))
print(threshold)

# `histnorm="percent"` plots shares of rows, so the y-axis is a percentage.
fig = px.histogram(
    df_extracted, x="log_max_paragraph_size", nbins=100, histnorm="percent"
)
fig.update_layout(
    title_text=f"log(word_count) distribution for Paragraph: {threshold}",
    xaxis_title_text="log(word_count)",
    yaxis_title_text="Percent",
    bargap=0.1,
)
# Mark the cut-off on the histogram.
fig.add_vline(x=threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Percentiles of the log-transformed sizes. Under a normal distribution
# 0.5 / 0.841 / 0.977 / 0.999 correspond to roughly 0 / 1 / 2 / 3 standard
# deviations above the mean; 0.8 and 0.95 are additional reference points.
print(
    df_extracted["log_max_paragraph_size"].quantile(
        [0.5, 0.8, 0.841, 0.95, 0.977, 0.999]
    )
)

In [None]:
# Percentile rank of every row by log_max_paragraph_size: `pct=True` scales the
# ascending ranks to fractions of the ranked rows.
df_extracted["percentile_rank"] = df_extracted["log_max_paragraph_size"].rank(
    pct=True, ascending=True
)

In [None]:
# Show the frame with the log size and percentile rank columns added.
display(df_extracted)

In [None]:
# Rows whose longest paragraph lies above the cut-off, largest first.
# Built as one chained expression ending in `.copy()`: sorting a boolean-mask
# slice in place triggers pandas' SettingWithCopyWarning, and `df_inspect`
# receives a new column in a later cell, so it must be an independent frame.
df_inspect = (
    df_extracted[df_extracted["log_max_paragraph_size"] > threshold]
    .sort_values(by="max_paragraph_size", ascending=False)
    .copy()
)

In [None]:
# Show the rows selected for inspection.
display(df_inspect)

In [None]:
# Flag every row whose raw HTML body is reported to contain a nested div.
# The check needs only `content_body`, so it is applied to that column
# directly instead of building a full row object for every record.
df_extracted["has_nested_div"] = df_extracted["content_body"].apply(
    check_for_nested_divs
)

# Rows where a nested div was detected.
display(df_extracted[df_extracted["has_nested_div"]])

In [None]:
# Rows where no nested div was detected.
display(df_extracted[~df_extracted["has_nested_div"]])

In [None]:
# Number of rows with and without a detected nested div.
df_extracted["has_nested_div"].value_counts()

In [None]:
# Run the nested-div check on the long-paragraph rows only. It is applied to
# the `content_body` column directly because no other column is needed.
df_inspect["has_nested_div"] = df_inspect["content_body"].apply(check_for_nested_divs)

# Long-paragraph rows where no nested div was detected, largest first.
false_check = df_inspect[~df_inspect["has_nested_div"]]
display(false_check.sort_values(by=["max_paragraph_size"], ascending=False))

In [None]:
# Number of long-paragraph rows with and without a detected nested div.
df_inspect["has_nested_div"].value_counts()

In [None]:
# Re-run the check on the first row of `false_check` for manual inspection.
# NOTE(review): `iloc[0]` raises IndexError when `false_check` is empty.
query = false_check
sample = query.iloc[0]["content_body"]
# print(sample)

check_for_nested_divs(sample)