In [None]:
import math
import re
import string
from functools import reduce
from typing import Union

import pandas as pd

In [None]:
# Load the `kedro.ipython` extension into this kernel.
# NOTE(review): no visible cell appears to use anything this extension would
# provide (the data below is read directly with pandas) — confirm it is needed.
%load_ext kedro.ipython

In [None]:
# Read the merged dataset; the path is resolved relative to the kernel's
# current working directory.
merged_data = pd.read_parquet(path="../data/merged_data.parquet")

# Rich preview of the freshly loaded frame.
display(merged_data)

In [None]:
# Keep only the rows that are not flagged in the "to_remove" column.
flagged_for_removal = merged_data["to_remove"]
df_keep = merged_data.loc[~flagged_for_removal]

display(df_keep)

In [None]:
# Content categories that are retained; every other category is dropped.
relevant_categories = [
    "cost-and-financing",
    "live-healthy-articles",
    "diseases-and-conditions",
    "medical-care-and-facilities",
    "support-group-and-others",
]

# Boolean mask: True where the row's content category is in the list above.
in_relevant_category = df_keep["content_category"].isin(relevant_categories)
df_keep = df_keep.loc[in_relevant_category]

display(df_keep)

In [None]:
# Select the columns used by the rest of the notebook.
# `.copy()` makes `df_extracted` an independent frame: it is derived from a
# filtered slice and later gets new columns assigned, which would otherwise
# trigger pandas' SettingWithCopyWarning.
df_extracted = df_keep[
    [
        "id",
        "content_name",
        "title",
        "article_category_names",
        "full_url",
        "friendly_url",
        "category_description",
        "content_category",
        "content_body",
        "pr_name",
        "has_table",
        "has_image",
        "related_sections",
        "extracted_tables",
        "extracted_links",
        "extracted_headers",
        "extracted_img_alt_text",
        "extracted_content_body",
    ]
].copy()

display(df_extracted)

In [None]:
def hemmingway_score(text: str) -> dict[str, Union[str, int]]:
    """
    Calculates the Hemingway readability score of the given text.

    The score is the Automated Readability Index formula, rounded up:
    ``4.71 * (letters / words) + 0.5 * (words / sentences) - 21.43``.
    Hyperlinks are removed before counting. A sentence ends at a newline or
    at any of ``.``, ``!`` and ``?``. Words are whitespace-separated tokens
    with ASCII punctuation stripped; every remaining character counts as a
    letter.

    Args:
        text (str): The text to be scored. Non-string values (for example
            ``None`` or ``NaN`` from a missing DataFrame cell) are treated as
            empty text.

    Returns:
        dict[str, Union[str, int]]: A dictionary with two keys:
            ``"score"`` (int): the rounded-up readability score.
            ``"level"`` (str): ``"normal"`` for scores below 10, ``"hard"``
            for scores from 10 to 13 and ``"very hard"`` otherwise.
            Text without any countable word yields
            ``{"score": 0, "level": "normal"}`` instead of raising
            ``ZeroDivisionError``.
    """
    # Missing values cannot be scored; handle them like empty text.
    if not isinstance(text, str):
        text = ""

    # Compile regex to detect punctuations
    punctuation = re.compile("[%s]" % re.escape(string.punctuation))

    # Remove all hyperlinks
    filtered_text = re.sub(r"https?:\/\/[^\s]+", "", text)

    # Split into lines first so that a newline always ends a sentence, then
    # split every line on sentence-ending punctuation. Empty pieces are dropped.
    sentences = [
        sentence.strip()
        for line in filtered_text.split("\n")
        for sentence in re.split(r"[.!?]", line)
    ]
    sentences = [sentence for sentence in sentences if sentence]

    # Split on any whitespace (not just the space character) so tabs and
    # non-breaking spaces separate words instead of being counted as letters.
    words = [
        punctuation.sub("", token)
        for sentence in sentences
        for token in sentence.split()
    ]
    # Tokens made up solely of punctuation become empty and are not words.
    words = [word for word in words if word]

    # Count the number of sentences, words and letters
    num_sentences = len(sentences)
    num_words = len(words)
    num_letters = sum(len(word) for word in words)

    # Nothing to score: avoid dividing by zero in the formula below.
    if num_words == 0 or num_sentences == 0:
        return {"score": 0, "level": "normal"}

    # Calculate the Hemingway Score
    score = math.ceil(
        4.71 * (num_letters / num_words) + 0.5 * (num_words / num_sentences) - 21.43
    )

    # Get the reading level of the text based on the calculated score
    if score < 10:
        level = "normal"
    elif score < 14:
        level = "hard"
    else:
        level = "very hard"

    return {"score": score, "level": level}

In [None]:
def calculate_score(text):
    """Return only the ``"score"`` entry of ``hemmingway_score(text)``."""
    return hemmingway_score(text)["score"]


def calculate_level(text):
    """Return only the ``"level"`` entry of ``hemmingway_score(text)``."""
    return hemmingway_score(text)["level"]


# Score every extracted body exactly once and derive both columns from that
# single pass, rather than running the scorer separately for each column.
readability_metrics = df_extracted["extracted_content_body"].apply(hemmingway_score)

# `assign` returns a new frame, so no column is written into a DataFrame that
# was itself sliced out of another one (avoids SettingWithCopyWarning).
df_extracted = df_extracted.assign(
    readability_score=readability_metrics.apply(lambda metrics: metrics["score"]),
    reading_level=readability_metrics.apply(lambda metrics: metrics["level"]),
)
display(df_extracted)

In [None]:
# Count of non-null ids per content category, largest group first.
content_category_counts = df_extracted.groupby(["content_category"])["id"].count()
print(content_category_counts.sort_values(ascending=False))

In [None]:
# Count of non-null ids per "pr_name" value, largest group first.
pr_name_counts = df_extracted.groupby(["pr_name"])["id"].count()
print(pr_name_counts.sort_values(ascending=False))

In [None]:
# Count of non-null ids per "article_category_names" value, largest group first.
article_category_counts = df_extracted.groupby(["article_category_names"])[
    "id"
].count()
print(article_category_counts.sort_values(ascending=False))

In [None]:
# Count of non-null ids per title, largest group first (values above 1 mean
# the same title occurs on several rows).
title_counts = df_extracted.groupby(["title"])["id"].count()
print(title_counts.sort_values(ascending=False))

In [None]:
# Count of non-null ids per content name, largest group first.
content_name_counts = df_extracted.groupby(["content_name"])["id"].count()
print(content_name_counts.sort_values(ascending=False))

In [None]:
# Count of non-null ids per reading level, largest group first.
reading_level_counts = df_extracted.groupby(["reading_level"])["id"].count()
print(reading_level_counts.sort_values(ascending=False))

In [None]:
# Restrict to the rows whose "pr_name" is exactly "Health Promotion Board".
is_hpb = df_extracted["pr_name"].eq("Health Promotion Board")
df_hpb = df_extracted.loc[is_hpb]

display(df_hpb)

In [None]:
# Count of non-null ids per reading level within the Health Promotion Board
# subset, largest group first.
hpb_reading_level_counts = df_hpb.groupby(["reading_level"])["id"].count()
print(hpb_reading_level_counts.sort_values(ascending=False))