## Import Packages


In [None]:
# Install required packages

# !pip install openpyxl beautifulsoup4 lxml html5lib
# !pip install htmlmin

In [None]:
# Import relevant packages

import json
import os
import re
import zipfile
from io import StringIO
from unicodedata import normalize

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# import htmlmin
from markdownify import markdownify

## Get .xlsx files from ZIP archive


### Utility Functions


In [None]:
# Create zip archive


def create_archive(zip_path):
    """Open the ZIP file at ``zip_path`` in read mode and return the handle.

    The ``zipfile.ZipFile`` object is returned still open; closing it is left
    to the caller.
    """
    return zipfile.ZipFile(zip_path, mode="r")

In [None]:
# Get xlsx files from archive


def get_xlsx_from_archive(archive):
    """Return the archive member names whose last dot-separated part is ``xlsx``.

    ``archive`` only needs a ``namelist()`` method. The comparison is
    case-sensitive and the order reported by the archive is preserved.
    """
    return [
        member
        for member in archive.namelist()
        if member.rsplit(".", 1)[-1] == "xlsx"
    ]

In [None]:
# Get file name


def get_fname(fpath):
    """Return the file name in ``fpath`` without its directory or final extension.

    Only the last extension is removed, and dots in directory names are
    ignored, so ``"../data/report.xlsx"`` gives ``"report"`` and
    ``"dir/v1.2-notes.xlsx"`` gives ``"v1.2-notes"``. Splitting the whole path
    on its first dot would return ``""`` and ``"v1"`` for those inputs.
    """
    return os.path.splitext(os.path.basename(fpath))[0]

### Procedure


In [None]:
# Open zip file and extract directories and files


# Open the export archive (the path is relative to the notebook's working
# directory) and list every member name it contains.
archive = create_archive("../data/GenAI - Full Content Export.zip")
files_and_dirs = archive.namelist()

print(files_and_dirs)

In [None]:
# Get xlsx files from archive

# Rebinds `archive` to a new handle on the same ZIP file; nothing in this cell
# closes a handle that `archive` may already refer to.
archive = create_archive("../data/GenAI - Full Content Export.zip")
xlsx_files = get_xlsx_from_archive(archive)

print(xlsx_files)

In [None]:
# Get file names from archive

# Print the name returned by get_fname for each workbook path in the archive.
for xlsx in xlsx_files:
    fname = get_fname(xlsx)
    print(fname)

## Naive Exploratory Data Analysis (Single .xlsx file)


In [None]:
# Open the first workbook listed in `xlsx_files` directly from the archive.
file = archive.open(xlsx_files[0])

### Display .xlsx as Pandas Dataframe


In [None]:
# Load excel file to dataframe

# Parse the opened workbook member into a DataFrame.
df = pd.read_excel(file)

# `display` is not imported in this notebook (IPython provides it), hence the
# undefined-name suppression below.
# ruff: noqa: F821
display(df)

In [None]:
# Display information on dataframe

# Summarise the columns of the raw frame (dtypes and non-null counts).
df.info()

### Filter Columns where all values are NaN


In [None]:
# Drop columns where all values are NaN (irrelevant columns)

# how="all" drops a column only when every value in it is missing; `df` itself
# is left untouched.
df_filtered = df.dropna(axis="columns", how="all")

# ruff: noqa: F821
display(df_filtered)

In [None]:
# df_filtered.to_parquet('./export-published-cost-and-financing_14062024_data.parquet')

In [None]:
# Get columns present

# List the column labels that survived the all-NaN filter.
print(df_filtered.columns)

### Find ContentBody Column in DataFrame


In [None]:
# Take the first column whose name contains "ContentBody"; the `[0]` raises
# IndexError when no column matches.
col = df_filtered.columns[df_filtered.columns.str.contains("ContentBody")][0]
print(col)

In [None]:
# Series holding the content-body value of every row.
raw_html = df_filtered[col]

# ruff: noqa: F821
display(raw_html.head(15))

## Text Extraction


In [None]:
# Extract HTML sample

# NOTE(review): `raw_html[0]` looks the value up by index label 0 — assumes the
# frame still has the default integer index from read_excel; confirm.
sample = raw_html[0]

print(sample)

In [None]:
# `pd.isna` recognises any missing value (NaN, None, pd.NA), whereas an
# identity test against `np.nan` only matches that one specific object.
print(pd.isna(sample))

### Method 1: Scraping ContentBody using BeautifulSoup


In [None]:
# Parse the sample with the lxml parser.
soup = BeautifulSoup(sample, "lxml")

# Some ContentBody values are wrapped with a div class HTML element
# `soup.div` is the first <div> in the document; unwrap() modifies `soup` in place.
if soup.div is not None:
    soup.div.unwrap()

In [None]:
# Extract the text of the document with all markup removed.
clean_text = soup.get_text()
print(clean_text)

In [None]:
# NFKC-normalise the text, then collapse newline runs.
# NOTE(review): the second replace only sees triple newlines that remain after
# the first pass has already turned each "\n\n\n" into "\n" — confirm that this
# is the intended spacing.
clean_text = (
    normalize("NFKC", clean_text).replace("\n\n\n", "\n").replace("\n\n\n", "\n\n")
)

# clean_text = clean_text.split("\n")
texts = clean_text.split("\n")

# Trim leading/trailing whitespace from every line, in place.
for i in range(len(texts)):
    texts[i] = texts[i].strip()

print(texts)

In [None]:
# Re-assemble the stripped lines into one string and trim its ends.
clean_text = "\n".join(texts).strip()
print(clean_text)

### Method 2: Scraping ContentBody using Regular Expressions


In [None]:
# Matches (a) any HTML tag, including tags whose attributes wrap onto several
# lines ("[^>]" also matches newlines, unlike "."), and (b) named, decimal and
# hexadecimal character entities in either letter case (e.g. "&#xA0;", "&Eacute;").
CLEANR = re.compile(
    "<[^>]*>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", re.IGNORECASE
)
# CLEANR = re.compile('<p.*?>|</p>|</div>|<div.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')


def clean_html(raw_html):
    """Replace every HTML tag and character entity in ``raw_html`` with one space.

    Entities are blanked out rather than decoded, so ``"&amp;"`` becomes
    ``" "`` and not ``"&"``.
    """
    return CLEANR.sub(" ", raw_html)


# Run the regex cleaner on the raw HTML sample; this rebinds `clean_text`.
clean_text = clean_html(sample)
print(clean_text)

In [None]:
# Same normalise / collapse / strip steps as used for the BeautifulSoup output,
# applied here to the regex-cleaned text.
# NOTE(review): the second replace only sees triple newlines that remain after
# the first pass — confirm that this is the intended spacing.
clean_text = (
    normalize("NFKC", clean_text).replace("\n\n\n", "\n").replace("\n\n\n", "\n\n")
)

# clean_text = clean_text.split("\n")
texts = clean_text.split("\n")

# Trim leading/trailing whitespace from every line, in place.
for i in range(len(texts)):
    texts[i] = texts[i].strip()

print(texts)

In [None]:
# Re-join the stripped lines, trim the ends, then apply the two
# newline-collapsing replaces once more to the joined text.
clean_text = "\n".join(texts).strip().replace("\n\n\n", "\n").replace("\n\n\n", "\n\n")
print(clean_text)

### Method 3: Scraping ContentBody using Custom Functions


In [None]:
# Parse the sample again so the custom parser starts from a fresh tree.
soup = BeautifulSoup(sample, "lxml")

print(soup)

#### Version 1 - Strip text + Introduce formatting


In [None]:
# Works very well but prone to duplicated text for nested tags. Look into resolving non-ascii representation of characters
# TODO: Implement unicode normalisation when parsing text from HTML fragments


def parse_text(soup):
    """Flatten a parsed HTML fragment into plain text with light formatting.

    Every ``p``, ``h1``-``h6``, ``div``, ``ul``, ``ol`` and ``table`` element
    returned by ``soup.find_all`` is rendered in turn: paragraphs and divs as
    their stripped text, headings surrounded by blank lines, list items as
    ``"  - item"`` / ``"  1. item"``, and tables as ``" | "``-joined rows.
    After each element, the links it contains are appended as ``"title: url"``.

    Nested elements are each visited separately, so their text and links can
    appear more than once. The first ``<div>`` of ``soup`` is unwrapped in
    place, which modifies the caller's object.

    Args:
        soup: Parsed document exposing ``div`` and ``find_all``.

    Returns:
        The assembled text as a single string.
    """

    def process_links(tag):
        """Return ``"<title or link text>: <href>"`` for every ``<a>`` in ``tag``."""
        links = []

        for a in tag.find_all("a"):
            title = a.get("title") or a.text.strip()
            url = a.get("href")
            links.append(f"{title}: {url}")

        return links

    def process_table(table):
        """Render ``table`` as text: header line first, then one line per row."""
        table_rows = table.find_all("tr")
        if not table_rows:  # Empty table in All You Need to Know About Childhood Immunisations
            return ""

        # Iterating the first row walks its direct children, which can include
        # whitespace-only text nodes between the cells.
        headers = [header.get_text(strip=True) for header in table_rows[0]]
        # Drop only the empty entries. Filtering on `" " in header` would also
        # discard single-word headers such as "Age" and misalign the header line.
        headers = [header for header in headers if header]
        rows = []

        for row in table_rows[1:]:
            cols = row.find_all("td")
            cols = [ele.get_text(strip=True).replace("\xa0", " ") for ele in cols]
            rows.append(cols)

        table_text = []

        if headers:
            table_text.append(" | ".join(headers))

        for row in rows:
            table_text.append(" | ".join(row))

        return "\n".join(table_text)

    if soup.div is not None:
        soup.div.unwrap()

    # TODO: Implement unicode normalisation when parsing text from HTML fragments
    organized_text = []

    for elem in soup.find_all(
        ["p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "table", "ol"]
    ):
        if elem.name == "p":
            organized_text.append(elem.text.strip())

        elif elem.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            organized_text.append(f"\n\n{elem.text.strip()}\n")

        elif elem.name == "div":
            organized_text.append(elem.text.strip())

        elif elem.name == "ul":
            for li in elem.find_all("li"):
                organized_text.append(f"  - {li.text.strip()}")

        elif elem.name == "ol":
            for i, li in enumerate(elem.find_all("li"), 1):
                organized_text.append(f"  {i}. {li.text.strip()}")

        elif elem.name == "table":
            organized_text.append(process_table(elem))

        links = process_links(elem)
        if links:
            organized_text.append("\n")
            organized_text.extend(links)
            organized_text.append("\n")

    # Collapse blank lines, undo line breaks that are followed by fixed-width
    # indentation, and remove zero-width spaces.
    return (
        "\n".join(organized_text)
        .replace("\n\n", "\n")
        .replace("\n         ", "")
        .replace("\n      ", "")
        .replace("\u200b", "")
        .strip()
    )


# Show the formatted text; note that parse_text unwraps the first <div> of `soup` in place.
print(parse_text(soup))

#### Version 2 - Introduce Unicode Normalisation + Encode to ASCII + Decode back to UTF-8


In [None]:
# Works very well but prone to duplicated text for nested tags. Look into resolving non-ascii representation of characters
# TODO: Implement unicode normalisation when parsing text from HTML fragments


def clean_text(text):
    """Return ``text`` NFKC-normalised, reduced to ASCII and stripped.

    Characters that still have no ASCII form after normalisation are silently
    dropped, not transliterated.
    """
    normalised = normalize("NFKC", text)
    ascii_only = normalised.encode("ascii", "ignore")
    return ascii_only.decode("utf8").strip()


def process_links(tag):
    """Return one ``"<label>: <href>"`` string per ``<a>`` element inside ``tag``.

    The label is the anchor's ``title`` attribute when that is truthy,
    otherwise the anchor's stripped text. A missing ``href`` is rendered as
    the text ``None``.
    """
    return [
        f"{anchor.get('title') or anchor.text.strip()}: {anchor.get('href')}"
        for anchor in tag.find_all("a")
    ]


def process_table(table):
    """Render an HTML ``<table>`` as text, one ``" | "``-joined line per row.

    The first ``<tr>`` supplies the header line. Every later ``<tr>``
    contributes the cleaned text of its ``<td>`` cells.

    Args:
        table: Table element exposing ``find_all``.

    Returns:
        The rendered lines joined with newlines, or ``""`` when the table has
        no ``<tr>`` at all.
    """
    table_rows = table.find_all("tr")
    if not table_rows:  # Empty table in All You Need to Know About Childhood Immunisations
        return ""

    # Iterating the first row walks its direct children, which can include
    # whitespace-only text nodes between the cells; those clean to "".
    headers = [clean_text(header.get_text()) for header in table_rows[0]]
    # Drop only the empty entries. Filtering on `" " in header` would also
    # discard single-word headers such as "Age" and misalign the header line.
    headers = [header for header in headers if header]

    rows = [
        [clean_text(cell.get_text()) for cell in row.find_all("td")]
        for row in table_rows[1:]
    ]

    table_text = []

    if headers:
        table_text.append(" | ".join(headers))

    table_text.extend(" | ".join(row) for row in rows)

    return "\n".join(table_text)


def parse_text(soup):
    """Flatten a parsed HTML fragment into ASCII text with light formatting.

    Visits every ``p``, ``h1``-``h6``, ``div``, ``ul``, ``ol`` and ``table``
    element returned by ``soup.find_all`` and renders it with ``clean_text``,
    ``process_table`` and ``process_links``. Nested elements are each visited
    separately, so their text and links can appear more than once. The first
    ``<div>`` of ``soup`` is unwrapped in place.

    Returns:
        The assembled text as a single string.
    """
    heading_tags = ("h1", "h2", "h3", "h4", "h5", "h6")

    outer_div = soup.div
    if outer_div is not None:
        outer_div.unwrap()

    pieces = []

    for node in soup.find_all(["p", *heading_tags, "div", "ul", "table", "ol"]):
        kind = node.name
        node_text = clean_text(node.text)

        if kind in ("p", "div"):
            pieces.append(node_text)

        elif kind in heading_tags:
            pieces.append(f"\n\n{node_text}\n")

        elif kind == "ul":
            pieces.extend(
                f"  - {clean_text(item.text)}" for item in node.find_all("li")
            )

        elif kind == "ol":
            pieces.extend(
                f"  {number}. {clean_text(item.text)}"
                for number, item in enumerate(node.find_all("li"), start=1)
            )

        elif kind == "table":
            pieces.append(process_table(node))

        # Links found inside the element are listed after it, padded by blank lines.
        link_lines = process_links(node)
        if link_lines:
            pieces += ["\n\n", *link_lines, "\n\n"]

    joined = "\n".join(pieces)

    # Collapse triple newlines, then undo line breaks that are followed by
    # fixed-width indentation.
    for needle, replacement in (
        ("\n\n\n", "\n"),
        ("\n            ", ""),
        ("\n      ", ""),
    ):
        joined = joined.replace(needle, replacement)

    return joined.strip()


# Show the output of the normalising parser for the sample; parse_text unwraps the first <div> of `soup` in place.
print(parse_text(soup))

In [None]:
# Extract tables - Experimental


# Notes: Need to check if table exists
def extract_table(sample):
    """Parse every ``<table>`` in an HTML string into lists of row records.

    Each table found by ``pd.read_html`` (first row used as the header) becomes
    a list of ``{column: value}`` dicts. Every value is converted to ``str``,
    has zero-width spaces removed, and is NFKC-normalised.

    Args:
        sample: HTML markup as a string.

    Returns:
        A list with one list of record dicts per table, in document order, or
        ``None`` when ``pd.read_html`` returns no tables.
    """
    tables = pd.read_html(StringIO(sample), header=0)
    stores = []

    for table in tables:
        # Serialise the table of THIS iteration (not always the first one).
        # The JSON round-trip turns cell values into plain Python scalars.
        # orient="records" never includes the index, so `index=False` is not
        # passed: older pandas releases reject it with this orientation.
        records = json.loads(table.to_json(orient="records"))

        for record in records:
            for key, value in record.items():
                record[key] = normalize("NFKC", str(value).replace("\u200b", ""))

        stores.append(records)

    if not stores:
        return None

    return stores


# print(extract_table(sample))

In [None]:
# minified = htmlmin.minify(sample, remove_empty_space=True)
# print(minified)

In [None]:
# Parse HTML via BeautifulSoup

# Rebind `soup` to a fresh parse of the sample for the helper cells that follow.
soup = BeautifulSoup(sample, "lxml")

In [None]:
def extract_tags(soup):
    """Return the distinct tag names present in ``soup`` as a sorted list.

    Sorting makes the result reproducible: the iteration order of a set of
    strings can change from one interpreter run to the next.
    """
    return sorted({tag.name for tag in soup.find_all(True)})


# List the tag names that occur in the sample.
print(extract_tags(soup))

In [None]:
# Display all headers from article


def extract_headers(soup):
    """Return a ``(text, tag_name)`` tuple for every ``h1``-``h6`` element in ``soup``.

    Tuples are listed in the order ``soup.find_all`` yields the elements; the
    text is taken as-is, without stripping.
    """
    heading_names = ["h1", "h2", "h3", "h4", "h5", "h6"]
    return [(node.get_text(), node.name) for node in soup.find_all(heading_names)]


# Show the (text, tag) pair of every heading in the sample.
print(extract_headers(soup))

In [None]:
def extract_urls_list(soup):
    """Return ``(label, href)`` tuples for every ``<a>`` element in ``soup``.

    The label is the ``title`` attribute when the anchor has one (even if it
    is empty), otherwise the anchor's text. Returns ``None`` instead of an
    empty list when there are no anchors.
    """

    def _label(link):
        title = link.get("title")
        return link.get_text() if title is None else title

    url_records = [(_label(link), link.get("href")) for link in soup.find_all("a")]
    return url_records if url_records else None


# Show every (label, url) pair found in the sample, duplicates included.
print(extract_urls_list(soup))

In [None]:
def extract_urls_dict(soup):
    """Return a ``{label: href}`` mapping for the ``<a>`` elements in ``soup``.

    Each distinct ``href`` keeps the label of its first occurrence (``title``
    attribute when present, otherwise the anchor text). The mapping is then
    inverted, so when two URLs share a label the later one wins. Returns
    ``None`` instead of an empty dict when there are no anchors.
    """
    label_by_url = {}
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        title = anchor.get("title")
        # setdefault keeps the label recorded for the first occurrence of a URL.
        label_by_url.setdefault(href, anchor.get_text() if title is None else title)

    url_by_label = {label: href for href, label in label_by_url.items()}
    return url_by_label if url_by_label else None


# Show the label -> url mapping built from the sample.
print(extract_urls_dict(soup))

In [None]:
def extract_info(sample):
    """Extract tags, headings, links, tables and text from one HTML cell value.

    Args:
        sample: HTML markup as a string, or a missing value.

    Returns:
        ``None`` when ``sample`` is missing (NaN, ``None`` or ``pd.NA``).
        Otherwise the tuple ``(tags, headers, urls, tables, text)``, where
        ``tables`` is ``None`` if the markup has no ``<table>`` or if table
        extraction failed.
    """
    # pd.isna covers every missing-value representation; an identity test
    # against np.nan only matches that one specific object.
    if pd.isna(sample):
        return None

    soup = BeautifulSoup(sample, "lxml")

    tags = extract_tags(soup)
    headers = extract_headers(soup)
    urls = extract_urls_list(soup)

    tables = None
    if soup.find_all("table"):
        try:
            tables = extract_table(sample)
            # tables_str = json.dumps(tables)
        except Exception as e:
            # Best effort: a table that cannot be parsed is reported and
            # skipped so the rest of the article is still processed.
            print(e)

    # parse_text modifies `soup` in place, so it runs after the other extractors.
    text = parse_text(soup)
    return tags, headers, urls, tables, text


# Run the whole extraction on the sample and print what extract_info returns.
print(extract_info(sample))

In [None]:
def write_to_txt(dir_path, fname, text):
    """Write ``text`` to ``<dir_path>/<fname>.txt``, creating ``dir_path`` if needed.

    The file is always encoded as UTF-8, so the result does not depend on the
    platform's default encoding. An ``OSError`` (for example a file name that
    is too long) is printed rather than raised, so one unwritable file does
    not abort a batch run.
    """
    fpath = f"{dir_path}/{fname}.txt"

    try:
        os.makedirs(dir_path, exist_ok=True)
        with open(fpath, "w", encoding="utf-8") as f:
            f.write(text)
    except OSError as error:
        print(error)

In [None]:
def process_xlsx(archive, xlsx_file, dir_path):
    """Extract the text of every article in one workbook and save it as ``.txt``.

    The workbook is read straight from ``archive``. For each row that has a
    content body, the extracted text is written to
    ``<dir_path>/<workbook name>/<article name>.txt``.

    Args:
        archive: Open archive exposing ``open(member_name)``.
        xlsx_file: Member name of the workbook inside ``archive``.
        dir_path: Root directory for the generated text files.

    Raises:
        IndexError: If the workbook has no content-body column.
        KeyError: If the workbook has no ``"Content.Name"`` column.
    """
    # Strip only the final extension, so dots elsewhere in the path are kept.
    dname = os.path.splitext(os.path.basename(xlsx_file))[0]
    output_dir = f"{dir_path}/{dname}"
    print("\n\n", output_dir)

    # Close the archive member as soon as the sheet has been read.
    with archive.open(xlsx_file) as file:
        df = pd.read_excel(file)
    df_processed = df.dropna(axis="columns", how="all")

    # "Content_Body" instead of "ContentBody" in export-published-live-healthy-articles_14062024_data.xlsx
    # "Content_x0020_Body" instead of "ContentBody" in export-published-programs_14062024_data.xlsx
    col = df_processed.columns[
        df_processed.columns.str.contains("ContentBody")
        | df_processed.columns.str.contains("Content_Body")
        | df_processed.columns.str.contains("Content_x0020_Body")
    ][0]
    article_names = df_processed["Content.Name"]
    raw_htmls = df_processed[col]

    # Pair names and bodies by position so the loop does not rely on index labels.
    for article_name, sample in zip(article_names, raw_htmls):
        # Some articles use slashes for medications, which would be read as
        # path separators. Names are also stripped: padding whitespace can push
        # the file name past the OS length limit.
        fname = str(article_name).replace("/", "-").strip()
        print(fname)

        # pd.notna treats every missing-value representation as missing.
        if pd.notna(sample):
            text = extract_info(sample)[-1]
            write_to_txt(output_dir, fname, text)

In [None]:
# Function to extract xlsx files from archive


def create_archive(zip_path):
    """Return a read-mode ``zipfile.ZipFile`` for the archive at ``zip_path``.

    Repeats the helper of the same name defined earlier in this notebook; the
    returned handle is left open for the caller to close.
    """
    opened = zipfile.ZipFile(zip_path, mode="r")
    return opened


def get_xlsx_from_archive(archive):
    """List the members of ``archive`` whose last dot-separated part is ``xlsx``.

    Repeats the helper of the same name defined earlier in this notebook.
    Matching is case-sensitive and keeps the archive's own ordering.
    """
    xlsx_members = []
    for member in archive.namelist():
        extension = member.split(".")[-1]
        if extension == "xlsx":
            xlsx_members.append(member)
    return xlsx_members


# Quick check of the two helpers; the archive handle created here is not kept or closed.
print(get_xlsx_from_archive(create_archive("../data/GenAI - Full Content Export.zip")))

In [None]:
def main():
    """Convert every workbook in the content export archive to text files.

    Reads ``../data/GenAI - Full Content Export.zip`` and writes the results
    under ``../data/processed``, one sub-directory per workbook. Both paths
    are relative to the current working directory.
    """
    archive_path = "../data/GenAI - Full Content Export.zip"
    output_dir_path = "../data/processed"

    # ZipFile is a context manager, so the archive is closed even if a
    # workbook fails part-way through.
    with create_archive(archive_path) as archive:
        for xlsx_file in get_xlsx_from_archive(archive):
            process_xlsx(archive, xlsx_file, output_dir_path)

In [None]:
# Run the full export; the problems observed during a run are noted in the comments below.
main()

# # Errors:
# invalid literal for int() with base 10: 'h2'
# [Errno 63] File name too long: '../data/processed/export-published-live-healthy-articles_14062024_data/                                                                                                                                                                                                         Books for your growing child (Toddler and Preschooler).txt'
# No tables found matching pattern '.+'
# /var/folders/3n/y5_h0fxs0bv2mhb7bf_fpmb80000gn/T/ipykernel_53004/2874476162.py:5: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.
#   soup = BeautifulSoup(sample, 'lxml')

### Method 4: Convert HTML to Markdown


In [None]:
# !pip install markdownify

In [None]:
# Convert HTML to Markdown
# Convert the raw HTML sample to Markdown with markdownify's default options.
markdown_text = markdownify(sample)

# Display converted text
# Each "\n\n\n" is replaced by "\n\n" in a single pass before printing.
print(markdown_text.replace("\n\n\n", "\n\n").strip())

# END


---
