# Preprocessing

In [1]:
import os
import pickle
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Directories: raw crawl pickles are read from DATA_PATH, cleaned pickles are
# written to CLEAN_DATA_PATH.
DATA_PATH = os.path.join("..", "data", "healthhub_small")
CLEAN_DATA_PATH = os.path.join("..", "data", "healthhub_small_clean")


def _raw_path(filename):
    """Return the path of `filename` inside the raw data directory."""
    return os.path.join(DATA_PATH, filename)


def _clean_path(filename):
    """Return the path of `filename` inside the cleaned data directory."""
    return os.path.join(CLEAN_DATA_PATH, filename)


# Inputs: one pickle per metadata field.
LOADED_CHUNK_ID_LIST_PATH = _raw_path("healthhub_chunk_id_list_small.pkl")
LOADED_SOURCE_LIST_PATH = _raw_path("healthhub_source_list_small.pkl")
LOADED_DOMAIN_LIST_PATH = _raw_path("healthhub_domain_list_small.pkl")
LOADED_TITLE_LIST_PATH = _raw_path("healthhub_title_list_small.pkl")
LOADED_CONTRIBUTOR_LIST_PATH = _raw_path("healthhub_contributor_list_small.pkl")
LOADED_CONTENT_LIST_PATH = _raw_path("healthhub_content_list_small.pkl")

# Outputs: the same fields after cleaning, plus the derived category list.
CLEANED_CHUNK_ID_LIST_PATH = _clean_path("healthhub_chunk_id_list_small_clean.pkl")
CLEANED_SOURCE_LIST_PATH = _clean_path("healthhub_source_list_small_clean.pkl")
CLEANED_DOMAIN_LIST_PATH = _clean_path("healthhub_domain_list_small_clean.pkl")
CLEANED_TITLE_LIST_PATH = _clean_path("healthhub_title_list_small_clean.pkl")
CLEANED_CONTRIBUTOR_LIST_PATH = _clean_path(
    "healthhub_contributor_list_small_clean.pkl"
)
CLEANED_CONTENT_LIST_PATH = _clean_path("healthhub_content_list_small_clean.pkl")
CLEANED_CATEGORY_LIST_PATH = _clean_path("healthhub_category_list_small_clean.pkl")

In [3]:
# Create the directory that will receive the cleaned pickles; `exist_ok=True`
# means an already-existing directory is not an error, so the cell can be re-run.
os.makedirs(CLEAN_DATA_PATH, exist_ok=True)

## Load Metadata

In [4]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    produced by a trusted pipeline.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Each pickle holds one list; the lists are combined column-wise further down.
loaded_chunk_id = _read_pickle(LOADED_CHUNK_ID_LIST_PATH)  # list of chunk ids
loaded_source = _read_pickle(LOADED_SOURCE_LIST_PATH)  # list of hyperlinks
loaded_domain = _read_pickle(LOADED_DOMAIN_LIST_PATH)  # website domain
loaded_title = _read_pickle(LOADED_TITLE_LIST_PATH)  # title each chunk belongs to
loaded_contributor = _read_pickle(LOADED_CONTRIBUTOR_LIST_PATH)  # list of contributors
loaded_content = _read_pickle(LOADED_CONTENT_LIST_PATH)  # list of chunks of contents

## Create Dataframe

In [5]:
# Combine the loaded lists column-wise into a single frame (one list per column).
df = pd.DataFrame(
    data=dict(
        chunk_id=loaded_chunk_id,
        doc_source=loaded_source,
        doc_domain=loaded_domain,
        doc_title=loaded_title,
        contributor=loaded_contributor,
        text=loaded_content,
    )
)

print(df.shape)
df.head()

(7121, 6)


Unnamed: 0,chunk_id,doc_source,doc_domain,doc_title,contributor,text
0,web_crawl_76e2f466-d0b1-58bb-a697-84f5569cd801_1,https://www.healthhub.sg/a-z/medications/proch...,healthhub.sg,prochlorperazine,Pharmaceutical Society of Singapore,HOME\n\r\n A-Z\r\n \n A\n A\n A\nProchlorperaz...
1,web_crawl_76e2f466-d0b1-58bb-a697-84f5569cd801_2,https://www.healthhub.sg/a-z/medications/proch...,healthhub.sg,prochlorperazine,Pharmaceutical Society of Singapore,What precautions should I take?\nInform your h...
2,web_crawl_76e2f466-d0b1-58bb-a697-84f5569cd801_3,https://www.healthhub.sg/a-z/medications/proch...,healthhub.sg,prochlorperazine,Pharmaceutical Society of Singapore,Avoid drinking alcohol while taking this medic...
3,web_crawl_76e2f466-d0b1-58bb-a697-84f5569cd801_4,https://www.healthhub.sg/a-z/medications/proch...,healthhub.sg,prochlorperazine,Pharmaceutical Society of Singapore,Keep this medication away from children.\nHow ...
4,web_crawl_7c5274b5-4c19-57c2-a5b6-08fc77cf9189_1,https://www.healthhub.sg/programmes/parent-hub...,healthhub.sg,positive-parenting-programme,Health Promotion Board,HOME\n\r\n PROGRAMMES\r\n \n A\n A\n A\nParent...


## Remove Duplicate Chunks (same content reached through links that differ only in capitalisation)

In [None]:
df["lowercase_doc_source"] = df["doc_source"].str.lower()

# Group by the lowercased `doc_source` and count the distinct original spellings.
# Note: more than one spelling means the same page was crawled under several
# case variants of its URL, i.e. its chunks are duplicated.
dup_df = (
    df.groupby("lowercase_doc_source")["doc_source"].apply(set).apply(len).reset_index()
)

# Lowercased URLs that occur under more than one spelling
unique_dup_doc = dup_df[dup_df["doc_source"] > 1]["lowercase_doc_source"].unique()

# Decide which single spelling survives for every lowercased URL:
#   * the all-lowercase spelling when it exists, otherwise
#   * the first spelling encountered.
# Dropping every row whose URL is "not lowercase" would delete a document
# entirely when none of its spellings happens to be fully lowercase.
is_lowercase_variant = df["doc_source"] == df["lowercase_doc_source"]
first_seen_variant = df.groupby("lowercase_doc_source")["doc_source"].first()
has_lowercase_variant = is_lowercase_variant.groupby(df["lowercase_doc_source"]).any()
canonical_doc_source = first_seen_variant.mask(
    has_lowercase_variant, first_seen_variant.index.to_series()
)

# Rows to drop: the URL has several spellings and this row is not the kept one
kept_variant_per_row = df["lowercase_doc_source"].map(canonical_doc_source)
to_drop_indices = df[
    df["lowercase_doc_source"].isin(unique_dup_doc)
    & (df["doc_source"] != kept_variant_per_row)
].index

print(f"Number of rows to drop: {len(to_drop_indices)}")

# Filter only those rows which are not in `to_drop_indices`
df = df[~df.index.isin(to_drop_indices)].reset_index(drop=True)

print(df.shape)

print(f'Number of rows to drop: {df.duplicated(subset=["text", "doc_source"]).sum()}')

# Remove rows whose text and doc_source are both identical to an earlier row
df = df.drop_duplicates(subset=["text", "doc_source"]).reset_index(drop=True)

print(df.shape)
df.head()

In [None]:
# From here on the lowercased URL is the canonical `doc_source`; the helper
# column is no longer needed.
df["doc_source"] = df["lowercase_doc_source"]
df = df.drop(columns="lowercase_doc_source")

# The category is the 4th "/"-separated piece of the URL,
# e.g. "https://www.healthhub.sg/a-z/..." -> "a-z".
df["category"] = df["doc_source"].apply(lambda url: url.split("/")[3])

## Clean Texts

In [None]:
def clean_data(text):
    """Strip page boilerplate from a crawled chunk and normalise its spacing.

    Steps, in order:
      1. Cut the text at the first occurrence of each trailing marker
         (the marker and everything after it are discarded).
      2. Discard everything up to and including the " A\\n A\\n A\\n" block,
         when that block is present.
      3. Apply a fixed sequence of regex substitutions: drop the
         "CONTRIBUTED BY" line, split merged words, add a space after
         sentence punctuation, turn line breaks into spaces and collapse
         runs of spaces.

    Args:
        text: Raw chunk text.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    # Markers that introduce trailing boilerplate; checked in this order.
    trailing_markers = (
        "Read these next",
        "Download the HealthHub app",
        "close\nclose\nclose",
        "Related Articles",
    )
    for marker in trailing_markers:
        # partition() returns the whole string as the head when the marker is absent
        text = text.partition(marker)[0]

    # Keep only what follows the " A\n A\n A\n" block, if it is there
    _, found, remainder = text.partition(" A\n A\n A\n")
    if found:
        text = remainder

    substitutions = (
        # Remove "CONTRIBUTED BY ..." together with the line that follows it
        (r"CONTRIBUTED BY\n\r\n.*\n*", ""),
        # Fix merged words (e.g. Breast Cancer *ScreeningBeyond* the recommended…)
        # Side effect: also splits names such as 'MediShield' -> 'Medi Shield'
        (r"(?<=\w)([A-Z][a-z]+)", r" \1"),
        # Add a space between punctuation and a following capital
        # (e.g. "done.Next" -> "done. Next")
        (r"([:.!?)])([A-Z])", r"\1 \2"),
        # Replace \n and \r with spaces
        (r"[\n\r]", " "),
        # Collapse runs of spaces
        (r" {2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

## Removal of stopwords

In [None]:
# Stop-word sets keyed by language, filled on first use so the set is not
# rebuilt from the NLTK corpus for every row passed through `.apply`.
_STOP_WORDS_BY_LANGUAGE: dict = {}


def _get_stop_words(language: str = "english") -> frozenset:
    """Return the NLTK stop-word list for `language` as a cached frozenset."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stopwords(text: str) -> str:
    """Remove English stop words from `text`.

    The text is tokenised with NLTK's `word_tokenize`; tokens whose lowercase
    form is in the English stop-word list are dropped and the remaining tokens
    are joined with single spaces.

    NOTE(review): no `nltk.download(...)` call is visible in this notebook, so
    the stop-word corpus and tokenizer data presumably have to be installed
    beforehand.

    Args:
        text: Text to filter.

    Returns:
        The space-joined tokens that are not stop words ("" if none remain).
    """
    stop_words = _get_stop_words()
    return " ".join(
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )

In [None]:
# Clean every chunk first, then strip stop words from the cleaned text.
cleaned_text = df["text"].apply(clean_data)
df["text"] = cleaned_text.apply(remove_stopwords)

## Drop Rows that have Empty Text

In [None]:
# Keep only the rows whose cleaned text is not the empty string.
has_text = df["text"].ne("")
df = df.loc[has_text]

print(df.shape)
df.head()

In [None]:
# Unpack the cleaned frame back into parallel lists, one per column
# (same order as the column names below).
(
    loaded_chunk_id,
    loaded_source,
    loaded_domain,
    loaded_title,
    loaded_contributor,
    loaded_content,
    loaded_category,
) = (
    df[column].tolist()
    for column in (
        "chunk_id",
        "doc_source",
        "doc_domain",
        "doc_title",
        "contributor",
        "text",
        "category",
    )
)

In [None]:
# Sanity check: print the length of every list on one line (space-separated).
print(
    *(
        len(values)
        for values in (
            loaded_chunk_id,
            loaded_source,
            loaded_domain,
            loaded_title,
            loaded_contributor,
            loaded_content,
            loaded_category,
        )
    )
)

6761 6761 6761 6761 6761 6761 6761


In [None]:
# Write each cleaned list to its own pickle file.
for target_path, values in (
    (CLEANED_CHUNK_ID_LIST_PATH, loaded_chunk_id),
    (CLEANED_SOURCE_LIST_PATH, loaded_source),
    (CLEANED_DOMAIN_LIST_PATH, loaded_domain),
    (CLEANED_TITLE_LIST_PATH, loaded_title),
    (CLEANED_CONTRIBUTOR_LIST_PATH, loaded_contributor),
    (CLEANED_CONTENT_LIST_PATH, loaded_content),
    (CLEANED_CATEGORY_LIST_PATH, loaded_category),
):
    with open(target_path, "wb") as file:
        pickle.dump(values, file)