# Embeddings (Neural Network-based Contextual Embeddings)

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from alive_progress import alive_bar
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from transformers.models.bert import BertTokenizerFast

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Parameters
# Notebook-level settings read by the cells below.
# NOTE(review): the "# Parameters" header plus annotated one-line assignments
# look like a papermill-style parameters cell — confirm before restructuring.
# Rows are kept only when df["contributor"] equals this value.
CONTRIBUTOR: str = "Health Promotion Board"
# Compared against the 4th "/"-separated piece of each doc_source URL.
CATEGORY: str = "live-healthy"
# Embedding model to load; also embedded in the output file and column names.
MODEL_NAME: str = "all-MiniLM-L6-v2"
# Forwarded to pool_embeddings, which accepts "mean" or "max".
POOLING_STRATEGY: str = "max"

In [None]:
# Directory holding the cleaned HealthHub artefacts, relative to this notebook.
CLEAN_DATA_PATH = os.path.join("..", "data", "healthhub_small_clean")


def _in_clean_dir(filename: str) -> str:
    """Return the path of ``filename`` inside ``CLEAN_DATA_PATH``."""
    return os.path.join(CLEAN_DATA_PATH, filename)


# Input pickles: one file per metadata field.
CLEANED_CHUNK_ID_LIST_PATH = _in_clean_dir("healthhub_chunk_id_list_small_clean.pkl")
CLEANED_SOURCE_LIST_PATH = _in_clean_dir("healthhub_source_list_small_clean.pkl")
CLEANED_DOMAIN_LIST_PATH = _in_clean_dir("healthhub_domain_list_small_clean.pkl")
CLEANED_TITLE_LIST_PATH = _in_clean_dir("healthhub_title_list_small_clean.pkl")
CLEANED_CONTRIBUTOR_LIST_PATH = _in_clean_dir(
    "healthhub_contributor_list_small_clean.pkl"
)
CLEANED_CONTENT_LIST_PATH = _in_clean_dir("healthhub_content_list_small_clean.pkl")
CLEANED_CATEGORY_LIST_PATH = _in_clean_dir("healthhub_category_list_small_clean.pkl")

# Output parquet: the file name encodes the model (with "/" replaced by "_")
# and the pooling strategy.
CLEANED_EMBEDDING_LIST_PATH = _in_clean_dir(
    f"healthhub_{MODEL_NAME.replace('/','_')}_{POOLING_STRATEGY}_embeddings_small_clean.parquet"
)

## Load Metadata

In [None]:
def _read_pickle(path: str):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE(review): pickle.load runs arbitrary code from the file; only use it
    # on files this project produced itself.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# One pickle per metadata field (descriptions kept from the original notebook).
loaded_chunk_id = _read_pickle(CLEANED_CHUNK_ID_LIST_PATH)  # list of chunk ids
loaded_source = _read_pickle(CLEANED_SOURCE_LIST_PATH)  # list of hyperlinks
loaded_domain = _read_pickle(CLEANED_DOMAIN_LIST_PATH)  # website domain
loaded_title = _read_pickle(CLEANED_TITLE_LIST_PATH)  # title each chunk belongs to
loaded_contributor = _read_pickle(CLEANED_CONTRIBUTOR_LIST_PATH)  # list of contributors
loaded_content = _read_pickle(CLEANED_CONTENT_LIST_PATH)  # list of chunks of contents
loaded_category = _read_pickle(CLEANED_CATEGORY_LIST_PATH)  # list of categories

## Create Dataframe

In [None]:
# Assemble the loaded metadata lists into one dataframe (one row per chunk).
df = pd.DataFrame(
    {
        "chunk_id": loaded_chunk_id,
        "doc_source": loaded_source,
        "doc_domain": loaded_domain,
        "doc_title": loaded_title,
        "contributor": loaded_contributor,
        "text": loaded_content,
        "category": loaded_category,
    }
)

# Keep only rows from the selected contributor whose URL carries the selected
# category as its 4th "/"-separated piece (index 3).
# NOTE(review): for URLs shaped like "scheme://host/category/..." that piece is
# the first path segment — confirm doc_source always has this shape.
matches_contributor = df["contributor"] == CONTRIBUTOR
# `.str[3]` yields NaN for URLs with fewer than four pieces, so such rows are
# simply excluded instead of raising IndexError; the comparison always
# produces a boolean mask, even when no row matches the contributor.
url_category = df["doc_source"].str.split("/").str[3]
matches_category = url_category == CATEGORY

df = df[matches_contributor & matches_category].reset_index(drop=True)

print(df.shape)
df.head()

In [None]:
# Sanity check: print the length of every loaded list on one line, in the
# order content, chunk_id, source, domain, title, contributor, category.
loaded_lists = (
    loaded_content,
    loaded_chunk_id,
    loaded_source,
    loaded_domain,
    loaded_title,
    loaded_contributor,
    loaded_category,
)
print(*map(len, loaded_lists))

In [None]:
# Give every row the full text of its article: all chunk texts that share the
# row's doc_source, joined with single spaces in dataframe order.
df["combined_text"] = None

with alive_bar(df["doc_source"].nunique(), force_tty=True) as bar:
    for source in df["doc_source"].unique():
        # Rows that belong to the current article.
        in_article = df["doc_source"] == source
        article_text = " ".join(df.loc[in_article, "text"].values)
        df.loc[in_article, "combined_text"] = article_text
        bar()

df

In [None]:
# Load the embedding model and its tokenizer
sentence_transformer = SentenceTransformer(MODEL_NAME)

# Reuse the tokenizer that belongs to the loaded model rather than resolving
# MODEL_NAME a second time through a different loader: the token counts used
# for chunking then come from the same tokenizer the encoder uses.
tokenizer = sentence_transformer.tokenizer

# Token budget per chunk, taken from the model configuration
# (256 according to the original notebook for the default model).
max_length = sentence_transformer.max_seq_length

In [None]:
def split_into_chunks(
    sentences: list[str], max_length: int, tokenizer: BertTokenizerFast
) -> list[str]:
    """Greedily pack consecutive sentences into chunks within a token budget.

    Sentences are processed in order. Each one is tokenized on its own and
    added to the current chunk; when adding it would push the running token
    count above ``max_length``, the current chunk is closed first and the
    sentence starts a new chunk.

    Args:
        sentences: Sentences in reading order.
        max_length: Maximum running token count allowed per chunk.
        tokenizer: Tokenizer called as ``tokenizer(sentence,
            return_tensors="pt")``; the token count is read from
            ``["input_ids"].shape[1]`` of its result.

    Returns:
        The chunks, each the space-joined sentences it contains, in order.
        An empty ``sentences`` list yields an empty list.

    Note:
        A single sentence whose own token count exceeds ``max_length`` is not
        split; it becomes a chunk of its own that is over budget.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence on its own to learn how many tokens it costs.
        # NOTE(review): this count presumably includes any special tokens the
        # tokenizer adds per call, which makes the budget conservative — confirm.
        encoded_sentence = tokenizer(sentence, return_tensors="pt")
        num_tokens = encoded_sentence["input_ids"].shape[1]

        # Close the current chunk when this sentence would overflow it. The
        # `current_chunk` guard prevents emitting an empty "" chunk when the
        # overflowing sentence is the first one in the chunk.
        if current_chunk and current_length + num_tokens > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += num_tokens

    # Add the last chunk if any
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def pool_embeddings(embeddings: np.ndarray, strategy: str = "mean") -> np.ndarray:
    """Pool a stack of embeddings into one by reducing along the first axis.

    Args:
        embeddings: Embeddings stacked along axis 0. Either an array or a
            sequence of equally shaped arrays (for example a list with one
            vector per chunk).
        strategy: ``"mean"`` for the element-wise average or ``"max"`` for
            the element-wise maximum.

    Returns:
        The result of reducing ``embeddings`` along axis 0.

    Raises:
        ValueError: If ``embeddings`` is ``None`` or empty, or if ``strategy``
            is neither ``"mean"`` nor ``"max"``.
    """
    # `not embeddings` is ambiguous for multi-element arrays, so emptiness is
    # checked through the element count, which works for lists and arrays alike.
    if embeddings is None or np.size(embeddings) == 0:
        raise ValueError("The embeddings are empty.")

    stacked = np.asarray(embeddings)

    if strategy == "mean":
        return np.mean(stacked, axis=0)
    elif strategy == "max":
        return np.max(stacked, axis=0)
    else:
        raise ValueError(
            "Pooling strategy not recognized. The strategy must be either 'mean' or 'max'."
        )

## Generate Embeddings

In [None]:
# One slot per dataframe row, filled by row position. The list is later
# assigned to df positionally, so filling it per row (instead of appending
# per source) keeps each embedding on its own article's rows even when the
# rows of one article are not contiguous in df.
article_embeddings = [None] * len(df)

with alive_bar(df["doc_source"].nunique(), force_tty=True) as bar:
    for source in df["doc_source"].unique():
        # Positions (0-based row numbers) of the rows belonging to this article
        row_positions = np.flatnonzero((df["doc_source"] == source).to_numpy())

        # All rows of one article carry the same combined_text; read the first
        combined_text = df["combined_text"].iat[row_positions[0]]

        # Step 1: Split the article into sentences
        sentences = sent_tokenize(combined_text)

        # Step 2: Tokenize sentences and pack them into chunks within max_length
        chunks = split_into_chunks(sentences, max_length, tokenizer)

        # Step 3: Encode each chunk to get their embeddings
        chunk_embeddings = [sentence_transformer.encode(chunk) for chunk in chunks]

        # Step 4: Aggregate chunk embeddings to form a single embedding for the entire article
        article_embedding = pool_embeddings(chunk_embeddings, strategy=POOLING_STRATEGY)

        # Every row of the article shares the article-level embedding
        for position in row_positions:
            article_embeddings[position] = article_embedding

        bar()

In [None]:
# Name the embedding column after the model and the pooling strategy.
embedding_col = f"{MODEL_NAME}_{POOLING_STRATEGY}_embeddings"

# Attach the per-row embeddings, then keep only the first row of each article.
df[embedding_col] = article_embeddings
df = df.drop_duplicates(subset="doc_source").reset_index(drop=True)

# Drop the last "_"-separated piece of chunk_id; an id without any "_"
# becomes an empty string.
df["chunk_id"] = df["chunk_id"].map(lambda chunk_id: chunk_id.rpartition("_")[0])
df

In [None]:
# Convert the dataframe to an Arrow table and write it to the embeddings
# parquet path.
pq.write_table(pa.Table.from_pandas(df), CLEANED_EMBEDDING_LIST_PATH)