## Import Relevant Libraries

In [None]:
import math
import re
import string

import numpy as np
import pandas as pd
import plotly.express as px
import tiktoken

## Load Dataframe

In [None]:
# Read the merged article dataset stored in the `data` directory
merged_data = pd.read_parquet(path="../data/merged_data.parquet")

display(merged_data)  # noqa: F821

In [None]:
# Keep only the rows whose `to_remove` flag is False
df_keep = merged_data.loc[~merged_data["to_remove"]]

display(df_keep)  # noqa: F821

In [None]:
# Content categories that are kept for the analysis
relevant_categories = [
    "cost-and-financing",
    "live-healthy-articles",
    "diseases-and-conditions",
    "medical-care-and-facilities",
    "support-group-and-others",
]

# Drop every article whose category is not in the list above
df_keep = df_keep.loc[df_keep["content_category"].isin(relevant_categories)]

display(df_keep)  # noqa: F821

In [None]:
# Select the columns used by the token and cost analysis.
# The explicit `.copy()` makes `df_extracted` an independent DataFrame, so the
# columns assigned to it in later cells are not writes onto a slice of `df_keep`
# (which is what triggers pandas' SettingWithCopyWarning).
df_extracted = df_keep[
    [
        "id",
        "content_name",
        "title",
        "article_category_names",
        "full_url",
        "friendly_url",
        "category_description",
        "content_category",
        "content_body",
        "pr_name",
        # Columns currently left out of the selection:
        # "has_table",
        # "has_image",
        # "related_sections",
        # "extracted_tables",
        # "extracted_links",
        # "extracted_headers",
        # "extracted_img_alt_text",
        "extracted_content_body",
    ]
].copy()

display(df_extracted)  # noqa: F821

## Calculate total number of Tokens using `tiktoken`

In [None]:
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens `string` encodes to under the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)


def calculate_word_count(text):
    """Return the number of words in `text`.

    Hyperlinks (``http://`` / ``https://``) are removed first. The remaining text
    is cut into sentences at newlines and at ``.``, ``!`` or ``?``; each sentence
    is split into whitespace-separated tokens, ASCII punctuation is stripped from
    every token, and the tokens that are still non-empty are counted.

    Args:
        text: The text to analyse.

    Returns:
        The word count as an int (0 for empty or punctuation-only text).
    """
    # Matches any single ASCII punctuation character
    punctuation = re.compile("[%s]" % re.escape(string.punctuation))

    # Remove all hyperlinks so that URL fragments are not counted as words
    without_links = re.sub(r"https?:\/\/[^\s]+", "", text)

    num_words = 0
    for line in without_links.split("\n"):
        for sentence in re.split(r"[.!?]", line):
            # str.split() with no argument splits on any run of whitespace, so
            # tabs and non-breaking spaces separate words just like spaces do
            # (splitting on " " alone would glue such words together).
            for token in sentence.split():
                # Tokens made up solely of punctuation become empty: skip them
                if punctuation.sub("", token):
                    num_words += 1

    return num_words


# Token count of every article's extracted body (default encoding)
df_extracted.loc[:, "num_content_tokens"] = df_extracted[
    "extracted_content_body"
].apply(num_tokens_from_string)

# Word count of the same text
df_extracted.loc[:, "word_count"] = df_extracted["extracted_content_body"].apply(
    calculate_word_count
)

# Show the articles with the most tokens first
display(  # noqa: F821
    df_extracted.sort_values(by="num_content_tokens", ascending=False)
)

In [None]:
# Sum of the per-article token counts over all extracted articles
total_tokens_across_articles = df_extracted["num_content_tokens"].sum()
print(f"Total Tokens across articles: {total_tokens_across_articles}")

In [None]:
# Histogram of the per-article token counts across all extracted articles
fig = px.histogram(df_extracted, x="num_content_tokens", nbins=80)
fig.update_layout(
    title_text="Token Count distribution for Extracted Article Content",
    xaxis_title_text="Token Count",
    yaxis_title_text="Count",
    bargap=0.1,
)
# Disabled threshold markers; `lower_threshold` / `upper_threshold` are not
# defined in the cells visible here.
# fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
# fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Token-count quantiles, from the extreme low tail through the median to the
# extreme high tail.
# NOTE(review): 0.021 is not the mirror image of 0.977 (1 - 0.977 = 0.023),
# unlike the 0.001/0.999 and 0.159/0.841 pairs — confirm 0.021 is intended.
print(
    df_extracted["num_content_tokens"].quantile(
        [0.001, 0.021, 0.159, 0.5, 0.841, 0.977, 0.999]
    )
)

In [None]:
# Natural log of the token counts, stored in a new `log_num_tokens` column.
# NOTE(review): np.log gives -inf for a token count of 0 — confirm no article
# has an empty extracted body.
df_extracted.loc[:, "log_num_tokens"] = np.log(df_extracted["num_content_tokens"])

In [None]:
# Histogram of the log token counts across all extracted articles
fig = px.histogram(df_extracted, x="log_num_tokens", nbins=80)
fig.update_layout(
    title_text="Log Token Count distribution for Extracted Article Content",
    # The x values are log token counts, so the axis is labelled accordingly
    xaxis_title_text="Log Token Count",
    yaxis_title_text="Count",
    bargap=0.1,
)
# Disabled threshold markers; `lower_threshold` / `upper_threshold` are not
# defined in the cells visible here.
# fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
# fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Box plot of the log token counts, one box per content category
fig = px.box(df_extracted, x="log_num_tokens", color="content_category")
fig.update_layout(
    title_text="Box Plot for Log Token Count for Extracted Article Content",
    xaxis_title_text="Log Token Count",
)
fig.show()

In [None]:
# Articles whose `pr_name` is "Health Promotion Board"
df_hpb = df_extracted.loc[df_extracted["pr_name"] == "Health Promotion Board"]

# Show the articles with the most tokens first
display(df_hpb.sort_values(by="num_content_tokens", ascending=False))  # noqa: F821

In [None]:
# Histogram of the per-article token counts, restricted to the `df_hpb` subset
fig = px.histogram(df_hpb, x="num_content_tokens", nbins=80)
fig.update_layout(
    title_text="Token Count distribution for Extracted Article Content",
    xaxis_title_text="Token Count",
    yaxis_title_text="Count",
    bargap=0.1,
)
# Disabled threshold markers; `lower_threshold` / `upper_threshold` are not
# defined in the cells visible here.
# fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
# fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Histogram of the log token counts, restricted to the `df_hpb` subset
fig = px.histogram(df_hpb, x="log_num_tokens", nbins=80)
fig.update_layout(
    # Title and x-axis label name the plotted quantity (log token count),
    # matching the equivalent all-articles plot
    title_text="Log Token Count distribution for Extracted Article Content",
    xaxis_title_text="Log Token Count",
    yaxis_title_text="Count",
    bargap=0.1,
)
# Disabled threshold markers; `lower_threshold` / `upper_threshold` are not
# defined in the cells visible here.
# fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
# fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Box plot of the log token counts for the `df_hpb` subset, one box per
# content category
fig = px.box(df_hpb, x="log_num_tokens", color="content_category")
fig.update_layout(
    title_text="Box Plot for Log Token Count for Extracted Article Content",
    xaxis_title_text="Log Token Count",
)
fig.show()

In [None]:
# Word count against token count for the `df_hpb` subset, coloured by content
# category, with a single OLS trendline fitted over all points
fig = px.scatter(
    df_hpb,
    x="num_content_tokens",
    y="word_count",
    color="content_category",
    trendline="ols",
    trendline_scope="overall",
)
fig.update_layout(
    title_text="Scatterplot of Token Count with respect to Word Count",
    xaxis_title_text="Token Count",
    # Stray leading space removed from the axis label
    yaxis_title_text="Word Count",
)
fig.show()

In [None]:
# Retrieve the trendline fit results attached to the current `fig`
results = px.get_trendline_results(fig)
print(results)

# Regression summary of the first fit in the results table
results.px_fit_results.iloc[0].summary()

Based on the regression statistics above, 1 token ~= 0.75 words. We will use this ratio for the estimates going forward.

## Expected Cost Calculation

### Cost of OpenAI models

In [None]:
## Costs per 1000 tokens, keyed by model name.
# Each entry holds one rate for prompt ("inputs") tokens and one for generated
# ("outputs") tokens. The cost functions multiply these rates by token counts,
# divide by 1000 and then apply the USD -> SGD conversion.
models_costs: dict[str, dict[str, float]] = {
    "gpt-3.5-turbo-0125": {"inputs": 0.0005, "outputs": 0.0015},
    "gpt-4o": {"inputs": 0.005, "outputs": 0.015},
    # Written as <value> / 1000 to bring the figures to the per-1000-token scale
    "gpt-4o-mini": {"inputs": 0.15 / 1000, "outputs": 0.60 / 1000},
    "gpt-3.5-turbo-0301": {"inputs": 0.002, "outputs": 0.002},
    "gpt-3.5-turbo-instruct": {"inputs": 0.0015, "outputs": 0.002},
    "gpt-3.5-turbo-0613-16k": {"inputs": 0.003, "outputs": 0.004},
    "gpt-3.5-turbo-1106": {"inputs": 0.001, "outputs": 0.002},
}

print(models_costs)

#### Number of Agents, Max Number of Similar Articles, Percentage of articles to optimise/harmonise

In [None]:
## Optimisation
# Agents involved: Meta Desc Quality Eval, Title Quality Eval, Content Quality Eval, Researcher, Meta Desc Optimiser, Title Optimiser, Content Guidelines, Writing Guidelines
# Fraction of all articles that go through optimisation
percentage_to_optimise = 0.4

## Harmonisation
# Agents involved: Researcher, Compiler, Meta Desc Optimiser, Title Optimiser, Content Guidelines, Writing Guidelines
# Number of similar articles harmonised together as one group (max. 5)
similar_articles_count = 5

# Fraction of all articles that are combined/harmonised
percentage_to_harmonise = 0.3

In [None]:
# Generated evaluations
generated_evaluation_tokens = 200  # Approx. 150 words for generated text from evaluating meta desc, title and content quality each respectively

# Title and Meta Desc tokens
title_tokens = 25  # Approx. 70 characters / 16 words
meta_desc_tokens = 75  # Approx 50 words

# Generated Output Multiplier (with respect to article tokens)
keypoints_generated_multiplier = 1.1
compiled_articles_multiplier = (
    0.8  # An estimate of the generated article as compared to compiled articles
)
content_guidelines_multiplier = 1.25
writing_guidelines_multiplier = content_guidelines_multiplier * 1.25

## Prompt Tokens
evaluation_prompt_tokens = 200  # Approx. 150 words for evaluating meta desc, title and content quality respectively
researcher_prompt_tokens = 400  # Approx. 300 words for instructing researcher
compiler_prompt_tokens = 400  # Approx. 300 words for instructing compiler
guidelines_prompt_tokens = 600  # Approx. 450 words for guidelines
meta_desc_prompt_tokens = 200  # Approx. 150 words for prompt instructions
title_prompt_tokens = 200  # Approx. 150 words for prompt instructions


# USD to SGD Conversion
usd_to_sgd = 1.35

# Every assumption above, collected under the key the cost functions look up.
tokens_dict = {
    "percentage_to_optimise": percentage_to_optimise,
    "similar_articles_count": similar_articles_count,
    "percentage_to_harmonise": percentage_to_harmonise,
    # Previously missing: without this entry the optimisation estimate always
    # fell back to its built-in value and ignored the setting defined above.
    "generated_evaluation_tokens": generated_evaluation_tokens,
    "title_tokens": title_tokens,
    "meta_desc_tokens": meta_desc_tokens,
    "keypoints_generated_multiplier": keypoints_generated_multiplier,
    "compiled_articles_multiplier": compiled_articles_multiplier,
    "content_guidelines_multiplier": content_guidelines_multiplier,
    "writing_guidelines_multiplier": writing_guidelines_multiplier,
    "evaluation_prompt_tokens": evaluation_prompt_tokens,
    "researcher_prompt_tokens": researcher_prompt_tokens,
    "compiler_prompt_tokens": compiler_prompt_tokens,
    "guidelines_prompt_tokens": guidelines_prompt_tokens,
    "meta_desc_prompt_tokens": meta_desc_prompt_tokens,
    "title_prompt_tokens": title_prompt_tokens,
    "usd_to_sgd": usd_to_sgd,
}

In [None]:
def calculate_tokens_optimisation(
    article_tokens: int, params: "dict[str, float] | None" = None
) -> tuple[int, int]:
    """Estimate the LLM tokens consumed when optimising a single article.

    The estimate adds up the steps of the optimisation flow: three quality
    evaluations (meta description, title, content), the researcher, the content
    guidelines step, the writing guidelines step, the title optimiser and the
    meta description optimiser.

    Args:
        article_tokens: Number of tokens in the article body.
        params: Token assumptions keyed by name. When omitted, the module-level
            `tokens_dict` is looked up at call time, so re-running the
            configuration cell takes effect without redefining this function.
            Keys missing from `params` fall back to the literals used below.

    Returns:
        A `(total_input_tokens, total_output_tokens)` tuple, each rounded up to
        an int.
    """
    if params is None:
        params = tokens_dict

    # Prompt Tokens
    evaluation_prompt_tokens = params.get("evaluation_prompt_tokens", 200)
    researcher_prompt_tokens = params.get("researcher_prompt_tokens", 400)
    guidelines_prompt_tokens = params.get("guidelines_prompt_tokens", 600)
    meta_desc_prompt_tokens = params.get("meta_desc_prompt_tokens", 200)
    title_prompt_tokens = params.get("title_prompt_tokens", 200)

    # Generated Tokens
    title_tokens = params.get("title_tokens", 25)
    meta_desc_tokens = params.get("meta_desc_tokens", 75)
    generated_evaluation_tokens = params.get("generated_evaluation_tokens", 200)

    # Multipliers (applied to the article's token count)
    keypoints_generated_multiplier = params.get("keypoints_generated_multiplier", 0.5)
    content_guidelines_multiplier = params.get("content_guidelines_multiplier", 1.25)
    writing_guidelines_multiplier = params.get("writing_guidelines_multiplier", 1.5)

    # Meta Desc, Title and Content Quality Evaluation: three prompts, which
    # together read the article, its title and its meta description once
    evaluation_input_tokens = (
        evaluation_prompt_tokens * 3 + article_tokens + title_tokens + meta_desc_tokens
    )
    evaluation_output_tokens = generated_evaluation_tokens * 3

    # Researcher -> Keypoints
    researcher_input_tokens = researcher_prompt_tokens + article_tokens
    researcher_output_tokens = article_tokens * keypoints_generated_multiplier

    # Content Guidelines: reads the researcher's keypoints
    content_guidelines_input_tokens = (
        guidelines_prompt_tokens + researcher_output_tokens
    )
    content_guidelines_output_tokens = article_tokens * content_guidelines_multiplier

    # Writing Guidelines: reads the content guidelines output
    writing_guidelines_input_tokens = (
        guidelines_prompt_tokens + content_guidelines_output_tokens
    )
    writing_guidelines_output_tokens = article_tokens * writing_guidelines_multiplier

    # Title Optimisation: reads the writing guidelines output
    title_optimisation_input_tokens = (
        title_prompt_tokens + writing_guidelines_output_tokens
    )
    title_optimisation_output_tokens = title_tokens

    # Meta Desc Optimisation: reads the writing guidelines output
    meta_desc_input_tokens = meta_desc_prompt_tokens + writing_guidelines_output_tokens
    meta_desc_output_tokens = meta_desc_tokens

    # Totals; the multipliers can yield fractional tokens, hence the ceil
    total_input_tokens = math.ceil(
        evaluation_input_tokens
        + researcher_input_tokens
        + content_guidelines_input_tokens
        + writing_guidelines_input_tokens
        + title_optimisation_input_tokens
        + meta_desc_input_tokens
    )
    total_output_tokens = math.ceil(
        evaluation_output_tokens
        + researcher_output_tokens
        + content_guidelines_output_tokens
        + writing_guidelines_output_tokens
        + title_optimisation_output_tokens
        + meta_desc_output_tokens
    )

    return total_input_tokens, total_output_tokens

In [None]:
def calculate_optimisation_costs(
    article_tokens: int,
    model: str = "gpt-3.5-turbo-0125",
    params: "dict[str, float] | None" = None,
) -> float:
    """Estimate the cost, in SGD, of optimising a single article.

    Args:
        article_tokens: Number of tokens in the article body.
        model: Key into `models_costs` selecting the per-1000-token rates.
        params: Token assumptions keyed by name. When omitted, the module-level
            `tokens_dict` is looked up at call time, so re-running the
            configuration cell takes effect without redefining this function.

    Returns:
        The estimated cost as a float, after the USD -> SGD conversion.

    Raises:
        KeyError: If `model` is not a key of `models_costs`.
    """
    if params is None:
        params = tokens_dict

    usd_to_sgd = params.get("usd_to_sgd", 1.35)
    rates = models_costs[model]

    # The resolved params are passed on explicitly so that both functions work
    # from the same assumptions
    inputs, outputs = calculate_tokens_optimisation(article_tokens, params)

    # Rates are per 1000 tokens, hence the division
    return (
        (inputs * rates["inputs"] + outputs * rates["outputs"]) / 1000 * usd_to_sgd
    )

In [None]:
# Per-article optimisation cost for gpt-3.5-turbo-0125, stored in its own column
model = "gpt-3.5-turbo-0125"
col = f"optimise_costs_{model}"
df_extracted[col] = df_extracted["num_content_tokens"].apply(
    calculate_optimisation_costs, args=(model, tokens_dict)
)

display(df_extracted)  # noqa: F821

In [None]:
# Per-article optimisation cost for gpt-4o-mini, stored in its own column
model = "gpt-4o-mini"
col = f"optimise_costs_{model}"
df_extracted[col] = df_extracted["num_content_tokens"].apply(
    calculate_optimisation_costs, args=(model, tokens_dict)
)

display(df_extracted)  # noqa: F821

In [None]:
# Per-article optimisation cost for gpt-3.5-turbo-0301, stored in its own column
model = "gpt-3.5-turbo-0301"
col = f"optimise_costs_{model}"
df_extracted[col] = df_extracted["num_content_tokens"].apply(
    calculate_optimisation_costs, args=(model, tokens_dict)
)

display(df_extracted)  # noqa: F821

In [None]:
# Write the DataFrame, including the columns added so far, to an Excel file in
# the `data` directory; the index is not written.
df_extracted.to_excel("../data/articles_optimisation_costs.xlsx", index=False)

In [None]:
def calculate_tokens_harmonisation(
    article_tokens_list,
    params: "dict[str, float] | None" = None,
) -> tuple[int, int]:
    """Estimate the LLM tokens consumed when harmonising a group of articles.

    The estimate adds up one researcher call per article, followed by a single
    pass through the compiler, the content guidelines step, the writing
    guidelines step, the title optimiser and the meta description optimiser.

    Args:
        article_tokens_list: Token counts of the articles harmonised together
            (any sized iterable of numbers; an empty one is allowed).
        params: Token assumptions keyed by name. When omitted, the module-level
            `tokens_dict` is looked up at call time, so re-running the
            configuration cell takes effect without redefining this function.
            Keys missing from `params` fall back to the literals used below.

    Returns:
        A `(total_input_tokens, total_output_tokens)` tuple, each rounded up to
        an int.
    """
    if params is None:
        params = tokens_dict

    # Prompt Tokens
    researcher_prompt_tokens = params.get("researcher_prompt_tokens", 400)
    compiler_prompt_tokens = params.get("compiler_prompt_tokens", 400)
    guidelines_prompt_tokens = params.get("guidelines_prompt_tokens", 600)
    meta_desc_prompt_tokens = params.get("meta_desc_prompt_tokens", 200)
    title_prompt_tokens = params.get("title_prompt_tokens", 200)

    # Generated Tokens
    title_tokens = params.get("title_tokens", 25)
    meta_desc_tokens = params.get("meta_desc_tokens", 75)

    # Multipliers
    keypoints_generated_multiplier = params.get("keypoints_generated_multiplier", 0.5)
    compiled_articles_multiplier = params.get("compiled_articles_multiplier", 0.8)
    content_guidelines_multiplier = params.get("content_guidelines_multiplier", 1.25)
    writing_guidelines_multiplier = params.get("writing_guidelines_multiplier", 1.5)

    combined_article_tokens = sum(article_tokens_list)

    # Researcher -> Keypoints: one call (prompt + article) per article
    researcher_input_tokens = sum(
        researcher_prompt_tokens + tokens for tokens in article_tokens_list
    )
    researcher_output_tokens = sum(
        tokens * keypoints_generated_multiplier for tokens in article_tokens_list
    )

    # Compiler: reads all keypoints and emits the same number of tokens
    compiler_input_tokens = compiler_prompt_tokens + researcher_output_tokens
    compiler_output_tokens = researcher_output_tokens

    # Content Guidelines: output is sized from the combined article length
    content_guidelines_input_tokens = guidelines_prompt_tokens + compiler_output_tokens
    content_guidelines_output_tokens = (
        combined_article_tokens
        * compiled_articles_multiplier
        * content_guidelines_multiplier
    )

    # Writing Guidelines: reads the content guidelines output
    writing_guidelines_input_tokens = (
        guidelines_prompt_tokens + content_guidelines_output_tokens
    )
    writing_guidelines_output_tokens = (
        combined_article_tokens
        * compiled_articles_multiplier
        * writing_guidelines_multiplier
    )

    # Title Optimisation: reads the writing guidelines output
    title_optimisation_input_tokens = (
        title_prompt_tokens + writing_guidelines_output_tokens
    )
    title_optimisation_output_tokens = title_tokens

    # Meta Desc Optimisation: reads the writing guidelines output
    meta_desc_input_tokens = meta_desc_prompt_tokens + writing_guidelines_output_tokens
    meta_desc_output_tokens = meta_desc_tokens

    # Totals; the multipliers can yield fractional tokens, hence the ceil
    total_input_tokens = math.ceil(
        researcher_input_tokens
        + compiler_input_tokens
        + content_guidelines_input_tokens
        + writing_guidelines_input_tokens
        + title_optimisation_input_tokens
        + meta_desc_input_tokens
    )
    total_output_tokens = math.ceil(
        researcher_output_tokens
        + compiler_output_tokens
        + content_guidelines_output_tokens
        + writing_guidelines_output_tokens
        + title_optimisation_output_tokens
        + meta_desc_output_tokens
    )
    return total_input_tokens, total_output_tokens

In [None]:
def calculate_costs(tokens_array, model, params: "dict[str, float] | None" = None):
    """Simulate one random optimise/harmonise split and return its cost in SGD.

    The articles are shuffled; the first share is costed as individually
    optimised articles and the next share is cut into groups of
    `similar_articles_count` articles that are costed as harmonised together.
    Both partial costs are printed.

    Args:
        tokens_array: Per-article token counts (1-D array-like). It is not
            modified; the shuffle works on a copy.
        model: Key into `models_costs` selecting the per-1000-token rates.
        params: Assumptions keyed by name. When omitted, the module-level
            `tokens_dict` is looked up at call time. The split
            (`percentage_to_optimise`, `percentage_to_harmonise`,
            `similar_articles_count`) is read from here as well, and the same
            dict is forwarded to the token calculators.

    Returns:
        Optimisation cost plus harmonisation cost, in SGD.

    Raises:
        KeyError: If `model` is not a key of `models_costs`.
        ValueError: If `similar_articles_count` is smaller than 1.
    """
    if params is None:
        params = tokens_dict

    usd_to_sgd = params.get("usd_to_sgd", 1.35)
    optimise_fraction = params.get("percentage_to_optimise", 0.4)
    harmonise_fraction = params.get("percentage_to_harmonise", 0.3)
    group_size = int(params.get("similar_articles_count", 5))
    if group_size < 1:
        raise ValueError("similar_articles_count must be at least 1")

    rates = models_costs[model]

    def to_sgd(input_tokens, output_tokens):
        # Rates are per 1000 tokens; the result is converted from USD to SGD
        return (
            (input_tokens * rates["inputs"] + output_tokens * rates["outputs"])
            / 1000
            * usd_to_sgd
        )

    # `permutation` returns a shuffled copy, leaving the caller's array untouched
    shuffled = np.random.default_rng().permutation(np.asarray(tokens_array))
    article_count = shuffled.size

    optimisation_size = min(round(optimise_fraction * article_count), article_count)

    # Round the harmonisation share UP to whole groups (ceil division), without
    # adding a spare group when it already divides evenly, and never take more
    # complete groups than the articles left after the optimisation share.
    harmonisation_target = round(harmonise_fraction * article_count)
    group_count = min(
        -(-harmonisation_target // group_size),
        (article_count - optimisation_size) // group_size,
    )
    harmonisation_end = optimisation_size + group_count * group_size

    optimisation_tokens = shuffled[:optimisation_size]
    harmonisation_groups = shuffled[optimisation_size:harmonisation_end].reshape(
        group_count, group_size
    )

    optimisation_cost = 0
    for article_tokens in optimisation_tokens:
        inputs, outputs = calculate_tokens_optimisation(article_tokens, params)
        optimisation_cost += to_sgd(inputs, outputs)

    print(f"Optimisation cost: {optimisation_cost}")

    harmonisation_cost = 0
    for group_tokens in harmonisation_groups:
        inputs, outputs = calculate_tokens_harmonisation(group_tokens, params)
        harmonisation_cost += to_sgd(inputs, outputs)

    print(f"Harmonisation cost: {harmonisation_cost}")

    return optimisation_cost + harmonisation_cost

In [None]:
def calculate_average_costs(tokens_array, model, runs=30):
    """Average `calculate_costs` over `runs` calls and print summary figures.

    Prints the number of articles, the average cost, and the average cost per
    article, where the per-article figure divides by the share of articles that
    is optimised or harmonised. Returns the average cost.
    """
    costs = [calculate_costs(tokens_array, model) for _ in range(runs)]
    avg_cost = sum(costs) / runs

    article_count = tokens_array.shape[0]
    print(article_count)
    print(f"Average cost: {avg_cost}")

    # Share of all articles that is either optimised or harmonised
    processed_share = percentage_to_optimise + percentage_to_harmonise
    avg_cost_per_article = avg_cost / (processed_share * article_count)
    print(f"Average cost Per article: {avg_cost_per_article}")

    return avg_cost

### All articles

In [None]:
# Token counts of all articles as a standalone NumPy array; np.copy means later
# changes to the array cannot affect the DataFrame column.
tokens_array = np.copy(df_extracted["num_content_tokens"].to_numpy())
print(tokens_array)

In [None]:
# One randomised cost estimate over all articles, using the gpt-4o-mini rates
calculate_costs(tokens_array, model="gpt-4o-mini")

In [None]:
# Cost estimate over all articles, averaged over the default number of runs
calculate_average_costs(
    tokens_array, model="gpt-3.5-turbo-0301"
)  # other keys of models_costs include: gpt-3.5-turbo-0125, gpt-4o-mini, gpt-4o

### HPB Articles

In [None]:
# Token counts of the `df_hpb` articles as a standalone NumPy array
hpb_tokens_array = np.copy(df_hpb["num_content_tokens"].to_numpy())
# Print the array that was just built (this previously printed `tokens_array`)
print(hpb_tokens_array)

In [None]:
# One randomised cost estimate over the `df_hpb` articles, using the
# gpt-3.5-turbo-0301 rates
calculate_costs(hpb_tokens_array, model="gpt-3.5-turbo-0301")

In [None]:
# Cost estimate over the `df_hpb` articles, averaged over the default number of
# runs
calculate_average_costs(
    hpb_tokens_array, model="gpt-3.5-turbo-0301"
)  # other keys of models_costs include: gpt-3.5-turbo-0125, gpt-4o-mini, gpt-4o