In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import tiktoken

In [None]:
# Price per 1000 tokens for each model, keyed by "inputs" and "outputs".
# NOTE(review): currency is not stated here — presumably USD; confirm.
models_costs = {
    model_name: {"inputs": input_cost, "outputs": output_cost}
    for model_name, input_cost, output_cost in (
        ("gpt-3.5-turbo-0125", 0.0005, 0.0015),
        ("gpt-4o-mini", 0.15 / 1000, 0.60 / 1000),
    )
}

In [None]:
# Load the merged dataset from its parquet file and render it for inspection.
merged_data_path = "../data/merged_data.parquet"
merged_data = pd.read_parquet(merged_data_path)

display(merged_data)

In [None]:
# Keep only the rows whose "to_remove" flag is not set.
# assumes "to_remove" is a boolean column (required for `~` to act as a mask) — TODO confirm
flagged_for_removal = merged_data["to_remove"]
df_keep = merged_data.loc[~flagged_for_removal]

display(df_keep)

In [None]:
# Content categories retained for the analysis; rows in any other category are dropped.
relevant_categories = [
    "cost-and-financing",
    "live-healthy-articles",
    "diseases-and-conditions",
    "medical-care-and-facilities",
    "support-group-and-others",
]

# Boolean mask: True where the row's "content_category" is one of the categories above.
in_relevant_category = df_keep["content_category"].isin(relevant_categories)
df_keep = df_keep.loc[in_relevant_category]

display(df_keep)

In [None]:
# Columns carried forward from df_keep into the working frame.
extracted_columns = [
    "id",
    "content_name",
    "title",
    "article_category_names",
    "full_url",
    "friendly_url",
    "category_description",
    "content_category",
    "content_body",
    "pr_name",
    "has_table",
    "has_image",
    "related_sections",
    "extracted_tables",
    "extracted_links",
    "extracted_headers",
    "extracted_img_alt_text",
    "extracted_content_body",
]

# Take an explicit copy: new columns are assigned onto df_extracted afterwards,
# and writing into a column subset of df_keep would otherwise emit
# SettingWithCopyWarning (pandas cannot tell whether the write should reach df_keep).
df_extracted = df_keep[extracted_columns].copy()

display(df_extracted)

In [None]:
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens produced when `string` is encoded.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up
            (defaults to "cl100k_base").

    Returns:
        Length of the token sequence returned by the encoding's `encode`.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))


# Token count of each row's "extracted_content_body", stored as a new "num_tokens" column.
token_counts = df_extracted["extracted_content_body"].apply(num_tokens_from_string)
df_extracted.loc[:, "num_tokens"] = token_counts

# Show the frame with the largest token counts first.
display(df_extracted.sort_values(by="num_tokens", ascending=False))

In [None]:
# Histogram (100 bins) of the raw per-article token counts.
fig = px.histogram(df_extracted, x="num_tokens", nbins=100).update_layout(
    title_text="Token Count distribution for Extracted Article Content",
    xaxis_title_text="Token Count",
    yaxis_title_text="Count",
    bargap=0.1,
)
# Disabled threshold markers: lower_threshold / upper_threshold are not defined
# in the cells above, so these stay commented out.
# fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
# fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()

In [None]:
# Natural log of the token counts, stored as a new "log_num_tokens" column.
# Note: np.log maps a count of 0 to -inf, so rows with zero tokens get -inf here.
log_token_counts = np.log(df_extracted["num_tokens"])
df_extracted.loc[:, "log_num_tokens"] = log_token_counts

In [None]:
# Histogram (100 bins) of the log-transformed token counts. The title and x-axis
# label say "Log" so this figure is not mistaken for the raw-count histogram.
fig = px.histogram(df_extracted, x="log_num_tokens", nbins=100)
fig.update_layout(
    title_text="Log Token Count distribution for Extracted Article Content",
    xaxis_title_text="Log Token Count",
    yaxis_title_text="Count",
    bargap=0.1,
)
# Disabled threshold markers: lower_threshold / upper_threshold are not defined
# in the cells above. If re-enabled here they must be given on the log scale,
# because the x-axis is log_num_tokens.
# fig.add_vline(x=lower_threshold, line_dash="dash", line_color="firebrick")
# fig.add_vline(x=upper_threshold, line_dash="dash", line_color="firebrick")
fig.show()