# GPT cost estimation

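This notebook estimates the yearly cost of classifying German online news articles from Media Cloud with GPT, per topic. For background, see https://github.com/orgs/SocialChangeLab/projects/2/views/1?pane=issue&itemId=64335549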

In [1]:
from media_impact_monitor.data_loaders.news_online.mediacloud_ import (
    get_mediacloud_counts,
    get_mediacloud_fulltexts,
)
from media_impact_monitor.trends.keyword_trend import topic_queries
import pandas as pd
from datetime import date

# Suppress all warnings for the rest of the session (every category, every
# module), presumably to keep the cell output readable.
# NOTE(review): this also hides deprecation and runtime warnings raised by the
# data loaders and pandas; consider narrowing the filter by category or module.
import warnings

warnings.filterwarnings("ignore")

In [2]:
from tokencost import calculate_prompt_cost, calculate_completion_cost

# Model identifier that calculate_cost passes to the tokencost pricing helpers.
model = "gpt-3.5-turbo-1106"


def calculate_cost(text):
    """Estimate the price of one classification request for ``text``.

    The request is modelled as a single user message containing ``text`` plus
    a fixed, representative JSON reply (a short reasoning string and a
    sentiment label). Both parts are priced by ``tokencost`` for the
    module-level ``model``.

    Returns:
        The prompt cost plus the completion cost, in whatever unit
        ``tokencost`` reports (presumably USD -- confirm against its docs).
    """
    # Stand-in for a typical model answer; it is only used to price the
    # completion side of the request.
    sample_reply = '{"reasoning": "This is a very cheerful text.", "sentiment": "+1"}'
    messages = [{"role": "user", "content": text}]
    return calculate_prompt_cost(messages, model) + calculate_completion_cost(
        sample_reply, model
    )

ERROR:root:Failed to update token costs. Using static costs.


In [3]:
from faker import Faker
from tqdm.auto import tqdm


def average_cost(query, n_days=10):
    """Estimate the mean per-article GPT cost for articles matching ``query``.

    Draws ``n_days`` random days between 2022-01-01 and now, fetches the full
    texts that ``get_mediacloud_fulltexts`` returns for Germany on each of
    those days, prices every text with ``calculate_cost`` and averages the
    result.

    Args:
        query: Search query forwarded to ``get_mediacloud_fulltexts``.
        n_days: Number of random days to sample. Defaults to 10.

    Returns:
        The mean of ``calculate_cost`` over the ``"text"`` column of all
        sampled articles.

    Raises:
        ValueError: If none of the sampled days returned any full texts.
    """
    fake = Faker()
    # Fixed seed so the same sequence of random draws is used on every run.
    # NOTE(review): the upper bound is "now", so the sampled dates can still
    # shift from one day to the next despite the seed.
    fake.seed_instance(0)
    sample_days = [
        fake.date_between(start_date=date(2022, 1, 1), end_date="now")
        for _ in range(n_days)
    ]

    daily_frames = []
    for day in tqdm(sample_days):
        # Query a single day at a time (start and end date are identical).
        day_texts = get_mediacloud_fulltexts(
            query=query,
            countries=["Germany"],
            start_date=day,
            end_date=day,
        )
        # The loader may return None for a day; skip those days.
        if day_texts is not None:
            daily_frames.append(day_texts)

    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with a message that names the query instead.
    if not daily_frames:
        raise ValueError(
            f"No full texts found for query {query!r} "
            f"on any of the {n_days} sampled days."
        )

    texts = pd.concat(daily_frames)
    return texts["text"].apply(calculate_cost).mean()

In [4]:
def cost_for_topic(query):
    """Estimate yearly article volume and yearly GPT cost for one topic query.

    Fetches the count series from ``get_mediacloud_counts`` for
    ``countries=["Germany"]``, sums it per calendar year, and averages those
    yearly totals. The yearly cost is that average multiplied by the mean
    per-article cost from ``average_cost``.

    Returns:
        A ``(counts_per_year, cost_per_year)`` tuple.

    NOTE(review): the first and last calendar years of the series are
    probably only partially covered, which would pull the yearly average
    down -- confirm the date range returned by the loader.
    """
    article_counts = get_mediacloud_counts(query=query, countries=["Germany"])
    # resample() needs a datetime-like index.
    article_counts.index = pd.to_datetime(article_counts.index)
    mean_article_cost = average_cost(query)
    yearly_totals = article_counts.resample("YE").sum()
    counts_per_year = yearly_totals.mean()
    return counts_per_year, mean_article_cost * counts_per_year

In [5]:
# Mapping of topic name -> search query string for the "news_online" media source.
queries = topic_queries(media_source="news_online")
queries

{'activism': '(klimawandel OR klimaerwärmung OR erderwärmung OR klimaschutz OR klimagerechtigkeit OR klimapolitik OR klimaneutral* OR klimaneutral* OR klimaziel* OR klimaschutzpaket OR klimaschutzgesetz OR klimaschutzmaßnahmen OR klimaschutzabkommen OR klimaschutzprogramm OR kohleausstieg OR "erneuerbare energie*" OR bürgerrat OR gesellschaftsrat OR tempolimit OR "tempo 100" OR 9-euro-ticket OR neun-euro-ticket OR vergesellschaftung OR schuldenschnitt OR klimakrise OR klimakatastrophe OR klimakollaps OR klimanotstand OR klimagerechtigkeit) AND (\\*protest* OR \\*demo OR \\*demonstr* OR \\*kundgebung OR versamm* OR \\*besetz* OR \\*streik* OR \\*blockade OR \\*blockier* OR sitzblock* OR \\*aktivis* OR \\*marsch OR \\*parade OR mahnwache OR hungerstreik OR "ziviler ungehorsam")',
 'science': 'klimawandel OR klimaerwärmung OR erderwärmung OR klimaschutz OR klimagerechtigkeit OR klimapolitik OR klimaneutral*',
 'policy': 'klimaneutral* OR klimaziel* OR klimaschutzpaket OR klimaschutzgesetz

In [None]:
# Per-topic yearly article counts and yearly cost estimates.
# Only the first two topics in `queries` are evaluated.
counts, costs = {}, {}
for topic, query in list(queries.items())[:2]:
    counts[topic], costs[topic] = cost_for_topic(query)

In [8]:
# Average number of matching articles per calendar year, by topic.
counts

{'activism': 2017.6666666666667, 'science': 21741.0}

In [9]:
# Estimated GPT cost per year, by topic (unit as reported by tokencost; presumably USD -- confirm).
costs

{'activism': 4.737481333333333, 'science': 27.284836842391304}