# Imports


In [1]:
from scraper import Scraper
from schema import Schema
import utils
from dotenv import dotenv_values

# Load settings from the .env file one directory above the notebook.
# Later cells read the "URL" and "OPENAI_AI_KEY" entries from this mapping.
config = dotenv_values("../.env")

In [2]:
from pydantic import BaseModel, Field
from typing import List

import openai
import json
import time

In [3]:
# Point the scraper at the page configured under "URL", then fetch its table rows.
source_url = config["URL"]
scraper = Scraper(source_url)
table_rows = scraper.get_table()

# Pydantic schemas


In [4]:
# One box-office entry: the top-grossing film for a single weekend.
# All four fields are required strings (`...` means "no default") and each is
# declared with an alias that differs from its attribute name.
# NOTE(review): the class docstring and the Field descriptions presumably end
# up in the function schema sent to the model (see `BoxOffice.custom_schema`
# usage in `html_to_json_ai`) -- confirm before rewording any of them.
class MovieItem(Schema):
    """
    class representing the entry of the highest grossing movie at the box office in any week of 2023 along the grossing amount and some remarks
    """

    # Kept as a plain string (e.g. "January 8, 2023" in the sample output below),
    # not parsed into a date.
    weekend_end_date: str = Field(
        ...,
        alias="Weekend end date",
        description="Stores the last date of the week in the year 2023",
    )
    film: str = Field(..., alias="Film", description="Name of the movie")
    # Kept as a formatted string (e.g. "$45,838,986"), not a number.
    gross: str = Field(
        ..., alias="Gross", description="Gross income of the movie in the given weekend"
    )
    notes: str = Field(..., alias="Notes", description="Some notes about the movie")


# Top-level container the model is asked to return: a required list of
# MovieItem entries. `html_to_json_ai` passes `BoxOffice.custom_schema` as the
# function definition and parses the reply with `BoxOffice.from_response`.
# NOTE(review): `custom_schema` / `from_response` are presumably provided by
# the project's `Schema` base class -- not visible in this notebook.
class BoxOffice(Schema):
    """Class representing the list of the highest grossing movies per weekend of 2023"""

    items: List[MovieItem] = Field(
        ..., description="List of the highest grossing movies each weekend of 2023"
    )

# Query OpenAI API


In [14]:
def html_to_json_ai(
    data: str,
    model: str = "gpt-3.5-turbo",
    input_price_per_1k: float = 0.0010,
    output_price_per_1k: float = 0.0020,
) -> str:
    """Convert an HTML fragment into a BoxOffice JSON string via OpenAI function calling.

    Sends ``data`` to the chat completions endpoint, forcing the model to call
    the function described by ``BoxOffice.custom_schema``, prints the token
    usage and an estimated charge, and returns the parsed result serialised
    as JSON.

    Args:
        data: HTML string to convert (embedded verbatim in the prompt).
        model: Chat model name passed to the API.
        input_price_per_1k: USD charged per 1000 prompt tokens, used only for
            the printed estimate. The default is the value this notebook has
            always used -- presumably the list price for the default model at
            the time of writing; verify before relying on it.
        output_price_per_1k: USD charged per 1000 completion tokens; same
            caveat as ``input_price_per_1k``.

    Returns:
        The JSON string produced by ``model_dump_json()`` on the parsed
        ``BoxOffice`` instance (not the ``BoxOffice`` object itself).
    """
    client = openai.OpenAI(
        api_key=config["OPENAI_AI_KEY"],
    )
    # NOTE(review): `functions` / `function_call` are the legacy function-calling
    # parameters; they are kept because `BoxOffice.from_response` presumably
    # parses that response shape -- confirm before migrating to `tools`.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a world class algorithm to convert html to structured data",
            },
            {
                "role": "user",
                "content": "Convert the following HTML string to structured data as per given format",
            },
            {"role": "user", "content": f"HTML input: ```{data}```"},
            {
                "role": "user",
                "content": "Tips: Make sure to pay attention to the attributes of the HTML tags, especialy the rowspan attributes",
            },
        ],
        temperature=0.2,
        functions=[BoxOffice.custom_schema],
        function_call={"name": BoxOffice.custom_schema["name"]},
    )

    # `usage` is optional on the response object; skip the cost report rather
    # than fail after the (billable) request has already succeeded.
    usage = completion.usage
    if usage is not None:
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens

        input_charge = (input_tokens / 1000) * input_price_per_1k
        output_charge = (output_tokens / 1000) * output_price_per_1k

        print(
            f"Token counts:\nInput tokens = {input_tokens}\nOutput tokens = {output_tokens}\nTotal tokens = {input_tokens + output_tokens}\n"
        )
        print(
            f"Charge:\nInput charge = ${input_charge}\nOutput charge = ${output_charge}\nTotal charge ${input_charge + output_charge}"
        )
    return BoxOffice.from_response(completion).model_dump_json()

In [6]:
# Split the scraped rows into chunks using chunk_size=4 for a small trial run.
# NOTE(review): the full-run cell at the end rebinds `html_chunks` with the
# helper's default chunk size, so this value does not survive a Run All.
html_chunks = utils.html_chunker(table_rows, chunk_size=4)

In [9]:
# Turn only the first chunk into an HTML string, as input for a single trial request.
html_input = utils.html_to_str(html_chunks[0])

In [11]:
# Trial request on the first chunk; `output` is a JSON string and the call
# prints token counts and the estimated charge as a side effect.
output = html_to_json_ai(html_input)

Token counts:
Input tokens = 637
Output tokens = 426
Total tokens = 1063
Charge:
Input charge = $0.000637
Output charge = $0.000852
Total charge $0.0014889999999999999


In [12]:
# Re-parse the JSON string so it can be re-serialised with indentation for reading.
parsed_output = json.loads(output)
pretty_output = json.dumps(parsed_output, indent=4)
print(pretty_output)

{
    "items": [
        {
            "weekend_end_date": "January 8, 2023",
            "film": "Avatar: The Way of Water",
            "gross": "$45,838,986",
            "notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for four consecutive weekends each since The Hunger Games: Mockingjay \u2013 Part 2 and Star Wars: The Force Awakens in 2015 and 2016."
        },
        {
            "weekend_end_date": "January 15, 2023",
            "film": "Avatar: The Way of Water",
            "gross": "$32,824,684",
            "notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for five consecutive weekends each since Stakeout and Fatal Attraction in 1987."
        },
        {
            "weekend_end_date": "January 22, 2023",
            "film": "Avatar: The Way of Water",
            "gross": "$20,133,106",
            "note

# Full run: convert every chunk and save the output


In [None]:
# Pause between consecutive API requests, in seconds.
# NOTE(review): presumably here to stay under the API rate limit -- confirm.
REQUEST_DELAY_SECONDS = 30

# Re-chunk the whole table with the helper's default chunk size, convert each
# chunk with the model, and save one JSON file per chunk next to the .env file.
html_chunks = utils.html_chunker(table_rows)
for i, chunk in enumerate(html_chunks):
    # Wait only *between* requests: nothing before the first chunk and no
    # pointless pause after the last one.
    if i > 0:
        time.sleep(REQUEST_DELAY_SECONDS)
    html_input = utils.html_to_str(chunk)
    print(f"\nChunk {i}")
    output = html_to_json_ai(html_input)
    utils.save_json(output, filepath=f"../chunk-{i}.json")