In [1]:
from scraper import Scraper
from dotenv import dotenv_values

config = dotenv_values("../.env")

In [2]:
scraper = Scraper(config["URL"])
table_rows = scraper.get_table()

In [3]:
from pydantic import BaseModel, ValidationError, Field, create_model
from typing import (
    List,
    Optional,
    Callable,
    Type,
)
import openai
from openai.types.chat import (
    ChatCompletionMessage,
    ChatCompletionMessageParam,
)
import json
from functools import wraps
from docstring_parser import parse

## trial run


In [4]:
def create(
    client: openai.OpenAI,
    messages: List[dict],
    model_class: BaseModel,
    retry=2,
    temperature=0,
    **kwargs,
) -> BaseModel:
    messages.append(
        {
            "role": "system",
            "content": f"Please respond ONLY with valid json that conforms to this pydantic json_schema: {model_class.model_json_schema()}. Do not include additional text other than the object json as we will load this object with json.loads() and pydantic.",
        }
    )

    last_exception = None
    for i in range(retry + 1):
        response = client.chat.completions.create(
            messages=messages, temperature=temperature, **kwargs
        )
        assistant_message = response["choices"][0]["message"]
        content = assistant_message["content"]
        try:
            json_content = json.loads(content)
        except Exception as e:
            last_exception = e
            error_msg = f"json.loads exception: {e}"
            messages.append(assistant_message)
            messages.append({"role": "system", "content": error_msg})
            continue
        try:
            return model_class(**json_content)
        except ValidationError as e:
            last_exception = e
            error_msg = f"pydantic exception: {e}"
            messages.append(assistant_message)
            messages.append({"role": "system", "content": error_msg})
    raise last_exception

In [11]:
import together

In [12]:
together.api_key = config["MISTRAL_API_KEY"]

In [13]:
# see available models
model_list = together.Models.list()

print(f"{len(model_list)} models available")

# print the first 10 models on the menu
model_names = [model_dict["name"] for model_dict in model_list]
model_names[:10]

118 models available


['Austism/chronos-hermes-13b',
 'DiscoResearch/DiscoLM-mixtral-8x7b-v2',
 'EleutherAI/llemma_7b',
 'Gryphe/MythoMax-L2-13b',
 'Meta-Llama/Llama-Guard-7b',
 'Nexusflow/NexusRaven-V2-13B',
 'NousResearch/Nous-Capybara-7B-V1p9',
 'NousResearch/Nous-Hermes-Llama2-13b',
 'NousResearch/Nous-Hermes-Llama2-70b',
 'NousResearch/Nous-Hermes-llama-2-7b']

In [14]:
output = together.Complete.create(
    prompt="<human>: What are Isaac Asimov's Three Laws of Robotics?\n<bot>:",
    model="mistralai/Mistral-7B-Instruct-v0.1",
    max_tokens=256,
    temperature=0.8,
    top_k=60,
    top_p=0.6,
    repetition_penalty=1.1,
    stop=["<human>", "\n\n"],
)

# print generated text
print(output["output"]["choices"][0]["text"])

Isaac Asimov's Three Laws of Robotics are:
1. A robot may not injure a human being or, through inaction, allow a human being to come to harm.
2. A robot must obey orders given it by human beings, except where such orders would conflict with the First Law.
3. A robot must protect its own existence, as long as such protection does not conflict with the First or Second Laws.


In [60]:
client = openai.OpenAI(
    api_key=config["MISTRAL_API_KEY"],
    base_url="https://api.together.xyz",
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Describe the sport of football",
        }
    ],
    model="togethercomputer/falcon-40b-instruct",
    max_tokens=512,
)

print(chat_completion.choices[0].message.content)

Football is a popular sport played between two teams of 11 players each. The objective of the game is to score more points than the opposing team by kicking a ball into the opponent's goal. The game is played on a rectangular field with goalposts at each end. The ball is moved around the field by passing it between players or by dribbling it with the feet. The game is governed by a set of rules known as the Laws of the Game, which are enforced by referees.


In [26]:
chat_completion.choices[0].message.content.__repr__().split("\\n\\n")

['"Football, also known as soccer in some countries, is a popular sport played all around the world. It is a team sport that involves two teams of eleven players each, who aim to score goals by kicking a ball into the opposing team\'s net. The team with the most goals at the end of the game wins.',
 "The game is played on a rectangular field, called a pitch or a stadium, with a goal at either end. The teams take turns kicking the ball, with the aim of making it land in the opposing team's goal. Players can use any part of their body except for their arms and hands to move the ball, and only the goalkeeper is allowed to use their hands within their designated penalty area.",
 'Football is a fast-paced game that requires a lot of physical fitness, as well as skill and teamwork. Players must be able to run, sprint, tackle, and pass the ball with precision, while also defending their own goal and trying to score goals for their team.',
 'The game is officiated by a referee and two assistan

In [56]:
message = chat_completion.choices[0].message

In [58]:
message.function_call

## Schema creation


In [4]:
class Schema(BaseModel):
    @classmethod
    @property
    def custom_schema(cls):
        schema = cls.model_json_schema()
        docstring = parse(cls.__doc__ or "")
        parameters = {
            k: v for k, v in schema.items() if k not in ("title", "description")
        }
        for param in docstring.params:
            if (name := param.arg_name) in parameters["properties"] and (
                description := param.description
            ):
                if "description" not in parameters["properties"][name]:
                    parameters["properties"][name]["description"] = description

        parameters["required"] = sorted(
            k for k, v in parameters["properties"].items() if "default" not in v
        )

        if "description" not in schema:
            if docstring.short_description:
                schema["description"] = docstring.short_description
            else:
                schema["description"] = (
                    f"Correctly extracted `{cls.__name__}` with all "
                    f"the required parameters with correct types"
                )

        return {
            "name": schema["title"],
            "description": schema["description"],
            "parameters": parameters,
        }

    @classmethod
    def from_response(cls, chat, validation_context=None, strict: bool = None):
        message = chat["choices"][0]["message"]
        assert (
            message["function_call"]["name"] == cls.custom_schema["name"]
        ), "Function name does not match"
        return cls.model_validate_json(
            message["function_call"]["arguments"],
            context=validation_context,
            strict=strict,
        )


def convert_to_schema(cls) -> Schema:
    if not issubclass(cls, BaseModel):
        raise TypeError("Class must be subclass of pydantic.BaseModel")

    return wraps(cls, updated=())(create_model(cls.__name__, __base__=(cls, Schema)))

In [5]:
def handle_response_model(*, response_model: Type[BaseModel], kwargs):
    new_kwargs = kwargs.copy()
    if response_model is not None:
        if not issubclass(response_model, Schema):
            response_model = convert_to_schema(response_model)

    new_kwargs["functions"] = [response_model.custom_schema]
    new_kwargs["function_call"] = {"name": response_model.custom_schema["name"]}
    return response_model, new_kwargs


def process_response(
    response,
    *,
    response_model: Type[BaseModel],
    stream: bool,
    validation_context: dict = None,
    strict: bool = None,
):
    model = response_model.from_response(
        response.model_dump(), validation_context, strict=strict
    )
    return model


def dump_message(message: ChatCompletionMessage) -> ChatCompletionMessageParam:
    return_value: ChatCompletionMessageParam = {
        "role": message.role,
        "content": message.content or "",
    }
    if message.tool_calls is not None:
        return_value["content"] += json.dumps(message.model_dump()["tool_calls"])
    if message.function_call is not None:
        return_value["content"] += json.dumps(message.model_dump()["function_call"])
    return return_value


def retry(
    func,
    response_model,
    validation_context,
    args,
    kwargs,
    max_retries,
    strict: Optional[bool] = None,
):
    retries = 0
    while retries <= max_retries:
        try:
            print(args)
            print(kwargs)
            response = func(*args, **kwargs)
            print(response.model_dump())
            stream = kwargs.get("stream", False)
            return process_response(
                response,
                response_model=response_model,
                stream=stream,
                validation_context=validation_context,
                strict=strict,
            )
        except (ValidationError, json.JSONDecodeError) as e:
            kwargs["messages"].append(dump_message(response.choices[0].message))
            kwargs["messages"].append(
                {
                    "role": "user",
                    "content": f"Recall the function correctly, exceptions found\n{e}",
                }
            )
            retries += 1
            if retries > max_retries:
                print(f"Max retries reach, exception: {e}")
                raise e


def modified_chat_completion(func: Callable) -> Callable:
    @wraps(func)
    def new_chat_completion(
        response_model=None, validation_context=None, max_retries=1, *args, **kwargs
    ):
        response_model, new_kwargs = handle_response_model(
            response_model=response_model, kwargs=kwargs
        )
        print(new_kwargs)
        response = retry(
            func=func,
            response_model=response_model,
            validation_context=validation_context,
            max_retries=max_retries,
            args=args,
            kwargs=new_kwargs,
        )
        return response

    wrapper_function = new_chat_completion
    return wrapper_function


def modify_chat_completion(client: openai.OpenAI):
    client.chat.completions.create = modified_chat_completion(
        client.chat.completions.create
    )

## Moment of truth


In [6]:
client = openai.OpenAI(
    api_key=config["ANYSCALE_API_KEY"],
    base_url="https://api.endpoints.anyscale.com/v1",
)

In [8]:
modified_client = modify_chat_completion(client)

In [7]:
class Extraction(BaseModel):
    summary: str
    hypothetical_questions: List[str] = Field(
        default_factory=list,
        description="Hypothetical questions that this document could answer",
    )
    keywords: List[str] = Field(
        default_factory=list, description="Keywords that this document is about"
    )

In [9]:
Extraction.model_json_schema()

{'properties': {'summary': {'title': 'Summary', 'type': 'string'},
  'hypothetical_questions': {'description': 'Hypothetical questions that this document could answer',
   'items': {'type': 'string'},
   'title': 'Hypothetical Questions',
   'type': 'array'},
  'keywords': {'description': 'Keywords that this document is about',
   'items': {'type': 'string'},
   'title': 'Keywords',
   'type': 'array'}},
 'required': ['summary'],
 'title': 'Extraction',
 'type': 'object'}

In [7]:
# Define the schema for the output
class Result(BaseModel):
    winner_team_name: str
    loser_team_name: str
    winner_score: int
    loser_score: int


chat_completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    response_format={"type": "json_object", "schema": Result.model_json_schema()},
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant designed to output JSON.",
        },
        {"role": "user", "content": "Who won the world series in 2020?"},
    ],
    temperature=0.7,
)

In [9]:
print(chat_completion.choices[0].message.content)

 {
"winner_team_name": "Los Angeles Dodgers",
"winner_score": 3,
"loser_team_name": "Tampa Bay Rays",
"loser_score": 2
}


In [12]:
from datetime import date


class MovieItem(Schema):
    """
    class representing the entry of the highest grossing movie at the box office in any week of 2023 along the grossing amount and some remarks
    """

    weekend_end_date: str = Field(
        ...,
        alias="Weekend end date",
        description="Stores the last date of the week in the year 2023",
    )
    film: str = Field(..., alias="Film", description="Name of the movie")
    gross: str = Field(
        ..., alias="Gross", description="Gross income of the movie in the given weekend"
    )
    notes: str = Field(..., alias="Notes", description="Some notes about the movie")


class BoxOffice(Schema):
    """Class representing the list of the highest grossing movies per weekend of 2023"""

    items: List[MovieItem] = Field(
        ..., description="List of the highest grossing movies each weekend of 2023"
    )

In [19]:
def html_to_json_ai(
    data: str, model: str, api_key: str, url: str | None = None
) -> dict:
    """
    Function to convert the given raw HTML string into structured JSON object following a predefined schema
    by calling llm using api keys

    Parameters:
    - data (str):       the raw string containing the html input
    - model (str):      the name of the model endpoint to be called
    - api_key (str):    the api key used for calling the server endpoint
    - url (str | None): the base url used to make the query
                        default: None

    Output:
    json dumped from the BoxOffice object after parsing the model response in the needed format
    """
    client = openai.OpenAI(api_key=api_key, base_url=url)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a world class algorithm to convert html to structured data",
            },
            {
                "role": "user",
                "content": f"Convert rows in given HTML to the specified JSON",
            },
            {"role": "user", "content": f"HTML input: ```{data}```"},
            # {
            #     "role": "user",
            #     "content": "Tips: Make sure to pay attention to the attributes of the HTML tags, especialy the rowspan attributes",
            # },
            # {"role": "user", "content": "Tips: pay attention to the dates"},
        ],
        temperature=0.2,
        response_format={
            "type": "json_object",
            "schema": BoxOffice.model_json_schema(),
        },
    )
    return completion

In [20]:
import utils

html_chunks = utils.html_chunker(table_rows, chunk_size=4)
html_input = utils.html_to_str(html_chunks[0])
output = html_to_json_ai(
    html_input,
    model="mistralai/Mistral-7B-Instruct-v0.1",
    api_key=config["ANYSCALE_API_KEY"],
    url="https://api.endpoints.anyscale.com/v1",
)

In [27]:
print(json.dumps(json.loads(output.choices[0].message.content), indent=4))

{
    "items": [
        {
            "Weekend end date": "January 8, 2023",
            "Film": "Avatar: The Way of Water",
            "Gross": "$45,838,986",
            "Notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for four consecutive weekends each since The Hunger Games: Mockingjay \u2013 Part 2 and Star Wars: The Force Awakens in 2015 and 2016."
        },
        {
            "Weekend end date": "January 15, 2023",
            "Film": "Avatar: The Way of Water",
            "Gross": "$32,824,684",
            "Notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for five consecutive weekends each since Stakeout and Fatal Attraction in 1987."
        },
        {
            "Weekend end date": "January 22, 2023",
            "Film": "Avatar: The Way of Water",
            "Gross": "$20,133,106",
            "Note

In [46]:
html_input = "\n\n".join(list(map(str, [table_rows[0]] + table_rows[10:20])))

In [29]:
print(
    json.dumps(
        json.loads(BoxOffice.from_response(completion).model_dump_json()), indent=4
    )
)

{
    "items": [
        {
            "weekend_end_date": "January 8, 2023",
            "film": "Avatar: The Way of Water",
            "gross": "$45,838,986",
            "notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for four consecutive weekends each since The Hunger Games: Mockingjay \u2013 Part 2 and Star Wars: The Force Awakens in 2015 and 2016."
        },
        {
            "weekend_end_date": "January 15, 2023",
            "film": "Avatar: The Way of Water",
            "gross": "$32,824,684",
            "notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for five consecutive weekends each since Stakeout and Fatal Attraction in 1987."
        },
        {
            "weekend_end_date": "January 22, 2023",
            "film": "Avatar: The Way of Water",
            "gross": "$20,133,106",
            "note

In [48]:
print(
    json.dumps(
        json.loads(BoxOffice.from_response(completion).model_dump_json()), indent=4
    )
)

{
    "items": [
        {
            "weekend_end_date": "March 12, 2023",
            "film": "Scream VI",
            "gross": "$44,447,270",
            "notes": ""
        },
        {
            "weekend_end_date": "March 19, 2023",
            "film": "Shazam! Fury of the Gods",
            "gross": "$30,111,158",
            "notes": ""
        },
        {
            "weekend_end_date": "March 26, 2023",
            "film": "John Wick: Chapter 4",
            "gross": "$73,817,950",
            "notes": ""
        },
        {
            "weekend_end_date": "April 2, 2023",
            "film": "Dungeons & Dragons: Honor Among Thieves",
            "gross": "$37,205,784",
            "notes": ""
        },
        {
            "weekend_end_date": "April 9, 2023",
            "film": "The Super Mario Bros. Movie",
            "gross": "$146,361,865",
            "notes": "The Super Mario Bros. Movie broke Sonic the Hedgehog 2's record ($72.1 million) for the highest weekend

In [38]:
print(completion.choices[0].message.function_call.arguments)

{
  "items": [
    {
      "Weekend end date": "January 8, 2023",
      "Film": "Avatar: The Way of Water",
      "Gross": "$45,838,986",
      "Notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for four consecutive weekends each since The Hunger Games: Mockingjay – Part 2 and Star Wars: The Force Awakens in 2015 and 2016."
    },
    {
      "Weekend end date": "January 15, 2023",
      "Film": "Avatar: The Way of Water",
      "Gross": "$32,824,684",
      "Notes": "Black Panther: Wakanda Forever and Avatar: The Way of Water became the first two films to consecutively top the box office for five consecutive weekends each since Stakeout and Fatal Attraction in 1987."
    },
    {
      "Weekend end date": "January 22, 2023",
      "Film": "Avatar: The Way of Water",
      "Gross": "$20,133,106",
      "Notes": "Avatar: The Way of Water became the first film since Avatar to top the box office for six cons

In [50]:
import tiktoken


def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = (
            4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        )
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print(
            "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613."
        )
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print(
            "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613."
        )
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [54]:
model = "gpt-3.5-turbo"
example_messages = [
    {
        "role": "system",
        "content": "You are a world class algorithm to convert html to structured data",
    },
    {
        "role": "user",
        "content": f"Convert the following HTML string to structured data as per given format",
    },
    {"role": "user", "content": f"HTML input: ```{html_input}```"},
    {
        "role": "user",
        "content": "Tips: Make sure to pay attention to the attributes of the HTML tags, especialy the rowspan attributes",
    },
]
print(
    f"{num_tokens_from_messages(example_messages, model)} prompt tokens counted by num_tokens_from_messages()."
)
print(
    f"{num_tokens_from_messages(completion.choices[0].message.function_call.arguments, model)} generated tokens"
)

922 prompt tokens counted by num_tokens_from_messages().


AttributeError: 'str' object has no attribute 'items'

In [61]:
input_tokens = completion.usage.prompt_tokens
output_tokens = completion.usage.completion_tokens

input_charge = (input_tokens / 1000) * 0.0010
output_charge = (output_tokens / 1000) * 0.0020

print(
    f"Input charge = ${input_charge} | Output charge = ${output_charge} | Total charge = ${input_charge + output_charge}"
)

Input charge = $0.001012 | Output charge = $0.001572 | Total charge = $0.0025840000000000004


In [69]:
def html_chunker(rows: List, chunk_size: int = 10):
    header_row = [rows[0]]
    chunks = [
        header_row + rows[i : i + chunk_size] for i in range(1, len(rows), chunk_size)
    ]

    return chunks


def html_to_str(chunk):
    return "\n\n".join(list(map(str, chunk)))