In [2]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
import openai
from openai import OpenAI
import os


template = (
    'You are tasked with extracting specific information from the following text content: {dom_chunks}.'
    'Please follow these instructions carefully: \n\n'
    ' 1. ** Extract Information** : Only extract the information that directly matches the provided description {parse_description}'
    ' 2. ** No Extra Content** : Do not include any additional text, comments or explanations in your response.'
    ' 3. ** Empty Response** : If no information matches the description return to an empty string'
    ' 4. ** Direct Data** : Your response should contain only data that is explicitly requested, with no other text.'
    ' 5. ** Format** : Whenever possible always provide your data as a table with rows and columns.'
)

model = OllamaLLM(model='llama3.2')

def parse_with_ollama(dom_chunks, parse_description):
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    for i, chunk in enumerate(dom_chunks, start = 1):
        response = chain.invoke({'dom_chunks' : chunk, 'parse_description' : parse_description })

        print(f'parse batch {i} of {len(dom_chunks)}')
        parsed_results.append(response)

    return '\n'.join(parsed_results)


def parse_with_chatgpt(dom_chunks, parse_description):
       """
       Sends chunks of website content to OpenAI's GPT API for parsing based on a description.
       """

       # Loop through chunks and get responses
       parsed_results = []
       for i, chunk in enumerate(dom_chunks, start=1):
           prompt = template.format(dom_chunks=chunk, parse_description=parse_description)
           print(f"Processing chunk {i}/{len(dom_chunks)}...")

           try:
               client = OpenAI(
                    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)
               response = client.chat.completions.create(
                   model="gpt-3.5-turbo",
                   messages=[
                       {"role": "system", "content": "You are an advanced data extraction assistant."},
                       {"role": "user", "content": prompt}
                   ],
                   temperature=0.0  # For deterministic responses
               )

               # Extract the content of the assistant's reply
               parsed_results.append(response)
                # parsed_results.append(response['choices'][0]["message"]["content"])
           except Exception as e:
               print(f"Error processing chunk {i}: {e}")
               parsed_results.append("")  # Append an empty string if there's an error

       return parsed_results
# return "\n".join(parsed_results)


In [3]:
dom_chunk = 'Old, ham, winter, christmas, banana chips'
parse_description = 'Please make up a short 5 sentence story based on the words provided previously'

In [4]:
result = parse_with_chatgpt(dom_chunk,parse_description)



Processing chunk 1/41...
Processing chunk 2/41...
Processing chunk 3/41...
Processing chunk 4/41...
Processing chunk 5/41...
Processing chunk 6/41...
Processing chunk 7/41...
Processing chunk 8/41...
Processing chunk 9/41...
Processing chunk 10/41...
Processing chunk 11/41...
Processing chunk 12/41...
Processing chunk 13/41...
Processing chunk 14/41...
Processing chunk 15/41...
Processing chunk 16/41...
Processing chunk 17/41...
Processing chunk 18/41...
Processing chunk 19/41...
Processing chunk 20/41...
Processing chunk 21/41...
Processing chunk 22/41...
Processing chunk 23/41...
Processing chunk 24/41...
Processing chunk 25/41...
Processing chunk 26/41...
Processing chunk 27/41...
Processing chunk 28/41...
Processing chunk 29/41...
Processing chunk 30/41...
Processing chunk 31/41...
Processing chunk 32/41...
Processing chunk 33/41...
Processing chunk 34/41...
Processing chunk 35/41...
Processing chunk 36/41...
Processing chunk 37/41...
Processing chunk 38/41...
Processing chunk 39/4

In [25]:
result[0].choices[0].message.content

'| Sentence |\n|----------|\n| Once upon a time, there was a young girl named O. She was given specific instructions to follow carefully. O was tasked with extracting specific information from text content. She created a short 5 sentence story based on the provided words. O made sure to include only the information that matched the description. In the end, O successfully completed the task without any extra content. |'

In [7]:
result

[ChatCompletion(id='chatcmpl-ATsuX6Saagoln5fjladGquEd1Mphz', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='| Sentence |\n|----------|\n| Once upon a time, there was a young girl named O. She was given specific instructions to follow carefully. O was tasked with extracting specific information from text content. She created a short 5 sentence story based on the provided words. O made sure to include only the information that matched the description. In the end, O successfully completed the task without any extra content. |', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1731685953, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=82, prompt_tokens=160, total_tokens=242, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_predict