# Testing LLM by directly sending HTML

In [17]:
import os 
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

dir = os.environ['filepath']
filename = "test.html"
filepath = os.path.join(dir, filename)

with open(filepath, 'r', encoding='utf-8') as file:
    html = file.read()

html

'<div jscontroller="AtSb" class="w7Dbne CR1S4b" data-record-click-time="false" id="tsuid_48" jsdata="zt2wNd;_;A+m88g WDO8Ff;_;A+m88s" jsaction="rcuQ6b:npT2md;e3EWke:kN9HDb" data-hveid="CC4QAA">\n    <div jsname="jXK9ad" class="uMdZh tIxNaf rllt__borderless" jsaction="mouseover:UI3Kjd;mouseleave:Tx5Rb;focusin:UI3Kjd;focusout:Tx5Rb">\n      <div class="VkpGBb">\n        <div class="cXedhc">\n          <a class="vwVdIc wzN8Ac rllt__link a-no-hover-decoration" jsname="kj0dLd" data-cid="11265938073076301333" jsaction="click:h5M12e;" role="link" tabindex="0" data-ved="2ahUKEwjC7OW8romDAxXwFVkFHbmLAEoQ1YkKegQILhAB">\n            <div>\n              <div class="rllt__details">\n                <div class="dbg0pd" aria-level="3" role="heading"><span class="OSrXXb">Houndstooth Coffee</span></div>\n                <div><span><span class="Y0A0hc"><span class="yi40Hd YrbPuc" aria-hidden="true">4.6</span><span class="z3HNkc fUNJzc" aria-label="Rated 4.6 out of 5," role="img"><span style="width:12px

In [18]:
key = os.environ['HF_KEY']

## Using Ollama Model

In [3]:
import json 

filename = "test.html"
with open(filename, 'r', encoding='utf-8') as file:
    html_ = file.read()

html_

'<div jscontroller="AtSb" class="w7Dbne CR1S4b" data-record-click-time="false" id="tsuid_48" jsdata="zt2wNd;_;A+m88g WDO8Ff;_;A+m88s" jsaction="rcuQ6b:npT2md;e3EWke:kN9HDb" data-hveid="CC4QAA">\n    <div jsname="jXK9ad" class="uMdZh tIxNaf rllt__borderless" jsaction="mouseover:UI3Kjd;mouseleave:Tx5Rb;focusin:UI3Kjd;focusout:Tx5Rb">\n      <div class="VkpGBb">\n        <div class="cXedhc">\n          <a class="vwVdIc wzN8Ac rllt__link a-no-hover-decoration" jsname="kj0dLd" data-cid="11265938073076301333" jsaction="click:h5M12e;" role="link" tabindex="0" data-ved="2ahUKEwjC7OW8romDAxXwFVkFHbmLAEoQ1YkKegQILhAB">\n            <div>\n              <div class="rllt__details">\n                <div class="dbg0pd" aria-level="3" role="heading"><span class="OSrXXb">Houndstooth Coffee</span></div>\n                <div><span><span class="Y0A0hc"><span class="yi40Hd YrbPuc" aria-hidden="true">4.6</span><span class="z3HNkc fUNJzc" aria-label="Rated 4.6 out of 5," role="img"><span style="width:12px

In [2]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate

messages = [
    SystemMessage(
        content="""You are an expert in HTML parsing for E-commerce HTML blocks. Extract information about product, its prices and description from HTML in JSON Format along with the related CSS selector or XPaths.
                Remember to return response in JSON format as:
                product: Name of the product
                price: Price of the product
                desc: Information about the product
                img: Images associated with the product
                tag: CSS selectors or Xpaths"""
    ),
    HumanMessage(
        content="{html_}"
    )
]
prompt = ChatPromptTemplate.from_messages(messages)
html = json.dumps(html_, indent=2)

In [3]:
html

'"<div jscontroller=\\"AtSb\\" class=\\"w7Dbne CR1S4b\\" data-record-click-time=\\"false\\" id=\\"tsuid_48\\" jsdata=\\"zt2wNd;_;A+m88g WDO8Ff;_;A+m88s\\" jsaction=\\"rcuQ6b:npT2md;e3EWke:kN9HDb\\" data-hveid=\\"CC4QAA\\">\\n    <div jsname=\\"jXK9ad\\" class=\\"uMdZh tIxNaf rllt__borderless\\" jsaction=\\"mouseover:UI3Kjd;mouseleave:Tx5Rb;focusin:UI3Kjd;focusout:Tx5Rb\\">\\n      <div class=\\"VkpGBb\\">\\n        <div class=\\"cXedhc\\">\\n          <a class=\\"vwVdIc wzN8Ac rllt__link a-no-hover-decoration\\" jsname=\\"kj0dLd\\" data-cid=\\"11265938073076301333\\" jsaction=\\"click:h5M12e;\\" role=\\"link\\" tabindex=\\"0\\" data-ved=\\"2ahUKEwjC7OW8romDAxXwFVkFHbmLAEoQ1YkKegQILhAB\\">\\n            <div>\\n              <div class=\\"rllt__details\\">\\n                <div class=\\"dbg0pd\\" aria-level=\\"3\\" role=\\"heading\\"><span class=\\"OSrXXb\\">Houndstooth Coffee</span></div>\\n                <div><span><span class=\\"Y0A0hc\\"><span class=\\"yi40Hd YrbPuc\\" aria-hidden

In [4]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser

llm = ChatOllama(model="llama2", format="json", temperature=0)
chain = prompt | llm | StrOutputParser()

chain.invoke({"html": html})

'{\n"product": "Smartwatch",\n"price": "$199.99",\n"desc": "<p>Stay connected and on top of your game with this sleek smartwatch. Track your fitness goals, receive notifications, and control your music all from one convenient device.</p>",\n"img": ["https://example.com/smartwatch1.jpg", "https://example.com/smartwatch2.jpg"],\n"tag": ["div.product-info", "span.price", "p.description"]\n}\n\n\n'

## Using Ollama Functions

In [11]:
from langchain_experimental.llms.ollama_functions import OllamaFunctions

model = OllamaFunctions(model="llama2",
                        format="json")
model = model.bind_tools(
    tools=[{
        "name": "get_parsed_html_in_json",
        "description": "Get the parsed HTML in JSON format",
        "parameters":[{
            "product": "Name of the product",
            "price": "Price of the product",
            "desc": "Information about the product",
            "img": "Images associated with the product",
            "tag": "CSS selectors or Xpaths"}]
    }],
    function_call={"name": "get_parsed_html_in_json"},
)
model

RunnableBinding(bound=OllamaFunctions(format='json'), kwargs={'functions': [{'name': 'get_parsed_html_in_json', 'description': 'Get the parsed HTML in JSON format', 'parameters': [{'product': 'Name of the product', 'price': 'Price of the product', 'desc': 'Information about the product', 'img': 'Images associated with the product', 'tag': 'CSS selectors or Xpaths'}]}], 'function_call': {'name': 'get_parsed_html_in_json'}})

In [None]:
model.invoke(html)

In [4]:
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

class Output(BaseModel):
    product: str = Field(description="Name of the product", required=True)
    price: str = Field(description="Price of the product",)
    desc: str = Field(description="Information about the product")
    img: str = Field(description="Images associated with the product")
    tag: str = Field("CSS selectors of Xpaths", required=True)

prompt = PromptTemplate.from_template(
    """You are an expert in HTML parsing for E-commerce HTML blocks. Extract information about product, its prices and description from HTML in JSON Format along with the related CSS selector or XPaths.
Remember to return response in JSON format

Human: {html}
AI: """
)

llm = OllamaFunctions(
    model="llama3",
    format="json",
    temperature=0
)

struct_llm = llm.with_structured_output(Output)
chain = prompt | struct_llm
html = json.dumps(html_, indent=2)
chain.invoke(html)

Output(product='Houndstooth Coffee', price='$$', desc='Cozy hangout for carefully sourced brews', img='https://lh5.googleusercontent.com/p/AF1QipNRZ1ehiInk8CTrHCD08GnhcnF7e4q-1H8Qs8mG=w114-h114-n-k-no', tag='.VkpGBb .cXedhc a.wzN8Ac.rllt__link.a-no-hover-decoration')