In [26]:
# Install the packages listed in requirements.txt. The %pip magic (rather than
# !pip) is used so the install targets the environment of the running kernel.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [7]:
from bs4 import BeautifulSoup

# Saved product page to index, plus the local service endpoints the notebook talks to.
html_file = "Moncler x adidas Originals NMD Padded Boots _ Harrods US.html"
embedding_server = "http://localhost:3001"
generation_server = "http://localhost:3000"
qdrant_server = "http://localhost:6333"

# Decode explicitly instead of relying on the platform's default text encoding,
# which differs between machines and can fail or produce mojibake on the same file.
# NOTE(review): assumes the saved page is UTF-8 -- confirm if other pages are added.
with open(html_file, "r", encoding="utf-8") as f:
    html = f.read()
    
    
def chunk_html(html: str):
  """Split an HTML document into a flat list of chunks.

  The document is parsed with the lxml parser and three kinds of chunk are
  collected, returned in this order:

  1. the text of every <script> tag whose ``type`` attribute contains 'json'
     (one string per tag),
  2. every <meta> tag, as the parsed tag object (not a string),
  3. a single string holding the document text, with each text fragment
     stripped and the fragments joined by newlines.

  Args:
    html: The HTML markup to split.

  Returns:
    A list mixing strings and tag objects, as described above.
  """
  document = BeautifulSoup(html, 'lxml')
  meta_tags = document.find_all("meta")

  # Keep only scripts that declare a JSON-like type; scripts with no type
  # attribute are skipped.
  json_blobs = []
  for script in document.find_all("script"):
    script_type = script.get("type")
    if script_type and 'json' in script_type:
      json_blobs.append(script.get_text())

  page_text = document.get_text(separator='\n', strip=True)

  return json_blobs + meta_tags + [page_text]


In [15]:
import requests

chunks = chunk_html(html)

# One entry per chunk, in chunk order: the decoded JSON body returned by the
# embedding server for that chunk.
embeddings = []

# Reuse a single HTTP session so the connection is not re-established per chunk.
with requests.Session() as session:
  for chunk in chunks:
    payload = {
      "inputs": str(chunk),  # some chunks are parsed <meta> tags, so stringify
      "normalize": True,
      "truncate": False
    }
    # The timeout keeps the cell from hanging forever if the server is unreachable.
    response = session.post(f"{embedding_server}/embed", json=payload, timeout=120)
    # Fail here, on the offending chunk, rather than storing an error body as if
    # it were an embedding and crashing later when the vectors are read.
    response.raise_for_status()
    embeddings.append(response.json())

In [35]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue

# Client for the Qdrant instance configured via `qdrant_server`.
client = QdrantClient(url=qdrant_server)

# Vector settings for the collection.
# NOTE(review): size=1024 presumably matches the embedding model's output
# dimension -- confirm against the embedding server.
harrods_vector_params = VectorParams(size=1024, distance=Distance.DOT)

# (Re)create the "harrods" collection with those settings.
client.recreate_collection(
  collection_name="harrods",
  vectors_config=harrods_vector_params,
)


True

In [36]:
from uuid import uuid4

from uuid import NAMESPACE_URL, uuid5

# Guard against stale kernel state: every chunk must have exactly one embedding
# response, otherwise payloads and vectors would be paired incorrectly.
assert len(embeddings) == len(chunks), "chunks and embeddings are out of sync; re-run the embedding cell"

# Point ids are derived deterministically from the page and the chunk index, so
# re-running this cell overwrites the same points instead of inserting
# duplicates under fresh random ids.
points = [
  PointStruct(
    id=str(uuid5(NAMESPACE_URL, f"{html_file}#{i}")),
    # The embed response appears to be a list of vectors, one per input; a
    # single input was sent per request, so take the first entry.
    vector=embedding[0],
    payload={"content": str(chunk), "page_id": html_file, "chunk_id": i}
  ) for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
]

client.upsert(collection_name="harrods", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [40]:
query = "price of item"

# Embed the query with the same request settings used for the chunks.
query_response = requests.post(
    f"{embedding_server}/embed",
    json={"inputs": query, "normalize": True, "truncate": False},
    timeout=120,  # do not hang the cell if the embedding server is unreachable
)
# Surface HTTP errors here instead of failing on `[0]` with an error body.
query_response.raise_for_status()
query_embedding = query_response.json()[0]

# Nearest-neighbour search restricted to chunks that came from this page.
search_result = client.search(
    collection_name="harrods",
    query_vector=query_embedding,
    query_filter=Filter(
      must=[
        FieldCondition(
          key="page_id",
          match=MatchValue(value=html_file)
        )
      ]
    ),
    limit=5
)

# Print the stored chunk text of each hit.
for result in search_result:
  print(result.payload["content"])

<meta content="USD" data-react-helmet="true" property="product:price:currency"/>
<meta content="out of stock" data-react-helmet="true" property="product:availability"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
{"@context":"http://schema.org/","@type":"Product","name":"Moncler x adidas Originals NMD Padded Boots  | Harrods US ","description":"Moncler black x adidas Originals NMD Padded Boots . Earn Rewards points when you shop and gain access to exclusive benefits.","image":"https://image.harrods.com/moncler-x-adidas-originals-nmd-padded-boots_21980262_47415528_1000.jpg","sku":null,"productID":"21980262","color":"Black","category":"Ankle Boots","brand":{"@type":"Brand","name":"Moncler"},"offers":{"@context":"http://schema.org/","@type":"Offer","url":"https://www.harrods.com/en-us/shopping/moncler-x-adidas-originals-nmd-padded-boots-21980262","priceCurrency":"USD","itemCondition":"http://schema.org/NewCondition","availability":"http://schema.org/OutOfStock","seller":{"@type