In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from bs4 import BeautifulSoup
import requests
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from uuid import uuid4

# Saved product page (MHTML archive). The same string is also stored as the
# "page_id" payload of every indexed chunk and used to filter searches.
html_file = "Moncler x adidas Originals NMD Padded Boots _ Harrods US.mhtml"
# Base URLs of the local services this notebook talks to: the embedding
# server is called at /embed, the generation server at /generate.
embedding_server = "http://localhost:3001"
generation_server = "http://localhost:3000"
qdrant_server = "http://localhost:6333"

client = QdrantClient(url=qdrant_server)

# Embeddings are requested with "normalize": True, so dot-product ranking is
# equivalent to cosine similarity here.
# NOTE(review): size=1024 presumably matches the embedding model's output
# dimension - confirm against the model served at `embedding_server`.
# NOTE(review): recreate_collection presumably drops an existing "harrods"
# collection, so re-running this cell would discard indexed points - confirm.
client.recreate_collection(collection_name="harrods", vectors_config=VectorParams(size=1024, distance=Distance.DOT))


True

In [11]:
import email
from email import policy
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def extract_html_from_mhtml(file_path):
    """Return the content of the first ``text/html`` part of an .mhtml file.

    The file is parsed as a MIME message and its parts are visited in
    ``walk()`` order. Returns ``None`` when no part is an HTML document.
    """
    with open(file_path, 'rb') as archive:
        mime_message = email.message_from_binary_file(archive, policy=policy.default)

        # Lazily scan the parts so we stop at the first HTML document.
        html_parts = (
            section
            for section in mime_message.walk()
            if section.get_content_type() == 'text/html'
        )
        first_html = next(html_parts, None)
        return first_html.get_content() if first_html is not None else None

# Pull the HTML document out of the saved MHTML archive named in `html_file`.
html = extract_html_from_mhtml(html_file)

def chunk_text(text, max_tokens=512):
    """Split ``text`` into space-joined chunks of at most ``max_tokens`` tokens.

    Tokens come from ``word_tokenize``. Every chunk except possibly the last
    holds exactly ``max_tokens`` tokens; a trailing partial chunk is kept.
    """
    def _windows(words):
        # Emit a window as soon as it reaches the size limit, then start over.
        window = []
        for word in words:
            window.append(word)
            if len(window) >= max_tokens:
                yield window
                window = []
        # Whatever is left over forms the final, shorter chunk.
        if window:
            yield window

    return [' '.join(window) for window in _windows(word_tokenize(text))]
    
def chunk_html(html: str):
  """Break an HTML document into pieces to embed.

  The returned list holds, in order: the text of every <script> whose
  ``type`` attribute contains 'json', every <meta> tag (as the tag objects
  returned by ``find_all``), and the page's visible text split into
  256-token chunks by ``chunk_text``.
  """
  document = BeautifulSoup(html, 'lxml')

  meta_tags = document.find_all("meta")
  script_tags = document.find_all("script")

  # Keep only scripts that declare a JSON-like type (e.g. structured data).
  json_blobs = []
  for script in script_tags:
    script_type = script.get("type")
    if script_type and 'json' in script_type:
      json_blobs.append(script.get_text())

  page_text = document.get_text(separator='\n', strip=True)
  text_chunks = chunk_text(page_text, max_tokens=256)

  return json_blobs + meta_tags + text_chunks

# Mixed list of JSON script texts, <meta> tags and plain-text chunks for the page.
chunks = chunk_html(html)

[nltk_data] Downloading package punkt to /home/shawn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def get_embedding(text: str):
  """Embed one piece of content via the embedding server's ``/embed`` endpoint.

  Args:
    text: Content to embed. It is passed through ``str()`` first, so
      non-string chunks (such as tag objects) are accepted.

  Returns:
    The decoded JSON response body. Callers in this notebook read the
    vector from index ``[0]`` of that value.

  Raises:
    requests.HTTPError: If the server answers with an error status.
  """
  payload = {
    "inputs": str(text),
    "normalize": True,
    "truncate": False
  }
  response = requests.post(f"{embedding_server}/embed", json=payload)
  # Surface HTTP errors here instead of returning the error body as if it
  # were an embedding, which would only fail later when it is indexed.
  response.raise_for_status()
  return response.json()
 
# One embedding-server response per chunk, in the same order as `chunks`.
embeddings = [get_embedding(chunk) for chunk in chunks]

In [14]:
def index_for_search(collection_name:str, chunks, embeddings):
  """Upsert every chunk and its embedding into a Qdrant collection.

  Args:
    collection_name: Target collection.
    chunks: Chunk contents; each is stored as ``str(chunk)`` in the payload.
    embeddings: One embedding-server response per chunk, aligned with
      ``chunks``; the vector is read from index ``[0]`` of each response.

  Raises:
    ValueError: If ``chunks`` and ``embeddings`` differ in length, since the
      payloads would otherwise be paired with the wrong vectors.
  """
  if len(chunks) != len(embeddings):
    raise ValueError(
      f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) must have the same length"
    )
  if not chunks:
    # Nothing to index; skip the round trip to Qdrant.
    return

  # Build typed points rather than raw dicts so the client does not have to
  # coerce them. Each point gets a random UUID; the payload records the
  # source page and the chunk's position so results can be filtered by page.
  points = [
    PointStruct(
      id=str(uuid4()),
      vector=embedding[0],
      payload={"content": str(chunk), "page_id": html_file, "chunk_id": chunk_id},
    )
    for chunk_id, (chunk, embedding) in enumerate(zip(chunks, embeddings))
  ]

  client.upsert(collection_name=collection_name, points=points)

# Store the page's chunks and their vectors in the "harrods" collection.
index_for_search("harrods", chunks, embeddings)

In [15]:
def search(query: str, page_id: str, limit: int = 5, collection_name: str = "harrods"):
  """Find the indexed chunks of one page that best match ``query``.

  Args:
    query: Natural-language query to embed and search with.
    page_id: Only points whose ``page_id`` payload equals this value are
      considered.
    limit: Maximum number of hits to return.
    collection_name: Qdrant collection to search; defaults to "harrods".

  Returns:
    Whatever ``client.search`` returns for the filtered query.
  """
  # Reuse the shared embedding helper so queries and indexed chunks are
  # always embedded with the same request parameters.
  query_embedding = get_embedding(query)[0]

  page_filter = Filter(
    must=[
      FieldCondition(
        key="page_id",
        match=MatchValue(value=page_id)
      )
    ]
  )

  return client.search(
      collection_name=collection_name,
      query_vector=query_embedding,
      query_filter=page_filter,
      limit=limit
  )

In [104]:
from jinja2 import Environment, FileSystemLoader, select_autoescape
import json
# Jinja environment that loads templates from the local "templates" directory.
# NOTE(review): select_autoescape() is called with its defaults; whether that
# escapes output for a ".jinja" file should be confirmed, since HTML-escaping
# would alter prompt text.
env = Environment(
    loader=FileSystemLoader("templates"),
    autoescape=select_autoescape()
)

# Chat template rendered by the prompt-building functions below.
template = env.get_template('chat.jinja')

def get_extraction_prompt(prompt: str, context: str):
  """Render the chat prompt that asks the model to extract a product detail.

  Args:
    prompt: The question to answer.
    context: Retrieved page snippets, appended to the question after a
      blank line.

  Returns:
    The rendered ``chat.jinja`` template as a string, with the generation
    prompt added.
  """
  chat = [
    {
      "role": "system",
      "content": "extract relevant product details from web snippets. Always answer with ONLY a JSON object that has ONLY a single key. Never include any other information in your answer."
    },
    {
      "role": "user",
      "content": prompt + "\n\n" + context
    }
  ]

  # "<\s>" was an invalid escape sequence that produced a literal backslash;
  # the end-of-sequence marker is written "</s>".
  # NOTE(review): assumes the served model uses "</s>" as its EOS token -
  # confirm against the model's tokenizer configuration.
  return template.render(messages=chat, add_generation_prompt=True, eos_token="</s>")

def get_normalize_prompt(prompt: str):
  """Render the chat prompt that asks the model to rewrite content as prose.

  Args:
    prompt: The content to rewrite (for example raw or JSON-encoded model
      output).

  Returns:
    The rendered ``chat.jinja`` template as a string, with the generation
    prompt added.
  """
  chat = [
    {
      "role": "system",
      "content": "Rewrite the content as prose. Include all of the details present."
    },
    {
      "role": "user",
      "content": prompt
    }
  ]

  # "<\s>" was an invalid escape sequence that produced a literal backslash;
  # the end-of-sequence marker is written "</s>".
  # NOTE(review): assumes the served model uses "</s>" as its EOS token -
  # confirm against the model's tokenizer configuration.
  return template.render(messages=chat, add_generation_prompt=True, eos_token="</s>")


def generate(prompt: str, generate_params: dict = None):
  """Send a prompt to the generation server and return the generated text.

  Args:
    prompt: Fully rendered prompt string.
    generate_params: Optional generation parameters forwarded as the
      request's "parameters" field; an empty dict is sent when omitted.

  Returns:
    The "generated_text" field of the server's JSON response.

  Raises:
    KeyError: If the response has no "generated_text" field. The response
      is printed and included in the error message to aid debugging.
  """
  payload = {
    "inputs": prompt,
    # Build a fresh dict per call instead of sharing one mutable default.
    "parameters": {} if generate_params is None else generate_params
  }
  response = requests.post(f"{generation_server}/generate", json=payload).json()
  if "generated_text" not in response:
    # Keep the printed diagnostic: callers may catch the exception below.
    print(response)
    raise KeyError(f"generation server response has no 'generated_text': {response!r}")
  return response["generated_text"]


def ask_question(question: str, page_id: str, limit: int = 5):
  """Answer a question about one indexed page using retrieval + generation.

  The top ``limit`` chunks for ``page_id`` are retrieved and given to the
  model, which is asked for a single-key JSON object:

  * exactly one key: that key's value is returned as-is;
  * several keys: the object is rewritten as prose by a second model call;
  * output that is not a JSON object: the raw text is rewritten as prose.

  Args:
    question: The question to ask.
    page_id: Page whose chunks are searched.
    limit: Number of chunks to retrieve as context.

  Returns:
    The extracted value, a prose string, or an apology string when nothing
    was retrieved or the model returned an empty object.
  """
  no_answer = "Sorry, I don't know the answer to that question."

  search_results = search(question, page_id, limit=limit)
  if not search_results:
    return no_answer

  context = "\n".join(hit.payload["content"] for hit in search_results)
  # Generation stops at the first "}" so the model emits one flat JSON object.
  raw_answer = generate(
    get_extraction_prompt(question, context),
    generate_params={"best_of": 1, "stop": ["}"], "temperature": 0.1, "max_new_tokens": 256},
  ).strip()

  # Only JSON parsing is guarded: failures of the generation server should
  # propagate rather than silently trigger another generation request.
  try:
    parsed = json.loads(raw_answer)
  except json.JSONDecodeError:
    parsed = None

  if isinstance(parsed, dict):
    if len(parsed) == 1:
      return next(iter(parsed.values()))
    if not parsed:
      # An empty object means the model extracted nothing.
      return no_answer
    to_rewrite, temperature = json.dumps(parsed), 0.2
  else:
    # Not a JSON object (invalid JSON, or a bare list/number/string): always
    # hand the original text to the rewrite prompt, never a parsed value.
    to_rewrite, temperature = raw_answer, 0.3

  rewritten = generate(
    get_normalize_prompt(to_rewrite),
    generate_params={"best_of": 1, "temperature": temperature, "max_new_tokens": 256},
  )
  return rewritten.strip()

In [108]:
def interrogate_page(page_id: str):
  """Ask a fixed series of product questions about one indexed page.

  The product name is asked for first and is interpolated into every later
  question; the price answer is additionally used in the currency question.

  Returns:
    A dict with the keys "name", "manufacturer", "price", "currency",
    "description" and "is_available_online", each holding the value
    returned by ``ask_question``.
  """
  product = ask_question("What is the name of the product?", page_id)
  cost = ask_question(f"What is the price of {product}", page_id)

  # Asked in this order; the dict below only rearranges the results.
  follow_ups = {
    "currency": ask_question(f"What currency is the price of {product} ({cost})", page_id),
    "description": ask_question(f"Official description of {product}.", page_id, limit=7),
    "is_available_online": ask_question(f"Is {product} available to purchase online? Answer only yes or no.", page_id),
    "manufacturer": ask_question(f"Who is the manufacturer of {product}?", page_id),
  }

  return {
    "name": product,
    "manufacturer": follow_ups["manufacturer"],
    "price": cost,
    "currency": follow_ups["currency"],
    "description": follow_ups["description"],
    "is_available_online": follow_ups["is_available_online"],
  }
  
# Run the full question series against the indexed page; the returned dict is
# displayed as the cell output.
interrogate_page(html_file)

{'name': 'Moncler 999 x adidas Originals NMD Padded Boots',
 'manufacturer': 'Moncler x adidas Originals',
 'price': 637,
 'currency': 'USD',
 'description': "The collaboration between Moncler and adidas Originals has resulted in a groundbreaking partnership that blurs the lines between two iconic brands. The Moncler x adidas Originals NMD Padded Boots are a testament to this union, as they seamlessly combine the distinctive elements of both labels. The boots feature the instantly recognizable NMD sneaker silhouette from adidas, which is then expertly fused with Moncler's signature padded finish. This unique blend of styles is a testament to the meticulous attention to detail that has gone into this collaboration.\n\nUnfortunately, these boots are only available in-store, but they are definitely worth checking out. The price tag is set at $637, but customers can also earn rewards points when they make a purchase, which unlock exclusive benefits. This collaboration is a true testament t