In [36]:
import math

import requests
from bs4 import BeautifulSoup

In [37]:



url = 'https://www.daraz.com.np/'  
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}



In [38]:
def ParseHTML(url,headers):
    response = requests.get(url,headers= headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        return soup
        
    else:
        print('Failed to retrieve the page')
        return -1
        

In [39]:
def ParseHTMLString(stringData):
    soup = BeautifulSoup(stringData, 'html.parser')

    return soup



In [40]:
with open('Test.html', 'r') as file:
    # Read the content of the file into a string
    file_content = file.read()

In [41]:
soup = ParseHTMLString(file_content)

In [42]:
soup

ï»¿<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Sample E-commerce Site</title>
</head>
<body>
<header>
<div class="container">
<h1>My E-commerce Site</h1>
<ul>
<li><a href="#">Home</a></li>
<li><a href="#">Products</a></li>
<li><a href="#">About</a></li>
<li><a href="#">Contact</a></li>
</ul>
</div>
</header>
<div class="container">
<h2>Our Products</h2>
<div class="products">
<div class="product">
<img alt="Gel Pen" src="https://via.placeholder.com/150"/>
<h3>Gel Pen</h3>
<p class="price">$2.99</p>
<p class="description">Smooth writing gel pen with a comfortable grip and vibrant ink colors. Perfect for everyday use at school, home, or office.</p>
</div>
<div class="product">
<img alt="Notebook" src="https://via.placeholder.com/150"/>
<h3>Notebook</h3>
<p class="price">$5.99</p>
<p class="description">Durable and stylish notebook with lined pages. Ideal for note-taking, journaling, or ske

In [43]:
def SplitDocumentsByTags(soup):
    chunks = []

    for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6','a']):
        heading_text = heading.get_text()
        sibling = heading.find_next_sibling()
        content = []
     
        content.append(f"{heading_text}")
       # content.append("heading.name " "+heading_text+" ")
        while sibling and sibling.name not in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            
            content.append(f"{sibling}")
            sibling = sibling.find_next_sibling()

        chunks.append((heading_text, " ".join(content)))
    return chunks

In [44]:
SplitDocumentsByTags(soup)

[('My E-commerce Site',
  'My E-commerce Site <ul>\n<li><a href="#">Home</a></li>\n<li><a href="#">Products</a></li>\n<li><a href="#">About</a></li>\n<li><a href="#">Contact</a></li>\n</ul>'),
 ('Home', 'Home'),
 ('Products', 'Products'),
 ('About', 'About'),
 ('Contact', 'Contact'),
 ('Our Products',
  'Our Products <div class="products">\n<div class="product">\n<img alt="Gel Pen" src="https://via.placeholder.com/150"/>\n<h3>Gel Pen</h3>\n<p class="price">$2.99</p>\n<p class="description">Smooth writing gel pen with a comfortable grip and vibrant ink colors. Perfect for everyday use at school, home, or office.</p>\n</div>\n<div class="product">\n<img alt="Notebook" src="https://via.placeholder.com/150"/>\n<h3>Notebook</h3>\n<p class="price">$5.99</p>\n<p class="description">Durable and stylish notebook with lined pages. Ideal for note-taking, journaling, or sketching. Available in various colors and sizes.</p>\n</div>\n<div class="product">\n<img alt="Wireless Mouse" src="https://via.

In [45]:
from transformers import BertTokenizer, BertModel
import torch

In [46]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [47]:
def aggregate_embeddings(chunk_embeddings, method='mean'):
    if method == 'mean':
        document_embedding = torch.mean(chunk_embeddings, axis=1)
    elif method == 'sum':
        document_embedding = torch.sum(chunk_embeddings, axis=1)
    else:
        raise ValueError("Unsupported aggregation method")
    return document_embedding

In [48]:
class EmbeddingString:
    def __init__(self, text, embedding):
        self.text= text
        self.embedding = embedding

In [49]:
embedding_document_mapping = {}


In [50]:
def GetEmbeddings(chunks):
    
    embeddings = []
    #print(chunks)
    for heading, text in chunks:
        #print(text)
        if text == "":
            continue
        inputs = tokenizer(text, return_tensors='pt', padding = 'max_length', truncation=True, max_length=170)
       # print(inputs['input_ids'].shape)
       # print(inputs)
        #print(inputs)
        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_states = outputs.last_hidden_state
            embedding =  aggregate_embeddings(last_hidden_states,'mean')
          #  print(last_hidden_states.shape)
            embdString  = EmbeddingString(text,embedding.squeeze().numpy())
            embeddings.append(embdString)
            #embedding_document_mapping[text] = embeddings
    return embeddings


In [51]:
def createEmbeddingForSingleDocument(document):
    inputs = tokenizer(document, return_tensors='pt', padding = 'max_length', truncation=True, max_length=170)
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        embedding = aggregate_embeddings(last_hidden_states,'mean')
        embedding = embedding.squeeze().numpy()
        
    return EmbeddingString(document, embedding)

In [52]:

chunks = SplitDocumentsByTags(soup)
embeddings = GetEmbeddings(chunks)

In [53]:
import numpy as np

In [54]:
# generate planes 
dim = embeddings[0].embedding.shape[0]
planes = 10
buckets = 2**planes
random_planes_matrix = np.random.normal(size=(planes, dim))
random_planes_matrix


array([[-2.02892776,  0.22278353, -1.99673914, ...,  0.09226371,
        -0.33240089, -0.1852678 ],
       [-1.08179137,  0.40230482, -0.47073182, ...,  1.61335855,
        -0.8152127 ,  0.97805341],
       [ 0.62695472,  0.26476308,  1.18691933, ...,  0.3875985 ,
         0.51829685,  1.52017222],
       ...,
       [ 2.66943648,  0.19885396, -0.69445736, ..., -0.00344338,
        -0.57379568, -0.27860989],
       [-0.60017211, -0.52377117, -1.51855132, ..., -0.19784391,
        -0.59166688, -1.09852489],
       [ 0.39544425, -0.17412466,  1.58139057, ..., -0.36775727,
         0.22088353,  0.87280061]])

In [55]:
def side_of_plane_matrx(P, v):
   # print(P.shape)
    #print(v.shape)
    dotproduct = np.dot(P,v.T)
    sign = np.sign(dotproduct)
    return sign

In [56]:
def hash_table(P_l,v ):
    hash_value = 0
    for i, P in enumerate(P_l):
        sign = side_of_plane_matrx(P, v)
        hash_i = 1 if sign >= 0 else 0
        #print(hash_i)
        hash_value += 2**i * hash_i
        
    return hash_value
        

In [57]:
def createEmbeddingSpace(embeddings, random_planes):
    embeddings_space={n : [] for n in range(0,buckets)}
    
    
    for e in embeddings:
        embeddings_space[hash_table(random_planes,e.embedding)].append(e)
        
    return embeddings_space


In [58]:
embeddings_space = createEmbeddingSpace(embeddings, random_planes_matrix)

In [59]:
#len(embeddings_space[16])

In [60]:
def cosineSimilarity(a,b):
    a_mag = np.linalg.norm(a)
    b_mag = np.linalg.norm(b)
    #print(a.shape)
    #print(b.shape)
    dot_p = np.dot(a,b)
    cosB = dot_p/(a_mag*b_mag)
    #print(cosB)
    return cosB
    
    

In [61]:
#KNN
def KNN(query, embeddings_space, random_planes_matrix):
    queryEmbd = createEmbeddingForSingleDocument(query)
    v = hash_table(random_planes_matrix,queryEmbd.embedding)

    
    possible_embeddings = embeddings_space[v]
   # print(possible_embeddings)
    if len(possible_embeddings) == 0:
        print("Sorry couldnt find the item")
        return -1
    neareast_embd = possible_embeddings[0]
    for p in possible_embeddings:
        current = cosineSimilarity(p.embedding,queryEmbd.embedding)
        prev = cosineSimilarity(neareast_embd.embedding,queryEmbd.embedding)
        if current < prev:
            neareast_embd = p
        
    return neareast_embd.text
        
    
    
    
    

In [62]:
def SimmilarText(query,embeddings, random_planes_matrix,thres=0.75):
    queryEmbd = createEmbeddingForSingleDocument(query)
    v = hash_table(random_planes_matrix,queryEmbd.embedding)


    possible_embeddings = embeddings
    if len(possible_embeddings) == 0:
        print("Sorry couldnt find the item")
        return -1
   # neareast_embd = possible_embeddings[0]
    close_embd =[]
    for p in possible_embeddings:
        current = cosineSimilarity(p.embedding,queryEmbd.embedding)
        if current > thres:
            close_embd.append(p.text)
        #prev = cosineSimilarity(neareast_embd.embedding,queryEmbd.embedding)
        # if current < prev:
        #     neareast_embd = p

    return close_embd



In [63]:
simmilar_text = SimmilarText("something <p class=\"price\">$12.39</p> <p class=\"description\">This item is very good and of various sizes.</p>",embeddings,random_planes_matrix,0.8)

In [64]:
def Group(query,embeddings_space, random_planes_matrix):
    queryEmbd = createEmbeddingForSingleDocument(query)
    v = hash_table(random_planes_matrix,queryEmbd.embedding)


    possible_embeddings = embeddings_space[v]
   
    x = [i.text for i in possible_embeddings]
    return x

In [65]:
data = Group("something <p class=\"price\">$81.99</p> <p class=\"description\">Nice product. Features a leak-proof cap and is perfect for staying hydrated on the go.</p>",embeddings_space,random_planes_matrix)
data

[]

In [66]:
import re

In [67]:
def extract_product_info(text):
    # Define regex patterns for product name, price, and description
    soup = BeautifulSoup(text, 'html.parser')

    # Extract the name (text before the first tag)
    name = soup.get_text(strip=True, separator=" ").split("$")[0].strip()
    
    # Extract the price
    price = soup.find('p').get_text(strip=True)
    
    # Extract the description
    description = soup.find('p').get_text(strip=True)

    return {
        'name': name,
        'price': price,
        'description': description
    }

In [68]:
extract_product_info( 'Wireless Mouse <p class="price">$14.99</p> <p class="description">Ergonomic wireless mouse with high precision and long battery life. Compatible with Windows and MacOS. Perfect for work and gaming.</p>')

{'name': 'Wireless Mouse', 'price': '$14.99', 'description': '$14.99'}

In [69]:
products_info=[]
for i in simmilar_text:
    products_info.append(extract_product_info(i))

In [70]:
products_info

[{'name': 'Gel Pen', 'price': '$2.99', 'description': '$2.99'},
 {'name': 'Notebook', 'price': '$5.99', 'description': '$5.99'},
 {'name': 'Wireless Mouse', 'price': '$14.99', 'description': '$14.99'},
 {'name': 'Bluetooth Speaker', 'price': '$29.99', 'description': '$29.99'},
 {'name': 'Coffee Mug', 'price': '$9.99', 'description': '$9.99'},
 {'name': 'Smartphone Case', 'price': '$12.99', 'description': '$12.99'},
 {'name': 'Water Bottle', 'price': '$8.99', 'description': '$8.99'},
 {'name': 'Backpack', 'price': '$49.99', 'description': '$49.99'},
 {'name': 'Desk Lamp', 'price': '$24.99', 'description': '$24.99'},
 {'name': 'Headphones', 'price': '$59.99', 'description': '$59.99'},
 {'name': 'Fitness Tracker', 'price': '$39.99', 'description': '$39.99'}]