<a href="https://colab.research.google.com/github/Omarkouta73/RAG-Applications-In-Web-Data-Extraction/blob/main/Text/RAG_SemanticChunking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# !pip install faiss-cpu
# !pip install selenium
# !pip install google-generativeai
# !pip install -qU langchain-text-splitters
# !pip install langchain_experimental
#!pip install "unstructured[all-docs]"

In [None]:
import requests
from bs4 import BeautifulSoup
import faiss
import numpy as np
import torch
from transformers import pipeline
import google.generativeai as genai
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import HTMLSemanticPreservingSplitter
from unstructured.partition.html import partition_html
from langchain.embeddings import HuggingFaceEmbeddings
import nltk
from nltk.corpus import stopwords
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import random

# Pick the torch device string once for the whole notebook: CUDA when PyTorch
# can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Step 1: Fetch HTML content
def get_cleaned_text(html_content):
    """Parse HTML and prune elements that are not wanted downstream.

    Despite the name, the return value is the pruned ``BeautifulSoup`` tree,
    not a plain string.

    Args:
        html_content: HTML markup, parsed with the built-in ``html.parser``.

    Returns:
        The ``BeautifulSoup`` object after the pruning passes below.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop the customer-reviews container when the page has one.
    reviews_block = soup.find('div', id='customer-reviews_feature_div')
    if reviews_block is not None:
        reviews_block.decompose()

    # Both lists are matched against tag *names* by find_all; the second list
    # is labelled "ad-related keywords" but is not matched against class/id.
    boilerplate_tags = ['script', 'style', 'iframe', 'noscript', 'footer', 'header', 'a']
    ad_keywords = ['ad', 'banner', 'promo', 'footer', 'sponsor', 'select', 'button']
    for tag_names in (boilerplate_tags, ad_keywords):
        for node in soup.find_all(tag_names):
            node.decompose()

    # Finally discard every element whose stripped text is empty.
    for node in soup.find_all():
        if node.get_text(strip=True):
            continue
        node.decompose()

    return soup

def fetch_html(url):
    """Load ``url`` in headless Chrome and return the cleaned page.

    A random desktop user agent is chosen per call, the browser is configured
    for the ``ar-EG`` locale, and the geolocation is overridden to the
    coordinates given below (Cairo, Egypt). The call sleeps for a random
    5-10 seconds before navigating.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``get_cleaned_text`` returns for the rendered page source.
    """
    # List of user agents
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    ]

    # Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"user-agent={random.choice(user_agents)}")

    # Set location to Egypt
    chrome_options.add_argument("--lang=ar-EG")

    # Add geolocation for Cairo, Egypt
    chrome_options.add_argument("--enable-geolocation")
    chrome_options.add_argument("--use-fake-ui-for-media-stream")
    chrome_options.add_argument("--use-fake-device-for-media-stream")

    # All preferences go into ONE dict: add_experimental_option stores the
    # value under the "prefs" key, so a second call would replace the first
    # and silently drop "intl.accept_languages".
    chrome_options.add_experimental_option("prefs", {
        "intl.accept_languages": "ar-EG,ar",
        "profile.default_content_settings.geolocation": 1,
        "profile.managed_default_content_settings.geolocation": 1,
        "profile.default_content_setting_values.geolocation": 1
    })

    service = Service()  # Windows
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Always shut the browser down, even when navigation fails; otherwise
    # every call leaves a Chrome process behind.
    try:
        driver.execute_cdp_cmd("Emulation.setGeolocationOverride", {
            "latitude": 30.0444,
            "longitude": 31.2357,
            "accuracy": 100
        })

        time.sleep(random.uniform(5, 10))

        driver.get(url)

        page_source = driver.page_source
    finally:
        driver.quit()

    return get_cleaned_text(page_source)

def read_html(dir):
    """Return the full contents of the UTF-8 text file at ``dir``.

    Args:
        dir: Path of the file to read (a file path, despite the name).

    Returns:
        The file contents as a single string.
    """
    with open(dir, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [None]:
def langchain_SC(text, number_of_chunks=16, breakpoint_threshold_type="standard_deviation", model="all-MiniLM-L6-v2"):
  """Split ``text`` with LangChain's ``SemanticChunker``.

  Args:
      text: The text (here: the page markup as a string) to split.
      number_of_chunks: Forwarded to ``SemanticChunker``.
      breakpoint_threshold_type: Forwarded to ``SemanticChunker``.
      model: Model name handed to ``HuggingFaceEmbeddings``.

  Returns:
      The result of ``splitter.create_documents([text])``; callers in this
      notebook read ``.page_content`` from each item.
  """
  embedding_model = HuggingFaceEmbeddings(model_name=model)
  # Forward the caller's settings; previously both were hard-coded to their
  # default values, so passing different arguments had no effect.
  splitter = SemanticChunker(
      embedding_model,
      number_of_chunks=number_of_chunks,
      breakpoint_threshold_type=breakpoint_threshold_type,
  )
  return splitter.create_documents([text])


def langchain_HSC(text):
  """Split HTML with ``HTMLSemanticPreservingSplitter`` using fixed settings.

  Args:
      text: HTML markup to split.

  Returns:
      Whatever ``splitter.split_text(text)`` returns.
  """
  splitter_settings = {
      # Split on top- and second-level headings.
      "headers_to_split_on": [("h1", "Header 1"), ("h2", "Header 2")],
      "max_chunk_size": 500,
      "elements_to_preserve": ["table", "ul", "li", "ol", "p"],
      "chunk_overlap": 0,
      "denylist_tags": ["script", "style", "head"],
      "normalize_text": True,
      "preserve_images": True,
      "preserve_links": True,
      "stopword_removal": False,
  }
  splitter = HTMLSemanticPreservingSplitter(**splitter_settings)
  return splitter.split_text(text)

# Make sure the NLTK stop-word corpus is available before it is read.
nltk.download('stopwords')
# English stop words, held in a set for membership tests.
stop_words = set(stopwords.words('english'))
def remove_stop_words(text):
  """Return ``text`` with every whitespace-separated stop word removed.

  Matching is exact and case-sensitive against ``stop_words``; the surviving
  words are re-joined with single spaces.
  """
  kept_words = []
  for word in text.split():
    if word in stop_words:
      continue
    kept_words.append(word)
  return ' '.join(kept_words)

def unstructured_ch(text, chunking_strategy="by_title"):
    """Chunk HTML with ``unstructured`` and normalise each chunk's text.

    Args:
        text: HTML markup passed to ``partition_html`` as ``text=``.
        chunking_strategy: Forwarded to ``partition_html``.

    Returns:
        A list of strings: each chunk's ``.text`` lower-cased and with stop
        words removed via ``remove_stop_words``.
    """
    elements = partition_html(text=text, chunking_strategy=chunking_strategy)
    normalised = []
    for element in elements:
        lowered = element.text.lower()
        normalised.append(remove_stop_words(lowered))
    return normalised


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Step 3: Text Embedding
def embed_text(text, model):
    """Embed ``text`` as a single vector by mean-pooling the model output.

    Args:
        text: String to embed.
        model: Callable invoked as ``model(text, return_tensors="pt")``; in
            this notebook a transformers ``feature-extraction`` pipeline.

    Returns:
        A NumPy array: element ``[0]`` of the model output averaged over its
        first axis (presumably the token axis -- TODO confirm).
    """
    # Take the first item of the model output and place it on the notebook's
    # selected device.
    token_embeddings = model(text, return_tensors="pt")[0]
    token_embeddings = token_embeddings.to(device)

    # Mean-pool over the first axis, then return to CPU/NumPy for FAISS.
    pooled = token_embeddings.mean(dim=0)
    return pooled.detach().cpu().numpy()

In [None]:
# Step 4: Query Embedding and Similarity Search
def search_similar_chunks(query, chunks, embeddings, model, top_k=3):
    """Return the chunks whose embeddings are nearest to the query embedding.

    A fresh ``faiss.IndexFlatL2`` is built from ``embeddings`` on every call.

    Args:
        query: Query string, embedded with ``embed_text(query, model)``.
        chunks: Sequence of chunks, aligned index-for-index with ``embeddings``.
        embeddings: 2-D array with one row per chunk.
        model: Embedding model forwarded to ``embed_text``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` items of ``chunks``, nearest first. Fewer are returned
        when fewer chunks exist; an empty list when there are none.
    """
    # Never ask FAISS for more neighbours than there are vectors: it pads the
    # result with index -1, which would silently select chunks[-1].
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    # FAISS expects contiguous float32 input.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    query_embedding = embed_text(query, model)
    query_vector = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    distances, indices = index.search(query_vector, k)
    # Defensive: skip any padding index FAISS may still return.
    return [chunks[i] for i in indices[0] if i >= 0]

In [None]:
# Step 5: LLM Processing with Gemini
def extract_info_with_llm(chunks, prompt):
    """Ask Gemini to answer ``prompt`` from the given context.

    Args:
        chunks: Either a single string of context or an iterable of strings.
            An iterable is joined with single spaces; a plain string is used
            as-is.
        prompt: The extraction instruction inserted into the request.

    Returns:
        The ``text`` attribute of the Gemini response.
    """
    import os  # local import: only needed for the API-key lookup below

    # A bare string must not go through " ".join(): joining a str iterates
    # its characters and would insert a space between every one of them.
    if isinstance(chunks, str):
        combined_chunks = chunks
    else:
        combined_chunks = " ".join(chunks)

    # Initialize Gemini. Prefer the GEMINI_API_KEY environment variable so the
    # key does not have to be written into the notebook; the placeholder is
    # kept as the fallback.
    genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY"))

    # Use the Gemini model
    model = genai.GenerativeModel('gemini-2.0-flash')

    # Generate a response (output limited to 500 tokens)
    response = model.generate_content(f"""using these information: {combined_chunks} make your best to {prompt}
                                      , and provide it in ```json ``` format.
                                      give me 'NAN' if there is no direct answer in the provided information.""",
                                      generation_config={"max_output_tokens": 500})

    # Return the generated text
    return response.text

In [None]:
# Example Usage
# Product page to scrape. fetch_html drives headless Chrome through Selenium
# and returns the output of get_cleaned_text; later cells read `html` as a
# notebook-level global.
url = "https://www.amazon.com/A315-24P-R7VH-Display-Quad-Core-Processor-Graphics/dp/B0BS4BP8FB/ref=sr_1_3?sr=8-3"
html = fetch_html(url)

In [None]:
#html = read_html("/content/page.html")

In [None]:
# Run every chunking technique once on the page markup and keep the results
# keyed by a display label; prompt() iterates over this mapping.
techniques = {
    label: chunker(str(html))
    for label, chunker in (
        ("chunks_SC", langchain_SC),
        ("chunks_HSC", langchain_HSC),
        ("Unstructured", unstructured_ch),
    )
}

In [None]:
# Embedding models: two Hugging Face feature-extraction pipelines built with
# the same tokenizer settings. They are passed to prompt() / embed_text() to
# embed chunks and queries.
# Baseline: DistilBERT
model1 = pipeline('feature-extraction', model='distilbert-base-uncased', padding=True, truncation=True, add_special_tokens = True)

# Option 1: Better pre-trained model
model2 = pipeline('feature-extraction',
                 model='sentence-transformers/all-MiniLM-L6-v2',
                 padding=True,
                 truncation=True,
                 add_special_tokens=True)

# Option 2: Another good alternative
# model3 = pipeline('feature-extraction',
#                  model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
#                  padding=True,
#                  truncation=True,
#                  add_special_tokens=True)

Device set to use cpu
Device set to use cpu


# Manual

In [None]:
#chunks = techniques['Text Splitter']; chunks #ch_con = [chunk.page_content for chunk in chunks];

In [None]:
#embeddings = np.array([embed_text(chunk, model1) for chunk in chunks])

In [None]:
# Query
#query = "Extract product price"
#similar_chunks = search_similar_chunks(query, chunks, embeddings, model1, top_k=3)

In [None]:
#similar_chunks

# Automatically

In [None]:
def prompt(query, model, k=3):
  """Answer ``query`` without chunking and once per chunking technique.

  Reads the notebook globals ``html`` (the fetched page) and ``techniques``
  (label -> list of chunks). For each run the LLM response and the elapsed
  wall-clock time in milliseconds are printed.

  Args:
      query: The extraction instruction; also used as the retrieval query.
      model: Embedding model forwarded to ``embed_text`` and
          ``search_similar_chunks``.
      k: Number of most similar chunks handed to the LLM per technique.

  Returns:
      None. All results are printed.
  """
  instruction = query
  start_time = time.time()
  # Wrap the page in a list: extract_info_with_llm joins its first argument
  # with spaces, and joining a bare string would space out every character.
  no_chunk_result = extract_info_with_llm([str(html)], instruction)
  no_chunk_time = (time.time() - start_time) * 1000  # Convert to milliseconds
  print(f"No-Chunking: {no_chunk_result}")
  print(f"Time: {no_chunk_time:.2f} ms")
  print("==============================================================================")
  # Measure each chunking technique
  for key, chunks in techniques.items():
      start_time = time.time()

      print(key,":")
      # A technique that produced nothing cannot be embedded or searched.
      if not chunks:
          print("Chunks: ", chunks)
          print("No chunks produced; skipping.")
          print("==============================================================================")
          continue

      # Non-string chunks are document objects; use their text content.
      if not isinstance(chunks[0], str):
          chunks = [chunk.page_content for chunk in chunks]
      print("Chunks: ", chunks)

      embeddings = np.array([embed_text(chunk, model) for chunk in chunks])
      print("Embeddings Shape: ", embeddings.shape)

      similar_chunks = search_similar_chunks(query, chunks, embeddings, model, top_k=k)
      print("Similar Chunks: ", similar_chunks)

      result = extract_info_with_llm(similar_chunks, instruction)
      execution_time = (time.time() - start_time) * 1000  # Convert to milliseconds

      print("Response:", result)
      print(f"Time: {execution_time:.2f} ms")
      print("==============================================================================")

## distilbert-base-uncased

In [None]:
# RAM query; retrieval uses model1 (distilbert-base-uncased), top 3 chunks per technique.
prompt("Extract the amount of RAM memory installed", model1, k=3)

No-Chunking: ```json
{
  "ram_memory_installed": "8 GB"
}
```
Time: 4659.91 ms
chunks_SC :
Embeddings Shape:  (16, 768)
Similar Chunks:  ['This Aspire 3 can tackle any job no matter how complex. Creativity soars to new heights with sharp, crisp visuals showcased on the 15.6" Full HD IPS screen, while the thin bezel provides you with more space to work with wherever inspiration strikes. It’s the perfect companion to take with you wherever you go With Windows 11 Home in S mode, you’ll experience fast start-ups, a familiar yet expanded Start menu, and great new ways to get stuff done.', '</p> </div> </div><div class="a-section a-spacing-none premium-aplus-column">  <div class="a-section a-spacing-none column-heading"> <h1 class="aplus-h3 a-text-bold"> Get a Fresh Perspective </h1> </div> <div class="a-section a-spacing-none column-description"> <p class="aplus-p3"> Family, friends, obsessions, music, creations—Windows 11 Home in S mode is the one place for it all. With a fresh new feel an

In [None]:
# Price query; retrieval uses model1 (distilbert-base-uncased), top 3 chunks per technique.
prompt("Extract the laptop price", model1, k=3)

No-Chunking: ```json
{
  "laptop_price": "$299.99"
}
```
Time: 6057.44 ms
chunks_SC :
Embeddings Shape:  (16, 768)
Similar Chunks:  ['This Aspire 3 can tackle any job no matter how complex. Creativity soars to new heights with sharp, crisp visuals showcased on the 15.6" Full HD IPS screen, while the thin bezel provides you with more space to work with wherever inspiration strikes. It’s the perfect companion to take with you wherever you go With Windows 11 Home in S mode, you’ll experience fast start-ups, a familiar yet expanded Start menu, and great new ways to get stuff done.', '</p> </div> </div><div class="a-section a-spacing-none premium-aplus-column">  <div class="a-section a-spacing-none column-heading"> <h1 class="aplus-h3 a-text-bold"> Get a Fresh Perspective </h1> </div> <div class="a-section a-spacing-none column-description"> <p class="aplus-p3"> Family, friends, obsessions, music, creations—Windows 11 Home in S mode is the one place for it all. With a fresh new feel and too

In [None]:
# Camera query; retrieval uses model1 (distilbert-base-uncased), top 3 chunks per technique.
prompt("Does the laptop has front-facing camera ?", model1, k=3)

No-Chunking: ```json
{
  "front_facing_camera": "yes"
}
```
Time: 4954.57 ms
chunks_SC :
Embeddings Shape:  (16, 768)
Similar Chunks:  ['This Aspire 3 can tackle any job no matter how complex. Creativity soars to new heights with sharp, crisp visuals showcased on the 15.6" Full HD IPS screen, while the thin bezel provides you with more space to work with wherever inspiration strikes. It’s the perfect companion to take with you wherever you go With Windows 11 Home in S mode, you’ll experience fast start-ups, a familiar yet expanded Start menu, and great new ways to get stuff done.', '</p> </div> </div><div class="a-section a-spacing-none premium-aplus-column">  <div class="a-section a-spacing-none column-heading"> <h1 class="aplus-h3 a-text-bold"> Get a Fresh Perspective </h1> </div> <div class="a-section a-spacing-none column-description"> <p class="aplus-p3"> Family, friends, obsessions, music, creations—Windows 11 Home in S mode is the one place for it all. With a fresh new feel and 

## all-MiniLM-L6-v2

In [None]:
# RAM query; retrieval uses model2 (sentence-transformers/all-MiniLM-L6-v2), top 3 chunks per technique.
prompt("Extract the amount of RAM memory installed", model2, k=3)

No-Chunking: ```json
{
  "ram_memory_installed": "8 GB"
}
```
Time: 4197.03 ms
chunks_SC :
Embeddings Shape:  (16, 384)
Similar Chunks:  ['<html class="a-js a-audio a-video a-canvas a-svg a-drag-drop a-geolocation a-history a-webworker a-autofocus a-input-placeholder a-textarea-placeholder a-local-storage a-gradients a-transform3d a-touch-scrolling a-text-shadow a-text-stroke a-box-shadow a-border-radius a-border-image a-opacity a-transform a-transition null" data-19ax5a9jf="dingo" data-aui-build-date="3.25.1-2025-03-08" lang="en-us"><!-- sp:feature:head-start --><!-- sp:end-feature:head-close -->\n<!-- sp:feature:start-body -->\n<body class="a-m-us a-aui_72554-c a-aui_a11y_6_837773-c a-aui_killswitch_csa_logger_372963-t1 a-aui_pci_risk_banner_210084-c a-aui_template_weblab_cache_333406-c a-aui_tnr_v2_180836-c a-bw_aui_cxc_alert_measurement_1074111-c a-meter-animate"><div id="a-page"><!-- sp:end-feature:start-body -->\n<!-- sp:feature:csm:body-open -->\n\n\n<!-- sp:end-feature:csm:body

In [None]:
# Price query; retrieval uses model2 (sentence-transformers/all-MiniLM-L6-v2), top 3 chunks per technique.
prompt("Extract the laptop price", model2, k=3)

No-Chunking: ```json
{
  "laptop_price": "$299.99"
}
```
Time: 6167.17 ms
chunks_SC :
Embeddings Shape:  (16, 384)
Similar Chunks:  ['Any returned computer that is damaged through customer misuse, is missing parts, or is in unsellable condition due to customer tampering will result in the customer being charged a higher restocking fee based on the condition of the product. Amazon.com will not accept returns of any desktop or notebook computer more than 30 days after you receive the shipment. New, used, and refurbished products purchased from Marketplace vendors are subject to the returns policy of the individual vendor.</span> </div> <div class="a-section table-padding">    Product Warranty: For warranty information about this product, please <span class="a-color-secondary"> [PDF ] </span> </div> </div> </div>\n</div> <div class="a-row"> <div class="a-section"> <h1 class="a-size-medium a-spacing-small secHeader"> Feedback </h1> <div class="a-section table-padding"> <div class="a-row"> 

In [None]:
# Camera query; retrieval uses model2 (sentence-transformers/all-MiniLM-L6-v2), top 3 chunks per technique.
# NOTE(review): the wording differs from the model1 run ("front-facing camera") -- confirm whether that is intentional.
prompt("Does the laptop has front camera ?", model2, k=3)

No-Chunking: ```json
{
  "front_camera": "Yes"
}
```
Time: 4620.98 ms
chunks_SC :
Embeddings Shape:  (16, 384)
Similar Chunks:  ['This Aspire 3 can tackle any job no matter how complex. Creativity soars to new heights with sharp, crisp visuals showcased on the 15.6" Full HD IPS screen, while the thin bezel provides you with more space to work with wherever inspiration strikes. It’s the perfect companion to take with you wherever you go With Windows 11 Home in S mode, you’ll experience fast start-ups, a familiar yet expanded Start menu, and great new ways to get stuff done.', '</span></li> <li class="a-spacing-mini"><span class="a-list-item"> Visibly Stunning: Experience sharp details and crisp colors on the 15.6" Full HD IPS display with 16:9 aspect ratio and narrow bezels. </span></li> <li class="a-spacing-mini"><span class="a-list-item"> Internal Specifications: 8GB LPDDR5 Onboard Memory; 128GB NVMe solid-state drive storage to store your files and media  </span></li> <li class="a-sp