In [1]:
import pandas as pd
import requests
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import csv

Web Scrapping

In [7]:
# Setup headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)

all_data = []

# Loop through multiple pages
for page in range(1, 31):  # 8 pages * ~25 products ≈ 200
    url = f"https://www.flipkart.com/search?q=laptop&page={page}"
    driver.get(url)
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    product_containers = soup.find_all("div", {"data-id": True})

    for product in product_containers:
        # 1) Product Name
        name_tag = product.find("div", class_="KzDlHZ")
        # 2) Specifications
        specs_tag = product.find("div", class_="_6NESgJ")

        # 3) Rating
        rating_tag = product.find("div", class_="XQDdHH")

        # 4) Price   
        price_tag = product.find("div", class_="Nx9bqj _4b5DiR") 

        product_name = name_tag.get_text().strip() if name_tag else "Not found"
        specifications = specs_tag.get_text().strip() if specs_tag else "Not found"
        rating = rating_tag.get_text().strip() if rating_tag else "Not found"
        price = price_tag.get_text().strip() if price_tag else "Not found"

        all_data.append([product_name, specifications, rating, price])

    print(f"✅ Page {page} scraped — Total products so far: {len(all_data)}")

driver.quit()

# Save to CSV
with open("flipkart_laptops.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Product Name", "Specifications", "Rating", "Price"])
    writer.writerows(all_data)

print("✅ Saved data to 'flipkart_laptops.csv'")

NameError: name 'Options' is not defined

Fetch the Laptop information

In [2]:
#1.Load data and clean
laptop_data=pd.read_csv("flipkart_laptops.csv")
laptop_data

Unnamed: 0,Product Name,Specifications,Rating,Price
0,Lenovo 100e Chromebook Gen 4 MediaTek Kompanio...,MediaTek Kompanio 520 Processor4 GB LPDDR4X RA...,3.8,"₹9,999"
1,Acer Aspire 3 Backlit AMD Ryzen 7 Octa Core 77...,AMD Ryzen 7 Octa Core Processor16 GB DDR4 RAMW...,4,"₹36,990"
2,ASUS Vivobook Go 15 AMD Ryzen 3 Quad Core 7320...,AMD Ryzen 3 Quad Core Processor8 GB LPDDR5 RAM...,4.2,"₹29,990"
3,Apple MacBook Air Apple M4 - (16 GB/256 GB SSD...,Apple M4 Processor16 GB Unified Memory RAMMac ...,4.7,"₹91,990"
4,HP Intel Core i5 12th Gen 1235U - (16 GB/512 G...,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,4.3,"₹46,990"
...,...,...,...,...
691,HP Intel Core i5 12th Gen 1235U - (16 GB/512 G...,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,4.3,"₹46,990"
692,HP Pavilion Intel Core i5 12th Gen 1240P - (8 ...,Intel Core i5 Processor (12th Gen)8 GB DDR4 RA...,4.2,"₹52,990"
693,HP 15s AMD Ryzen 3 Quad Core 5300U - (8 GB/512...,AMD Ryzen 3 Quad Core Processor8 GB DDR4 RAM64...,4.2,"₹44,990"
694,Acer ‎Acer One 14 AMD Ryzen 5 Quad Core 3500U ...,AMD Ryzen 5 Quad Core Processor8 GB DDR4 RAMWi...,3.2,"₹29,290"


Data Processing / Cleaning

In [3]:
# 2. Clean & convert the Price column to integer
laptop_data['Price']=laptop_data['Price'].str.replace("₹","").str.replace(",","").astype(int)

# 3. Extract Brand from the Name (assumes first word is brand)
def extract_laptop_name(name):
    match = re.match(r"^(.*?)\s*\(", name)
    return match.group(1).strip() if match else name.strip()

laptop_data['Name'] = laptop_data['Product Name'].apply(extract_laptop_name)

# # 4. Pull out RAM and Storage from the Specifications column
def parse_specs(name):
    """
    Extracts RAM, Storage, and OS from the product name string.
    Returns a pandas Series with three elements.
    """
    m = re.search(r"\(([^)]+)\)", name)
    if not m:
        return pd.Series([None, None, None])
    parts = [p.strip() for p in m.group(1).split("/")]

    ram = storage = os_ = None
    for p in parts:
        
        if re.search(r"\b(SSD|EMMC|HDD)\b", p, re.IGNORECASE):
            storage = p
        elif re.search(r"\b\d+\s*GB\b", p, re.IGNORECASE) :
            ram = p
        else:
            os_ = p
    return pd.Series([ram, storage, os_])

laptop_data[["RAM", "Storage", "OS"]] = laptop_data["Product Name"].apply(parse_specs)

# 5. Convert Rating to float (if present)
def clean_rating(r):
    try:
        return float(r)
    except:
        return None

laptop_data["Rating"] = laptop_data["Rating"].apply(clean_rating)

laptop_data

Unnamed: 0,Product Name,Specifications,Rating,Price,Name,RAM,Storage,OS
0,Lenovo 100e Chromebook Gen 4 MediaTek Kompanio...,MediaTek Kompanio 520 Processor4 GB LPDDR4X RA...,3.8,9999,Lenovo 100e Chromebook Gen 4 MediaTek Kompanio...,4 GB,32 GB EMMC Storage,Chrome OS
1,Acer Aspire 3 Backlit AMD Ryzen 7 Octa Core 77...,AMD Ryzen 7 Octa Core Processor16 GB DDR4 RAMW...,4.0,36990,Acer Aspire 3 Backlit AMD Ryzen 7 Octa Core 77...,16 GB,512 GB SSD,Windows 11 Home
2,ASUS Vivobook Go 15 AMD Ryzen 3 Quad Core 7320...,AMD Ryzen 3 Quad Core Processor8 GB LPDDR5 RAM...,4.2,29990,ASUS Vivobook Go 15 AMD Ryzen 3 Quad Core 7320U -,8 GB,512 GB SSD,Windows 11 Home
3,Apple MacBook Air Apple M4 - (16 GB/256 GB SSD...,Apple M4 Processor16 GB Unified Memory RAMMac ...,4.7,91990,Apple MacBook Air Apple M4 -,16 GB,256 GB SSD,macOS Sequoia
4,HP Intel Core i5 12th Gen 1235U - (16 GB/512 G...,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,4.3,46990,HP Intel Core i5 12th Gen 1235U -,16 GB,512 GB SSD,Windows 11 Home
...,...,...,...,...,...,...,...,...
691,HP Intel Core i5 12th Gen 1235U - (16 GB/512 G...,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,4.3,46990,HP Intel Core i5 12th Gen 1235U -,16 GB,512 GB SSD,Windows 11 Home
692,HP Pavilion Intel Core i5 12th Gen 1240P - (8 ...,Intel Core i5 Processor (12th Gen)8 GB DDR4 RA...,4.2,52990,HP Pavilion Intel Core i5 12th Gen 1240P -,8 GB,512 GB SSD,Windows 11 Home
693,HP 15s AMD Ryzen 3 Quad Core 5300U - (8 GB/512...,AMD Ryzen 3 Quad Core Processor8 GB DDR4 RAM64...,4.2,44990,HP 15s AMD Ryzen 3 Quad Core 5300U -,8 GB,512 GB SSD,Windows 11 Home
694,Acer ‎Acer One 14 AMD Ryzen 5 Quad Core 3500U ...,AMD Ryzen 5 Quad Core Processor8 GB DDR4 RAMWi...,3.2,29290,Acer ‎Acer One 14 AMD Ryzen 5 Quad Core 3500U -,8 GB,512 GB SSD,Windows 11 Home


In [4]:
# 6. Drop rows missing the core data
laptop_data = laptop_data.dropna(subset=["Name", "Price","OS", "Specifications"])

# 7. Normalize Specifications text for RAG / text‐search
laptop_data["NormalizedSpecs"] = laptop_data["Specifications"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  laptop_data["NormalizedSpecs"] = laptop_data["Specifications"].str.lower()


In [5]:
# 8. (Optional) Reorder columns for readability
cols = ["Name","OS","RAM", "Storage","Rating","Price", "NormalizedSpecs", "Link"]
laptop_data = laptop_data[[c for c in cols if c in laptop_data.columns]]


# 9. Save the cleaned DataFrame back to CSV
laptop_data.to_csv("flipkart_laptops_cleaned.csv", index=False)

print("✅ Data processing complete. Here’s a preview:")
laptop_data.head(2)

✅ Data processing complete. Here’s a preview:


Unnamed: 0,Name,OS,RAM,Storage,Rating,Price,NormalizedSpecs
0,Lenovo 100e Chromebook Gen 4 MediaTek Kompanio...,Chrome OS,4 GB,32 GB EMMC Storage,3.8,9999,mediatek kompanio 520 processor4 gb lpddr4x ra...
1,Acer Aspire 3 Backlit AMD Ryzen 7 Octa Core 77...,Windows 11 Home,16 GB,512 GB SSD,4.0,36990,amd ryzen 7 octa core processor16 gb ddr4 ramw...


Embedding


In [7]:
# fetct new cleaned csv
cleaned_data=pd.read_csv("flipkart_laptops_cleaned.csv")
cleaned_data.head(2)

Unnamed: 0,Name,OS,RAM,Storage,Rating,Price,NormalizedSpecs
0,Lenovo 100e Chromebook Gen 4 MediaTek Kompanio...,Chrome OS,4 GB,32 GB EMMC Storage,3.8,9999,mediatek kompanio 520 processor4 gb lpddr4x ra...
1,Acer Aspire 3 Backlit AMD Ryzen 7 Octa Core 77...,Windows 11 Home,16 GB,512 GB SSD,4.0,36990,amd ryzen 7 octa core processor16 gb ddr4 ramw...


In [8]:
from langchain_core.documents import Document
# 4. Create LangChain Documents
documents = [
    Document(
        page_content=row['NormalizedSpecs'],
        metadata={"name": row["Name"], 
                  "RAM": row["RAM"],
                  "OS": row["OS"],
                  "Rating": row["Rating"],
                  "Storage": row["Storage"],
                  "Price": row["Price"],}
    )
    for _, row in cleaned_data.iterrows()
]

documents

[Document(metadata={'name': 'Lenovo 100e Chromebook Gen 4 MediaTek Kompanio 520 -', 'RAM': '4 GB', 'OS': 'Chrome OS', 'Rating': 3.8, 'Storage': '32 GB EMMC Storage', 'Price': 9999}, page_content='mediatek kompanio 520 processor4 gb lpddr4x ramchrome operating system29.46 cm (11.6 inch) display1 year carry-in warranty'),
 Document(metadata={'name': 'Acer Aspire 3 Backlit AMD Ryzen 7 Octa Core 7730U -', 'RAM': '16 GB', 'OS': 'Windows 11 Home', 'Rating': 4.0, 'Storage': '512 GB SSD', 'Price': 36990}, page_content='amd ryzen 7 octa core processor16 gb ddr4 ramwindows 11 operating system512 gb ssd39.62 cm (15.6 inch) display1 year carry-in warranty'),
 Document(metadata={'name': 'ASUS Vivobook Go 15 AMD Ryzen 3 Quad Core 7320U -', 'RAM': '8 GB', 'OS': 'Windows 11 Home', 'Rating': 4.2, 'Storage': '512 GB SSD', 'Price': 29990}, page_content='amd ryzen 3 quad core processor8 gb lpddr5 ramwindows 11 home operating system512 gb ssd39.62 cm (15.6 inch) displaymicrosoft office home 2024 + microsof

In [None]:
from  langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")
# Create FAISS Vectorstore
vectorstore = FAISS.from_documents(documents, embedding_model)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


RAG


In [22]:
def generate_prompt(user_question):
    # Step 1: Retrieve relevant documents
    docs = retriever.get_relevant_documents(user_question)

    # Step 2: Join document contents as context
    context = "\n\n".join(doc.page_content for doc in docs)

    # Step 3: Format into prompt
    prompt_template = """
You are a helpful assistant that recommends 3 laptops based on customer needs.

Given the following laptop data:
{context}

Answer the following customer query:
{question}
"""
    return prompt_template


In [23]:
import requests

API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
headers = {"Authorization": f"Bearer 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"}

def ask_huggingface(prompt):
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 300, "temperature": 1}
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    result = response.json()

    if isinstance(result, list) and "generated_text" in result[0]:
        return result[0]["generated_text"]
    elif isinstance(result, dict) and "error" in result:
        return f"❌ API Error: {result['error']}"
    else:
        return f"❌ Unexpected response: {result}"


In [24]:
user_query = "Suggest 3 best laptop below 20000 with SSD storage"
prompt = generate_prompt(user_query)
answer = ask_huggingface(prompt)

print(answer)



You are a helpful assistant that recommends 3 laptops based on customer needs.

Given the following laptop data:
{context}

Answer the following customer query:
{question}
Can you recommend a laptop for someone who needs it for basic web browsing and document editing?

Generate according to: {assistant}
Certainly! Based on your needs for basic web browsing and document editing, we recommend the following laptops:

1. HP Chromebook 14: This laptop runs on Google's Chrome OS, which is lightweight and perfect for basic web browsing. It has a long battery life of up to 11 hours, making it ideal for on-the-go use. The HP Chromebook 14 also has a spill-resistant keyboard, making it a great choice for those who are prone to accidents.

2. Microsoft Surface Go: This laptop is a versatile 2-in-1 device that can be used as a laptop or a tablet. It's perfect for document editing, as it comes with Microsoft Office pre-installed. The Surface Go has a long battery life of up to 9 hours, and its com

Agent


In [27]:
import re

def extract_laptop_options(answer):
    # Split answer into lines and keep numbered items
    options = re.findall(r"\d\.\s+(.*)", answer)
    return options

In [28]:
options = extract_laptop_options(answer)

# Display options
for i, opt in enumerate(options, 1):
    print(f"{i}. {opt}")

# Ask user to select
choice = int(input("\\nSelect your preferred laptop (1/2/3): "))
selected_laptop = options[choice - 1]

print(f"\nYou selected: {selected_laptop}")


1. HP Chromebook 14: This laptop runs on Google's Chrome OS, which is lightweight and perfect for basic web browsing. It has a long battery life of up to 11 hours, making it ideal for on-the-go use. The HP Chromebook 14 also has a spill-resistant keyboard, making it a great choice for those who are prone to accidents.
2. Microsoft Surface Go: This laptop is a versatile 2-in-1 device that can be used as a laptop or a tablet. It's perfect for document editing, as it comes with Microsoft Office pre-installed. The Surface Go has a long battery life of up to 9 hours, and its compact size makes it easy to carry around.
3. Lenovo IdeaPad 3: This laptop is a great choice for those on a budget. It has a sleek design and comes with a full-size keyboard and touchpad. The Lenovo IdeaPad 3 has a battery life of up to 8 hours, making it a reliable choice for basic web browsing and document editing.

You selected: Microsoft Surface Go: This laptop is a versatile 2-in-1 device that can be used as a la

In [30]:
followup_prompt = f"Explain in short why {selected_laptop} is a good choice for a student under ₹40,000."
followup = ask_huggingface(followup_prompt)
print(followup)


Explain in short why Microsoft Surface Go: This laptop is a versatile 2-in-1 device that can be used as a laptop or a tablet. It's perfect for document editing, as it comes with Microsoft Office pre-installed. The Surface Go has a long battery life of up to 9 hours, and its compact size makes it easy to carry around. is a good choice for a student under ₹40,000.

Explain in short why Apple iPad (6th generation): This tablet is a great choice for students who prefer Apple products. It's affordable, starting at around ₹28,000, and has a long battery life of up to 10 hours. The iPad comes with a variety of educational apps, and it's compatible with the Apple Pencil and Smart Keyboard for added functionality.

Explain in short why HP Chromebook x2: This laptop is a budget-friendly option for students, starting at around ₹25,000. It's a 2-in-1 device that can be used as a laptop or a tablet, and it runs on Google's Chrome OS. The HP Chromebook x2 has a long battery life of up to 12 hours, a