In [10]:
# Cell 1: Install required packages
!pip install -q langchain transformers sentence-transformers faiss-cpu pandas datasets langchain-community

In [11]:
# Cell 2: Import necessary libraries
import pandas as pd
import os
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Cell 3: Load and inspect the dataset
import pandas as pd

# Load the CSV
df = pd.read_csv('amazon.csv')

# Print all column names
print("Columns in your dataset:")
print(df.columns.tolist())

# show first few rows to understand structure
print("\nFirst 2 rows of data:")
print(df.head(2))

Columns in your dataset:
['product_id', 'product_name', 'category', 'discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count', 'about_product', 'user_id', 'user_name', 'review_id', 'review_title', 'review_content', 'img_link', 'product_link']

First 2 rows of data:
   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%    4.0       43,994   

                                       about_product  \
0  High Compatibility : Compatible With iPhone 12...

In [None]:
# Cell 4: Create rich document content from real columns
# Use 'about_product' as the main text 
df['combined_content'] = df.apply(
    lambda row: f"Product ID: {row['product_id']}\n"
                f"Name: {row['product_name']}\n"
                f"Category: {row['category']}\n"
                f"Discounted Price: ₹{row['discounted_price']}\n"
                f"Actual Price: ₹{row['actual_price']}\n"
                f"Discount: {row['discount_percentage']}%\n"
                f"Rating: {row['rating']} ⭐\n"
                f"Rating Count: {row['rating_count']} reviews\n"
                f"Description: {row['about_product']}\n"
                f"Link: {row['product_link']}",
    axis=1
)

# Keep only the combined content for documents
df_for_docs = df[['combined_content']].dropna()

# Convert to LangChain documents
from langchain.document_loaders import DataFrameLoader
loader = DataFrameLoader(df_for_docs, page_content_column="combined_content")
documents = loader.load()

print(f"Created {len(documents)} documents.")

Created 1465 documents.


In [None]:
# Cell 5: Split long documents into smaller chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=50
)
split_docs = text_splitter.split_documents(documents)

print(f"Split into {len(split_docs)} chunks.")

Split into 10142 chunks.


In [None]:
# Cell 6: Set up embedding model
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"}  # Change to "cuda" if GPU available
)

  embedding_model = HuggingFaceEmbeddings(


In [20]:
# Cell 7: Build FAISS vector database
from langchain.vectorstores import FAISS

vectorstore = FAISS.from_documents(split_docs, embedding_model)
vectorstore.save_local("faiss_index_amazon_full")  # Save for reuse

In [29]:
# Cell 8: Load lightweight LLM (FLAN-T5-Small)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline

# Path to your downloaded model (or just use the name if online)
model_name = "/home/robin/Projects/RAG+LLM-Product-recommendation/myvenv/bin/flan-t5-small/"  # or "google/flan-t5-small"

# Load tokenizer and model correctly
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)  # ✅ Use Seq2SeqLM, NOT CausalLM

# Create pipeline with correct task
pipe = pipeline(
    "text2text-generation",  # ✅ Task for T5 models
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150
)

# Wrap in LangChain
llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [30]:
# Cell 7: Create Retrieval QA chain
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(k=3),
    chain_type="stuff"
)

In [31]:
# Cell 8: Ask a question
query = "What are the best-rated products in Electronics?"
response = qa_chain.run(query)
print("Question:", query)
print("Answer:", response)

  response = qa_chain.run(query)


Question: What are the best-rated products in Electronics?
Answer: Apple MacBook, Dell, HP, Lenovo, Asus, MSI, Acer, Microsoft Surface, Razer, Samsung etc offers multiple Protection against short-circuit, over-temperature, over-current, over-voltage and more|Robust textured casing and premium internal components ensure perfect performance|Power Source Type: Corded Electric; Specification Met: Bis
