# Setup : Installing Libraries

In [None]:
!pip install pandas
!pip install langchain openai
!pip install --user "langchain-community==0.2.10"

In [None]:
!pip install faiss-cpu

In [None]:
!pip install rank_bm25

# Importing Libraries

In [None]:
from langchain_community.document_loaders import CSVLoader

# CSV Loader

In [None]:
# Path of the cleaned Amazon electronics CSV that CSVLoader reads.
# NOTE(review): "/content/..." looks like a Colab working-directory path — adjust when running elsewhere.
filepath ="/content/amazon_electronics_clean.csv"

In [None]:
# Read the CSV with an explicit UTF-8 encoding. Per the notes in this notebook,
# `data` is a list of Document objects whose page_content holds "key: value" lines.
loader = CSVLoader(file_path=filepath, encoding="utf-8")
data = loader.load()

In [None]:
# Peek at the raw text of the first two loaded documents.
for sample in data[:2]:
    print(sample.page_content)


# Parse Each Document

The `data` list contains LangChain `Document` objects.
Each document's `page_content` is a long text block made of lines in the form `key: value`.

We'll split those lines and map them into a dictionary.

In [None]:
def parse_products_info(doc):
    """Turn a document's ``key: value`` lines into a dictionary.

    Lines that do not contain the ``": "`` separator are skipped. Only the
    first separator on a line splits it, so a value may itself contain
    ``": "``. Keys and values are whitespace-stripped; when a key occurs
    more than once the last occurrence wins.
    """
    fields = {}
    for raw_line in doc.page_content.split("\n"):
        name, separator, content = raw_line.partition(": ")
        if not separator:
            # No "key: value" shape on this line.
            continue
        fields[name.strip()] = content.strip()
    return fields

# Extract Relevant Fields

## Field and why it matters

product_title	Describes the item type (important for similarity).

description	Gives functional and design details (semantic meaning).

main_category	Helps narrow down similar product groups.

brand	Brand reputation often affects performance.

price	Numeric comparison for performance insights.

average_rating	Base measure for predicting performance.

text	Provides real customer sentiment and experience.

review_title	Adds summary sentiment.


## Optional columns

details	Use selectively (e.g., “Material”, “Connectivity”). Good for technical matching.

rating_number	To weigh popularity (can help in numeric modeling later).

verified_purchase	Can improve credibility weighting.

## Drop for now

parent_asin	Just an ID, no semantic meaning.

store	Often same as brand.

brand_consolidated	Duplicate field.

In [None]:
def extract_relevant_fields(product_data):
    """Project a parsed product record onto the fields used downstream.

    Returns a new dict containing exactly the eight kept keys, in a fixed
    order. A key that is absent from ``product_data`` maps to an empty string.
    """
    wanted = (
        "product_title",
        "brand",
        "main_category",
        "price",
        "description",
        "average_rating",
        "review_title",
        "text",
    )
    selected = {}
    for field in wanted:
        selected[field] = product_data.get(field, "")
    return selected

# Format for Embedding

In [None]:
def format_for_embedding(product_data):
    """Render a product record as the labelled text block that gets embedded.

    Every field is written on its own line as ``Label: value`` with a
    four-space indent; a missing key renders as an empty value. The result
    starts with a newline and ends with an indented, otherwise empty line,
    so callers that want a tight block should strip it.
    """
    labelled_fields = (
        ("Product Title", "product_title"),
        ("Brand", "brand"),
        ("Category", "main_category"),
        ("Price", "price"),
        ("Description", "description"),
        ("Average Rating", "average_rating"),
        ("Review Title", "review_title"),
        ("Review", "text"),
    )
    indent = "    "
    body = "".join(
        f"{indent}{label}: {product_data.get(key, '')}\n"
        for label, key in labelled_fields
    )
    return "\n" + body + indent


# Preprocessing all the documents

In [None]:
from langchain.schema import Document

# Convert every raw CSV document into a cleaned Document: the embedded text is
# the formatted field block, and a few fields are duplicated into metadata.
cleaned_documents = []
for doc in data:
    product_data = parse_products_info(doc)
    relevant_data = extract_relevant_fields(product_data)
    formatted_text = format_for_embedding(relevant_data).strip()
    # Create a LangChain Document with metadata
    cleaned_documents.append(
        Document(
            page_content=formatted_text,
            metadata={
                "brand": relevant_data.get("brand"),
                "category": relevant_data.get("main_category"),
                "price": relevant_data.get("price"),
                "rating": relevant_data.get("average_rating"),
                # Read from the full parsed record: extract_relevant_fields()
                # does not keep "verified_purchase", so looking it up in
                # relevant_data always produced the False default.
                # NOTE(review): the value is the raw CSV text, not a bool —
                # confirm its format before filtering on it.
                "verified_purchase": product_data.get("verified_purchase", False),
            }
        )
    )

# Preview Sample

In [None]:
# Show the first cleaned product: a 300-character text preview, then its metadata.
first_document = cleaned_documents[0]
print(first_document.page_content[:300])
print(first_document.metadata)


page_content → what your RAG system searches and embeds for semantic similarity.

metadata → optional filters and context, not embedded, but very helpful for precise results.

# Embedding

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [None]:
# Initialize the embedding model that is handed to the FAISS vector store below.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# FAISS Setup

In [None]:
# Create the FAISS vector store from the cleaned documents and the embedding model.
vectorstore = FAISS.from_documents(cleaned_documents, embeddings)

In [None]:
def faiss_search(query, top_k=3):
    """Return the ``top_k`` vector-store matches for ``query``.

    The result is passed through unchanged from
    ``vectorstore.similarity_search_with_score``: a list of
    ``(document, score)`` pairs.

    NOTE(review): for LangChain's FAISS store the score is documented as a
    distance (lower = closer) — confirm before comparing it with BM25 scores.
    """
    return vectorstore.similarity_search_with_score(query, k=top_k)

# BM25 Setup

In [None]:

from rank_bm25 import BM25Okapi

In [None]:
# Tokenize the cleaned documents for BM25: plain whitespace split, case preserved.
corpus = [cleaned.page_content for cleaned in cleaned_documents]
tokenized_corpus = list(map(str.split, corpus))

# The index is positional: entry i corresponds to cleaned_documents[i].
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
def bm25_search(query, top_k=3):
    """Return the ``top_k`` best BM25 matches for ``query``.

    Args:
        query: Free-text query; it is split on whitespace, the same way the
            corpus was tokenized.
        top_k: Number of results wanted. Coerced to ``int`` so that a float
            (e.g. from a UI slider) still works as a slice bound.

    Returns:
        A list of ``(document, score)`` pairs ordered from highest to lowest
        BM25 score. Empty when ``top_k`` is not positive.
    """
    top_k = int(top_k)
    if top_k <= 0:
        # A slice of [-0:] would select the whole array instead of nothing.
        return []

    scores = bm25.get_scores(query.split())
    # argsort is ascending: keep the last top_k positions, then reverse them.
    top_indices = scores.argsort()[-top_k:][::-1]
    return [(cleaned_documents[i], scores[i]) for i in top_indices]

# Hybrid Search

In [None]:
import numpy as np

def hybrid_search(query, top_k=3, alpha=0.5):
    """Rank the cleaned documents by a weighted blend of BM25 and FAISS relevance.

    Both signals are min-max scaled to roughly [0, 1] before mixing so that
    ``alpha`` is a meaningful weight.

    Args:
        query: Free-text query; split on whitespace for BM25 and passed as-is
            to the vector store.
        top_k: Number of results to return (coerced to ``int``).
        alpha: Weight of the normalized BM25 score; ``1 - alpha`` weights the
            FAISS similarity.

    Returns:
        A list of ``(document, combined_score)`` pairs sorted by descending
        combined score, at most ``top_k`` long. Empty when ``top_k`` is not
        positive or there are no documents.

    NOTE(review): LangChain's FAISS store reports a distance (lower = more
    similar) under its default L2 strategy, so the distance is inverted into
    a similarity here. If the store is ever built with a similarity-style
    strategy, drop the inversion.
    """
    top_k = int(top_k)
    if top_k <= 0 or not cleaned_documents:
        return []

    eps = 1e-6  # keeps the min-max division defined when all scores are equal

    # BM25: higher is better.
    bm25_scores = np.asarray(bm25.get_scores(query.split()), dtype=float)
    bm25_min = bm25_scores.min()
    bm25_norm = (bm25_scores - bm25_min) / (bm25_scores.max() - bm25_min + eps)

    # FAISS: one lookup table keyed by text instead of a linear scan per
    # document. The distance depends only on the embedded text, so documents
    # with identical page_content correctly share one entry.
    faiss_results = vectorstore.similarity_search_with_score(query, k=len(cleaned_documents))
    distance_by_text = {}
    for hit, distance in faiss_results:
        distance_by_text.setdefault(hit.page_content, float(distance))

    if distance_by_text:
        distances = np.fromiter(distance_by_text.values(), dtype=float)
        dist_min = distances.min()
        dist_span = distances.max() - dist_min + eps
    else:
        dist_min, dist_span = 0.0, 1.0

    # Combine scores
    combined_scores = []
    for i, doc in enumerate(cleaned_documents):
        distance = distance_by_text.get(doc.page_content)
        if distance is None:
            # Not returned by the vector store: no semantic contribution.
            faiss_similarity = 0.0
        else:
            # Closest document -> ~1, farthest -> ~0.
            faiss_similarity = 1.0 - (distance - dist_min) / dist_span
        combined_score = alpha * bm25_norm[i] + (1 - alpha) * faiss_similarity
        combined_scores.append((doc, combined_score))

    # Sort top results
    combined_scores.sort(key=lambda pair: pair[1], reverse=True)
    return combined_scores[:top_k]


## Alpha controls the weighting

alpha=0.5 → BM25 and FAISS equally weighted

alpha>0.5 → more weight on keyword matching

alpha<0.5 → more weight on semantic search

# Compare Outputs

In [None]:
query = "wireless printer with connectivity issues"
top_k = 3

# Run the same query through each retriever and print a preview of every hit.
# The hybrid retriever is left out of this printout; to include it, add
# ("=== Hybrid ===", hybrid_search) to the tuple below.
comparisons = (
    ("=== BM25 Only ===", bm25_search),
    ("=== FAISS Only ===", faiss_search),
)

for heading, search in comparisons:
    print(heading)
    for doc, score in search(query, top_k):
        print(doc.page_content[:300])
        print(doc.metadata)
        print("Score:", score)
        print("-"*80)


BM25-only → captures exact keyword matches; may miss semantically relevant products if phrasing differs.

FAISS-only → finds products semantically similar, including those with synonyms or descriptive reviews.

Hybrid → balances both; often retrieves the most relevant and precise results.

In [None]:
# Run each retriever once so the comparison table and chart can reuse the results.
bm25_results = bm25_search(query, top_k)
faiss_results = faiss_search(query, top_k)
# Must be defined: the comparison table reads hybrid_results for every row,
# so leaving this commented out made that cell fail with a NameError.
hybrid_results = hybrid_search(query, top_k)


In [None]:
# Create a DataFrame for comparison: one row per rank, the three retrievers side by side.
import pandas as pd  # pandas is installed above but was never imported in this notebook

# A retriever may return fewer than top_k hits; only fill the ranks that all
# three result lists actually have, instead of raising IndexError.
row_count = min(top_k, len(bm25_results), len(faiss_results), len(hybrid_results))

table_data = []

for i in range(row_count):
    table_data.append({
        "Rank": i+1,
        "BM25 Score": round(bm25_results[i][1], 3),
        "FAISS Score": round(faiss_results[i][1], 3),
        "Hybrid Score": round(hybrid_results[i][1], 3),
        "BM25 Product": bm25_results[i][0].page_content[:50] + "...",
        "FAISS Product": faiss_results[i][0].page_content[:50] + "...",
        "Hybrid Product": hybrid_results[i][0].page_content[:50] + "...",
        "BM25 Brand": bm25_results[i][0].metadata.get("brand"),
        "FAISS Brand": faiss_results[i][0].metadata.get("brand"),
        "Hybrid Brand": hybrid_results[i][0].metadata.get("brand")
    })

df = pd.DataFrame(table_data)
display(df)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart: one group per rank, one bar per retriever.
width = 0.25
x = np.arange(top_k)
labels = [f"Rank {i+1}" for i in range(top_k)]

# Pull each retriever's score column out of the comparison rows.
bm25_scores, faiss_scores, hybrid_scores = (
    [row[column] for row in table_data]
    for column in ('BM25 Score', 'FAISS Score', 'Hybrid Score')
)

fig, ax = plt.subplots(figsize=(10,6))

# (horizontal shift, bar heights, legend label) for each retriever
bar_series = (
    (-width, bm25_scores, 'BM25'),
    (0, faiss_scores, 'FAISS'),
    (width, hybrid_scores, 'Hybrid'),
)
for shift, heights, series_name in bar_series:
    ax.bar(x + shift, heights, width, label=series_name)

ax.set_title(f'Top-{top_k} Retrieval Comparison for Query: "{query}"')
ax.set_ylabel('Score')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()


# Integrating RAG pipeline with Language Model

In [None]:
# from langchain.chat_models import ChatOpenAI


#Initialize LM
# llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)  # requires an OpenAI API key

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
import torch

# Model selection notes from earlier experiments:
#   TheBloke/guanaco-7B-HF       - crashed (small model, GPU recommended)
#   TheBloke/guanaco-1.3B-HF     - requires authentication
#   tiiuae/falcon-7b             - crashed
#   mosaicml/mpt-3b-storywriter  - 3B model
#   EleutherAI/gpt-neo-1.3B      - hallucinated
# Those attempts loaded the weights with AutoModelForCausalLM.from_pretrained(
# model_name, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
# (GPU when available, float16 to save VRAM). FLAN-T5 is loaded through the
# seq2seq class instead.
model_name = "google/flan-t5-small"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [None]:
# Text-generation pipeline for the loaded FLAN-T5 model, wrapped so LangChain can call it.
local_pipeline = pipeline(
    "text2text-generation",  # FLAN-T5 uses text2text
    model=model,
    tokenizer=tokenizer,
    max_length=500,  # only limits generated tokens, not input
    # Greedy decoding: with do_sample=False a temperature has no effect and
    # transformers warns about the unused flag, so none is passed.
    do_sample=False,
)

llm = HuggingFacePipeline(pipeline=local_pipeline)

In [None]:
# Summarizer pipeline: reuses the already-loaded model and tokenizer.
summary_generation_settings = dict(
    max_length=70,   # output summary length
    min_length=20,
    do_sample=False,
)

summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    **summary_generation_settings,
)



In [None]:
#Create Prompt Template
from langchain.prompts import PromptTemplate

# Prompt skeleton: {retrieved_docs} receives the summarized retrieval context
# and {query} the description of the new product.
prompt_template = """
You are a Product Insights Assistant.

Based on the following summarized past product data:

{retrieved_docs}

A new product has these details:
{query}

Provide **only** the insights in the format below. Do NOT include explanations or instructions:

1. Expected average rating (number)
2. Potential risks (brief list)
3. Factors influencing performance (brief list)
4. Specifications (brief list)
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["retrieved_docs", "query"],
)


In [None]:
def generate_product_insights_local(user_query, top_k=5, retriever=hybrid_search, max_summaries=5):
    """Retrieve similar products, summarize them, and ask the local LLM for insights.

    Args:
        user_query: Description of the new product.
        top_k: Number of documents requested from ``retriever``.
        retriever: Callable ``(query, top_k=...)`` returning
            ``(document, score)`` pairs.
        max_summaries: Upper bound on how many retrieved documents are
            summarized into the prompt, to keep the model input short.
            Defaults to 5, the cap that used to be hard-coded.

    Returns:
        The text generated by ``llm`` for the formatted prompt.

    Side effects:
        Prints each summary and an "Output" marker (kept for notebook use).
    """
    # Retrieve top-k relevant documents
    top_docs = retriever(user_query, top_k=top_k)

    # Summarize each document to keep input concise
    summaries = []
    for doc, _score in top_docs[:max_summaries]:
        summary = summarizer(doc.page_content, truncation=True)[0]['summary_text']
        print(summary)
        summaries.append(summary)

    retrieved_text = "\n".join(summaries)

    # Format prompt
    formatted_prompt = prompt.format(retrieved_docs=retrieved_text, query=user_query)
    print("Output")

    # Generate insights. HuggingFacePipeline only forwards generation options
    # given under "pipeline_kwargs"; a bare max_new_tokens keyword is silently
    # dropped. invoke() replaces the deprecated direct call on the LLM object.
    return llm.invoke(formatted_prompt, pipeline_kwargs={"max_new_tokens": 200})


In [None]:
# Example run: BM25 retrieval over the top five matches for a sample product.
query = "New wireless printer for small business with low cost"
insights = generate_product_insights_local(query, top_k=5, retriever=bm25_search)
print(insights)


# Gradio Setup

In [None]:
def gradio_generate_insights(user_query, top_k, retriever_choice):
    """Gradio callback: pick a retriever by name and generate product insights.

    Args:
        user_query: Product description typed by the user.
        top_k: Number of documents to retrieve; coerced to ``int`` because the
            retrievers use it as a slice bound and a UI slider may deliver a float.
        retriever_choice: ``"BM25"`` or ``"FAISS"``; any other value selects
            the hybrid retriever.

    Returns:
        The insights text produced by ``generate_product_insights_local``.
    """
    # Choose retriever (hybrid is the fallback, as before)
    named_retrievers = {"BM25": bm25_search, "FAISS": faiss_search}
    retriever = named_retrievers.get(retriever_choice, hybrid_search)

    # Generate insights
    return generate_product_insights_local(user_query, top_k=int(top_k), retriever=retriever)


In [None]:
import gradio as gr

# Input widgets, listed in the order of gradio_generate_insights' parameters.
query_box = gr.Textbox(lines=2, placeholder="Enter new product details here...", label="Product Query")
top_k_slider = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Top-K Documents")
retriever_radio = gr.Radio(choices=["BM25", "FAISS", "Hybrid"], value="Hybrid", label="Retriever")

iface = gr.Interface(
    fn=gradio_generate_insights,
    inputs=[query_box, top_k_slider, retriever_radio],
    outputs=gr.Textbox(label="Product Insights"),
    title="Product Insights Assistant",
    description="Enter details of a new product and get expected ratings, risks, and performance insights using hybrid retrieval and local LLM.",
)

# Start the web UI.
iface.launch()
