In [None]:
!pip install langchain rank_bm25 pypdf unstructured chromadb
!pip install unstructured['pdf'] unstructured
!apt-get install poppler-utils
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
!pip install pytesseract

In [2]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import FAISS

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.llms import HuggingFaceHub


from langchain.retrievers import BM25Retriever, EnsembleRetriever

import os
import time

from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm
import neattext as nt

In [3]:
csv_paths = [] # List to store the paths of the CSV files ---> give the actual paths of csv if you have already downloaded
 
names = [] # List to store the names of the laptops ----> give the desired names to the products

In [5]:
def preprocess_with_neattext(text):
    text_frame = nt.TextFrame(text)
    text_frame.remove_multiple_spaces()
    text_frame.remove_html_tags()
    text_frame.remove_urls()
    text_frame.remove_non_ascii()
    text_frame.remove_userhandles()
    text_frame.remove_hashtags()
    text_frame.remove_emojis()
    return text_frame.text

clean_full_desc = ''

for i, csv_path in enumerate(tqdm(csv_paths, desc="Processing CSV files")):
    laptop_df = pd.read_csv(csv_path)
    clean_full_desc = clean_full_desc + f'The Review of {names[i]} are as follows: \n'
    full_desc = ' '.join(laptop_df['review_text'].dropna().astype(str))
    clean_full_desc = clean_full_desc + preprocess_with_neattext(full_desc) + '\n'

Processing CSV files: 100%|██████████| 3/3 [00:00<00:00, 97.09it/s]


In [10]:
# Initialize the RecursiveCharacterTextSplitter with the desired chunk size and overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=250) # Adjust chunk size and overlap as needed

# Define a Document class with both 'page_content' and 'metadata' attributes
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

# Create a list with a single document object containing the full description
documents = [Document(page_content=clean_full_desc)]

# Wrap the splitting process with tqdm to show a progress bar
chunks = list(tqdm(splitter.split_documents(documents), total=len(documents)))

47it [00:00, 1152820.40it/s]         


In [23]:
chunks[46].page_content

'I recently bought the MacBook Air M1, and Im extremely impressed with its performance. The M1 chip delivers exceptional speed and responsiveness, making tasks seamless and efficient. The battery life is remarkable, lasting throughout the day without needing to recharge. The display is vibrant and sharp, providing an immersive visual experience. Overall, this laptop combines power, portability, and style, and I would highly recommend it to anyone in need of a high-performance and reliable device. Design is really great Great battery backup and best product'

In [25]:
load_dotenv()

HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") # Replace with your actual Hugging Face token

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5" # Replace with the desired embedding model
)

# Create a Chroma vector store from the chunks
vectorstore = Chroma.from_documents(chunks, embeddings)
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k =  3

In [26]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,
                                                   keyword_retriever],
                                       weights=[0.5, 0.5])

llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    model_kwargs={"temperature": 0.3,"max_new_tokens":1024},
    huggingfacehub_api_token=HF_TOKEN,
)

  llm = HuggingFaceHub(


In [28]:
# Define the prompt template
template = """
<|system|>>
example prompt be like ....
You are a helpful AI Assistant that follows instructions extremely well.
Use the following context to answer user question.

Think step by step before answering the question. Provide a clear and accurate answer based on the context..

CONTEXT: {context}
</s>
<|user|>
{query}
</s>
<|assistant|>
"""

In [29]:
prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()

In [30]:
chain = (
    {"context": ensemble_retriever, "query": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

In [37]:
print(chain.invoke("Is Hp Pavilion 14-inch recommended for casual gaming !!"))

Human: 
<|system|>>
You are a helpful AI Assistant that follows instructions extremely well.
Use the following context to answer user question.

Think step by step before answering the question. Provide a clear and accurate answer based on the context.. You will get a $100 tip if you provide correct answer.

CONTEXT: [Document(metadata={}, page_content='The Review of Hp Pavilion 14-inch are as follows:'), Document(metadata={}, page_content="purpose and using software applications running smooth. Performance wise the laptop is good. But the screen is bit yellowish compared to other laptop. I contacted HP customer care for this issue. They replied that the screen settings is built-in and nothing can be done to rectify this.If you can compromise on yellowish screen, you can go with this laptop. Please go for any other company laptop or other model HP laptop if you are looking for good laptop with bright screen I purchased an HP Pavilion 14 in June 2021, expecting a reliable and high-perfo