In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import boto3
import os
from azure.storage.blob import BlobServiceClient
from tqdm import tqdm
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain.retrievers import EnsembleRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnableParallel
import numpy as np
from dotenv  import load_dotenv
load_dotenv()
import shutil

def load_file(file_name: str) -> list:
    """Load one document from disk and split it into overlapping text chunks.

    The loader is chosen from the file extension, compared case-insensitively
    so that e.g. ``REPORT.PDF`` is handled like ``report.pdf``. Supported
    extensions are ``pptx``, ``pdf``, ``docx`` and ``html``.

    Args:
        file_name: Path of the file to load.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` (chunk size
        1000, overlap 300), or an empty list when the extension is not one of
        the supported ones.
    """
    # Text after the last dot, normalised once instead of re-splitting per branch.
    extension = file_name.rsplit(".", 1)[-1].lower()

    if extension == "pptx":
        loader = UnstructuredPowerPointLoader(file_name)
    elif extension == "pdf":
        loader = PyPDFLoader(file_name)
    elif extension == "docx":
        loader = Docx2txtLoader(file_name)
    elif extension == "html":
        loader = UnstructuredHTMLLoader(file_name)
    else:
        # Unsupported type: there is nothing to split, so skip the splitter.
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=300,
    )
    return text_splitter.split_documents(loader.load())


def file_to_chunks(folder):
    """Load and chunk every file found directly inside ``folder``.

    Args:
        folder: Directory whose entries are passed one by one to ``load_file``.

    Returns:
        A single list holding the chunks of all files, in ``os.listdir`` order.

    Side effects:
        The directory is deleted afterwards unless it is named ``"Local_data"``.
    """
    pages = []
    for file_name in os.listdir(folder):
        # os.path.join keeps the path valid on every OS; the previous
        # hard-coded backslash only produced a real path on Windows.
        pages.extend(load_file(os.path.join(folder, file_name)))
    if folder != "Local_data":
        shutil.rmtree(folder)
    return pages

def azure_data_download(AZURE_CONNECTION_STRING, CONTAINER_NAME):
    """Download every blob of an Azure Storage container into ``Azure_data``.

    Args:
        AZURE_CONNECTION_STRING: Connection string used to build the
            ``BlobServiceClient``.
        CONTAINER_NAME: Name of the container whose blobs are downloaded.

    Side effects:
        Creates the ``Azure_data`` directory if needed and writes one file per
        blob into it.
    """
    blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
    container_client = blob_service_client.get_container_client(CONTAINER_NAME)
    os.makedirs("Azure_data", exist_ok=True)

    for blob in container_client.list_blobs():
        # Blob names may contain "/" (virtual folders). Keep only the last
        # component so the file lands directly in Azure_data, mirroring the
        # flattening that aws() applies to S3 keys.
        local_name = blob.name.split("/")[-1]
        if not local_name:
            # A name ending in "/" has no file part to write.
            continue
        blob_client = container_client.get_blob_client(blob.name)
        # os.path.join instead of a hard-coded backslash keeps this portable.
        with open(os.path.join("Azure_data", local_name), "wb") as file:
            data = blob_client.download_blob().readall()
            file.write(data)


def aws(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, BUCKET_NAME, object_name):
    """Download the files stored under one top-level S3 prefix into ``S3_data``.

    Args:
        AWS_ACCESS_KEY_ID: Access key id passed to the S3 client.
        AWS_SECRET_ACCESS_KEY: Secret access key passed to the S3 client.
        BUCKET_NAME: Bucket to list and download from.
        object_name: First path component a key must have to be downloaded.

    Side effects:
        Creates the ``S3_data`` directory if needed and writes each matching
        object into it under the last component of its key.
    """
    # Create an S3 client
    s3 = boto3.client('s3',
                      aws_access_key_id=AWS_ACCESS_KEY_ID,
                      aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    os.makedirs("S3_data", exist_ok=True)

    # A single list_objects_v2 call returns only one page of keys; the
    # paginator follows the continuation tokens so no object is missed.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        for entry in page.get('Contents', []):
            key_parts = entry['Key'].split('/')
            # Skip keys ending in "/" (no file name) and keys outside the
            # requested top-level prefix.
            if key_parts[-1] != "" and key_parts[0] == object_name:
                file_path = os.path.join("S3_data", key_parts[-1])
                s3.download_file(BUCKET_NAME, entry['Key'], file_path)

def generate_queries_with_history():
    """Build a chain that rewrites a user question into alternative queries.

    The chat history is read from ``history.json``, formatted with
    ``formating_history`` and embedded in the prompt text. The returned chain
    expects an input providing ``query`` and yields the model's answer split
    on newlines.

    Returns:
        A runnable: prompt -> ``ChatOpenAI(temperature=0)`` ->
        ``StrOutputParser`` -> list of lines.

    Raises:
        FileNotFoundError: If ``history.json`` does not exist.
    """
    import json
    with open("history.json", "rb") as f:
        h_1 = json.load(f)
    # The history is spliced into the template text itself, so a literal
    # "{" or "}" in it would be parsed as a template variable. Doubling the
    # braces makes the template engine emit them verbatim.
    h_1 = formating_history(h_1).replace("{", "{{").replace("}", "}}")

    prompt = """You are an AI language model assistant. Your task is to generate 5 different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.
        Additionally, look for the relevant information from the history to provide more contextually accurate variations. Provide these alternative questions separated by newlines.
        
        history : """+ h_1
    template = prompt + """
        Original question: {query}
        """
    prompt_perspectives = ChatPromptTemplate.from_template(template)
    print(prompt_perspectives)
    generate_querie = (
        prompt_perspectives
        | ChatOpenAI(temperature=0)
        | StrOutputParser()
        # One generated question per line.
        | (lambda x: x.split("\n"))
    )
    return generate_querie

def _get(a):
    dd=[]
    for s in a:
        dd.extend(s)
    return dd

def get_unique_documents(doc_list):
    """Return the documents of ``doc_list`` without repeated ``page_content``.

    For each distinct ``page_content`` only the first document carrying it is
    kept; the relative order of the kept documents is unchanged.
    """
    first_by_content = {}
    for document in doc_list:
        # setdefault stores a document only the first time its text is seen.
        first_by_content.setdefault(document.page_content, document)
    return list(first_by_content.values())


def keyword_extractor_with_history():
    """Build a chain that asks the model for the keywords of a user query.

    The chat history is read from ``history.json``, formatted with
    ``formating_history`` and embedded in the prompt text. The returned chain
    expects an input providing ``query`` and yields the model's answer as a
    string.

    Returns:
        A runnable: prompt -> ``ChatOpenAI(temperature=0)`` ->
        ``StrOutputParser``.

    Raises:
        FileNotFoundError: If ``history.json`` does not exist.
    """
    import json
    with open("history.json", "rb") as f:
        h_1 = json.load(f)
    # The history becomes part of the template text, so literal braces in it
    # would be read as template variables. Double them to keep them verbatim.
    h_1 = formating_history(h_1).replace("{", "{{").replace("}", "}}")
    prompt = """
    You are an AI language model assistant. Your task is to help the user identify key terms in their query.
    Please list the main keywords you want to extract from your query.

    Additionally, look for the relevant information from the history to provide more contextually accurate variations.
    history : """ + h_1

    tem = prompt + """
    query: {query}
    """
    prompt_perspectives = ChatPromptTemplate.from_template(tem)
    print(prompt_perspectives)
    generate_querie = (
        prompt_perspectives
        | ChatOpenAI(temperature=0)
        | StrOutputParser()
    )
    return generate_querie

def main(Query, chunks, db):
    """Answer ``Query`` with a hybrid (vector + BM25) retrieval pipeline.

    Two retrieval branches are combined with equal weights, the merged
    results are reranked by a cross-encoder (top 4 kept), and the reranked
    documents are given to the chat model as context.

    Args:
        Query: The user question.
        chunks: Documents used to build the BM25 retriever.
        db: Vector store exposing ``as_retriever`` (a FAISS index in this file).

    Returns:
        A dict with ``"response"`` (the model's answer) and ``"context"``
        (the documents handed to the model).
    """
    faiss_retriever = db.as_retriever(search_kwargs={'k': 10})

    Bm25_retriever = BM25Retriever.from_documents(chunks)
    Bm25_retriever.k = 10

    # Branch 1: rewrite the question into several queries, run each against
    # the vector store, then flatten and de-duplicate the hits.
    # generate_queries_with_history must be *called* to obtain the chain;
    # piping the bare function would invoke it with the chain input and fail,
    # because it takes no arguments.
    map_chain = generate_queries_with_history() | faiss_retriever.map() | _get | get_unique_documents
    # Branch 2: extract keywords and look them up with BM25.
    key_chain = keyword_extractor_with_history() | Bm25_retriever | get_unique_documents

    ensemble_retriever = EnsembleRetriever(
        retrievers=[map_chain, key_chain], weights=[0.5, 0.5]
    )

    model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
    compressor = CrossEncoderReranker(model=model, top_n=4)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=ensemble_retriever
    )

    final_prompt="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
        Question: {question} 
        Context: {context} 
        Answer:"""

    final_prompt_perspectives = ChatPromptTemplate.from_template(final_prompt)

    # First stage gathers context and question; second stage produces the
    # answer while passing the context through so callers can inspect it.
    llm_chain = (
        {
            "context": itemgetter("query") | compression_retriever,
            "question": itemgetter("query"),
        }
        | RunnableParallel({
            "response": final_prompt_perspectives | ChatOpenAI(temperature=0) | StrOutputParser(),
            "context": itemgetter("context"),
        })
    )

    return llm_chain.invoke({"query": Query})

def formating_history(h):
    """Render chat history as text, one value per line.

    ``h`` is an iterable of mappings. Every value of every mapping is written
    as ``"<value> \\n"`` in iteration order. When that produces no text at all,
    the placeholder ``"No history found"`` is returned instead.
    """
    rendered = "".join(f"{value} \n" for turn in h for value in turn.values())
    # An empty string is falsy, so the placeholder is used exactly when nothing was rendered.
    return rendered or "No history found"



In [3]:
# Read the persisted chat history from disk.
import json
with open("history.json", "rb") as f:
    h_1 = json.load(f)
# NOTE(review): an empty list is formatted here, so the history loaded just
# above is discarded and h_1 becomes "No history found" — confirm this is intended.
h_1=formating_history([])

In [5]:
# Try the keyword-extraction chain on a sample query; building the chain prints its prompt template.
keyword_extractor_with_history().invoke({"query":"Tell me about yourself"})

input_variables=['query'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['query'], template='\n    You are an AI language model assistant. Your task is to help the user identify key terms in their query.\n    Please list the main keywords you want to extract from your query.\n\n    Additionally, look for the relevant information from the history to provide more contextually accurate variations.\n    history : No history found\n    query: {query}\n    '))]


'Keywords:\n1. Tell\n2. About\n3. Yourself'

In [6]:
# Try the query-rewriting chain; the result is the model output split into lines.
s=generate_queries_with_history().invoke({"query":"What is the capital of India?"})
# .invoke({"query":"What is the capital of India?","history":h_1})

input_variables=['query'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['query'], template='You are an AI language model assistant. Your task is to generate 5 different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.\n        Additionally, look for the relevant information from the history to provide more contextually accurate variations. Provide these alternative questions separated by newlines.\n        \n        history : No history found\n        Original question: {query}\n        '))]


In [7]:
s

['1. Can you tell me the capital city of India?',
 '2. Which city serves as the capital of India?',
 '3. What is the official capital of India?',
 '4. Do you know the name of the capital city in India?',
 '5. Could you provide information on the capital of India?']

In [None]:
# NOTE(review): the "history" key looks unused — the extractor's template only
# declares {query} and the function reads history.json itself. Confirm and drop it.
k=keyword_extractor_with_history().invoke({"query":"What is the capital of India?","history":h_1})

In [None]:
k

'Keywords:\n1. Capital\n2. India'

In [None]:
import pickle
from langchain_community.vectorstores import FAISS
# NOTE(security): unpickling can execute arbitrary code; only load pages.pkl
# when it comes from a trusted source.
with open("pages.pkl", "rb") as f:
    chunks = pickle.load(f)


# allow_dangerous_deserialization=True is an explicit opt-in; the same trust
# requirement applies to the "Local_vectorstore" directory.
db=FAISS.load_local("Local_vectorstore",OpenAIEmbeddings(),allow_dangerous_deserialization=True)

In [None]:

# Vector-store retriever returning the 10 nearest chunks.
faiss_retriever=db.as_retriever(search_kwargs={'k': 10})

# Keyword (BM25) retriever over the same chunks, also limited to 10 hits.
Bm25_retriever = BM25Retriever.from_documents(chunks)
Bm25_retriever.k = 10

# Branch 1: rewritten queries -> vector search per query -> flatten -> de-duplicate.
map_chain=generate_queries_with_history() | faiss_retriever.map() | _get | get_unique_documents
# Branch 2: extracted keywords -> BM25 search -> de-duplicate.
key_chain=keyword_extractor_with_history() | Bm25_retriever | get_unique_documents

# Combine both branches with equal weights.
ensemble_retriever = EnsembleRetriever(
retrievers=[map_chain, key_chain], weights=[0.5, 0.5]
)

# Rerank the combined hits with a cross-encoder and keep the best 4.
model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
compressor = CrossEncoderReranker(model=model, top_n=4)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)

# Prompt for the final answer; it is filled with {question} and {context}.
final_prompt="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:"""

final_prompt_perspectives=ChatPromptTemplate.from_template(final_prompt)




input_variables=['history', 'query'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['history', 'query'], template='You are an AI language model assistant. Your task is to generate 5 different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search.\n        Additionally, look for the relevant information from the history to provide more contextually accurate variations. Provide these alternative questions separated by newlines.\n        \n        history : {history}\n        Original question: {query}\n        '))]




In [None]:
# Run the combined retriever directly, passing both the question and the formatted history.
m=ensemble_retriever.invoke({"query":"What is the capital of India?","history":h_1})

In [None]:
len(m)

25



ValidationError: 1 validation error for ContextualCompressionRetriever
base_retriever
  Can't instantiate abstract class BaseRetriever without an implementation for abstract method '_get_relevant_documents' (type=type_error)

In [None]:
# Wrap the ensemble retriever so it receives a dict with "query" and "history".
# NOTE: m is now a composed runnable rather than a retriever object; the
# ValidationError recorded below this cell is ContextualCompressionRetriever
# rejecting it as base_retriever.
m={"query":itemgetter("query") , "history":itemgetter("history")} | ensemble_retriever

model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
compressor = CrossEncoderReranker(model=model, top_n=4)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=m
)

ValidationError: 1 validation error for ContextualCompressionRetriever
base_retriever
  Can't instantiate abstract class BaseRetriever without an implementation for abstract method '_get_relevant_documents' (type=type_error)

In [None]:
# Invoke the wrapped retriever on its own to inspect the documents it returns.
m.invoke({"query":"What is the capital of India?","history":h_1})

[Document(page_content="About the reviewer\nHitesh Hinduja is an ardent AI enthusiast working as a Senior Manager in AI at Ola \nElectric, where he leads a team of 20+ people in the areas of machine learning, deep \nlearning, statistics, computer vision, natural language processing, and reinforcement \nlearning. He has filed 14+ patents in India and the US and has numerous research \npublications under his name. Hitesh has been associated in research roles at India's \ntop B-schools: Indian School of Business, Hyderabad, and the Indian Institute of \nManagement, Ahmedabad. He is also actively involved in training and mentoring and has \nbeen invited as a guest speaker by various corporates and associations across the globe.", metadata={'source': 'Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf', 'page': 4}),
 Document(page_content='1. Install evidently :\npip in

In [None]:
# Full question-answering chain without the reranker: the ensemble retriever
# supplies "context", the raw query supplies "question", and the second stage
# returns the model's answer together with the context it was given.
llm_chain= ({"context": {"query":itemgetter("query") , "history":itemgetter("history")} | ensemble_retriever,
        "question":itemgetter("query")}
        | 
        RunnableParallel({
            "response":  final_prompt_perspectives | ChatOpenAI(temperature=0) | StrOutputParser() ,
            "context": itemgetter("context")
        })
        )

In [None]:
# Ask a sample question; the result is a dict with "response" and "context".
llm_chain.invoke({"query":"What is the capital of India?","history":h_1})

{'response': "I don't know.",
 'context': [Document(page_content="About the reviewer\nHitesh Hinduja is an ardent AI enthusiast working as a Senior Manager in AI at Ola \nElectric, where he leads a team of 20+ people in the areas of machine learning, deep \nlearning, statistics, computer vision, natural language processing, and reinforcement \nlearning. He has filed 14+ patents in India and the US and has numerous research \npublications under his name. Hitesh has been associated in research roles at India's \ntop B-schools: Indian School of Business, Hyderabad, and the Indian Institute of \nManagement, Ahmedabad. He is also actively involved in training and mentoring and has \nbeen invited as a guest speaker by various corporates and associations across the globe.", metadata={'source': 'Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf', 'page': 4}),
  Document(p