In [209]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter # type: ignore
import boto3
import os
from azure.storage.blob import BlobServiceClient
from tqdm import tqdm
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.retrievers import BM25Retriever
from langchain.prompts import ChatPromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain.retrievers import EnsembleRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnableParallel
import numpy as np
from dotenv  import load_dotenv
load_dotenv()
import shutil

def load_file(file_name):
    """Load one document from disk and split it into overlapping text chunks.

    The loader is chosen from the file extension (``pptx``, ``pdf``, ``docx``
    or ``html``). The comparison is case-insensitive, so ``REPORT.PDF`` is
    handled the same way as ``report.pdf``.

    Args:
        file_name: Path of the file to load.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``
        (chunk size 1000, overlap 200, measured with ``len``). An empty list
        is returned when the extension is not one of the supported types.
    """
    extension = file_name.rsplit(".", 1)[-1]
    print(extension)
    kind = extension.lower()

    if kind == "pptx":
        documents = UnstructuredPowerPointLoader(file_name).load()
    elif kind == "pdf":
        documents = PyPDFLoader(file_name).load()
    elif kind == "docx":
        documents = Docx2txtLoader(file_name).load()
    elif kind == "html":
        documents = UnstructuredHTMLLoader(file_name).load()
    else:
        # Unsupported type: there is nothing to split, so skip the splitter.
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def file_to_chunks(folder):
    """Load and chunk every file found directly inside *folder*.

    Each directory entry is passed to ``load_file``; the resulting chunks are
    concatenated in ``os.listdir`` order.

    Side effect: unless *folder* is exactly ``"Local_data"``, the folder is
    deleted with ``shutil.rmtree`` once its files have been processed.

    Args:
        folder: Directory whose entries should be loaded.

    Returns:
        A flat list with the chunks of all files in the folder.
    """
    pages = []
    for file_name in os.listdir(f"{folder}"):
        # os.path.join picks the separator of the running platform instead of
        # a hard-coded backslash, so this also works outside Windows.
        pages.extend(load_file(os.path.join(f"{folder}", file_name)))
    if folder != "Local_data":
        shutil.rmtree(f"{folder}")
    return pages

def azure_data_download(AZURE_CONNECTION_STRING,CONTAINER_NAME):
    """Download every blob of an Azure container into the ``Azure_data`` folder.

    Files are stored flat: only the part of the blob name after the last
    ``/`` is used as the local file name, so blobs that share a base name in
    different virtual directories overwrite each other.

    Args:
        AZURE_CONNECTION_STRING: Connection string of the storage account.
        CONTAINER_NAME: Name of the container to download.
    """
    blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
    container_client = blob_service_client.get_container_client(CONTAINER_NAME)
    os.makedirs("Azure_data", exist_ok=True)
    for blob in container_client.list_blobs():
        local_name = blob.name.split("/")[-1]
        if not local_name:
            # Names ending in "/" have no file part to write locally.
            continue
        blob_client = container_client.get_blob_client(blob.name)
        # Start the download before creating the file so a failed request
        # does not leave an empty file behind.
        downloader = blob_client.download_blob()
        with open(os.path.join("Azure_data", local_name), "wb") as file:
            # Write straight into the file instead of buffering the whole
            # blob in memory first.
            downloader.readinto(file)


def aws(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, BUCKET_NAME,object_name):
    """Download the files stored under one top-level key of an S3 bucket.

    A key is downloaded when its first ``/``-separated segment equals
    *object_name* and its last segment is non-empty. Files are written flat
    into the ``S3_data`` folder under their last key segment.

    Args:
        AWS_ACCESS_KEY_ID: Access key id used to create the S3 client.
        AWS_SECRET_ACCESS_KEY: Secret access key used to create the S3 client.
        BUCKET_NAME: Bucket to read from.
        object_name: First key segment ("folder") whose files are downloaded.
    """
    # Create an S3 client
    s3 = boto3.client('s3',
                    aws_access_key_id=AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    os.makedirs("S3_data", exist_ok=True)

    # A single list_objects_v2 call returns only one page of keys; the
    # paginator follows continuation tokens so larger buckets are fully listed.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        for entry in page.get('Contents', []):
            key = entry['Key']
            segments = key.split('/')
            # An empty last segment means the key ends in "/" (no file name).
            if segments[-1] != "" and segments[0] == object_name:
                file_path = os.path.join("S3_data", segments[-1])
                s3.download_file(BUCKET_NAME, key, file_path)


def generate_queries(query):
    """Build a chain that rewrites a question into several alternative queries.

    The returned runnable fills the ``{question}`` slot of the prompt, asks
    ``ChatOpenAI`` (temperature 0) for alternative phrasings and turns the
    reply into a list of strings. Note that this function returns the chain
    itself; the list is produced when the chain is invoked.

    Args:
        query: The original user question. It is always placed first in the
            list the chain produces.

    Returns:
        A runnable whose output is ``[query, alternative_1, alternative_2, ...]``.
    """
    # Multi Query: Different Perspectives
    template = """You are an AI language model assistant. Your task is to generate Four 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines. Original question: {question}"""
    prompt_perspectives = ChatPromptTemplate.from_template(template)

    def _to_query_list(llm_text):
        # One alternative per line. Blank lines (e.g. when the model separates
        # questions with an empty line) are dropped so that no empty query is
        # handed to the retrievers downstream.
        alternatives = [line.strip() for line in llm_text.split("\n") if line.strip()]
        return [query] + alternatives

    generate_querie = (
        prompt_perspectives
        | ChatOpenAI(temperature=0)
        | StrOutputParser()
        | _to_query_list
    )
    return generate_querie


def keyword_extractor():
    """Return a chain that asks the chat model for the key terms of a query.

    The chain fills the ``{query}`` slot of the prompt, sends it to
    ``ChatOpenAI`` at temperature 0 and parses the reply into a plain string.
    """
    keyword_prompt = ChatPromptTemplate.from_template("""
    You are an AI language model assistant. Your task is to help the user identify key terms in their query.

    Please list the main keywords you want to extract from your query.

    query: {query}
    """)
    chat_model = ChatOpenAI(temperature=0)
    return keyword_prompt | chat_model | StrOutputParser()

def _get(a):
    dd=[]
    for s in a:
        dd.extend(s)
    return dd

def get_unique_documents(doc_list):
    """Drop documents whose ``page_content`` was already seen.

    The first document for each distinct content is kept, and the original
    order of those first occurrences is preserved.
    """
    first_by_content = {}
    for candidate in doc_list:
        # setdefault only stores the document the first time a content appears.
        first_by_content.setdefault(candidate.page_content, candidate)
    return list(first_by_content.values())

In [210]:
# Load and chunk every file in Local_data (file_to_chunks does not delete this folder).
pages = file_to_chunks("Local_data")

pptx
pdf
pdf
docx
docx
html
docx


In [211]:
pages

[Document(page_content='FREE TEST DATA\n\nPPT FILE\n\n\n\nFREE TEST DATA\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Donec iaculis elit nec ante eleifend, eget cursus tortor auctor. Integer at ultrices lorem, eget bibendum turpis. Praesent lacus sapien, ullamcorper.\n\nFTD\n\nFREE TEST DATA\n\n2\n\n\n\nFREE TEST DATA\n\nNullam nisl ante, pellentesque eu iaculis sit amet, scelerisque et leo. Nunc fringilla tempus odio nec posuere. Vivamus blandit dignissim ante, quis sollicitudin lorem hendrerit id. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nunc enim arcu, bibendum non blandit auctor, tempus et mi. Vestibulum ligula nisi, feugiat sit amet posuere eget, convallis interdum risus. Donec non blandit dui. In tempor viverra metus ut mollis. Cras cursus, velit eu iaculis pellentesque, enim lacus imperdiet nisl, sit amet mattis mi mi ornare magna. Vestibulum cursus ex at urna volutpat, et eleifend nisi cursus. Cras at iaculis risus.\n\nFTD\n\nFREE TEST DATA\n

In [212]:
# Chunk the resume on its own; os.path.join keeps the path portable instead of
# hard-coding the Windows separator.
resume = load_file(os.path.join("Local_data", "resume.pdf"))

pdf


In [213]:
resume

[Document(page_content='Shyam Sundar\n5,kavimani steet, Pankajam colony, 3rd cross street, Madurai, India - 625009\n♂phone+91-9080765574 /envel⌢pemailshyamsundar.2022@gmail.com /linkedinshyamsundar007 /githubShyam-Sundar-7♂laptopPortfolio\nProjects\nLLM-Powered Coupon Recommender /github/video |Python, Streamlit, Langchain, OpenAI November 2023\n•Developed a QA system for e-commerce with personalized coupon recommendations using OpenAI’s LLMs.\n•Streamlined user interactions through a Streamlit interface and Langchain for real-world scenario simulations.\n•Incorporated FAISS for refined recommendation processes.\nPeopleCare Insurance Prediction /github |Python, Jupyter, Azure Cloud, Flask, Docker October 2023\n•Expanded PeopleCare into vehicle insurance with a predictive model for effective customer targeting.\n•Thorough analysis of customer behavior and data cleaning for accurate predictive modeling.\n•Achieved 80% prediction accuracy using LightGBM.', metadata={'source': 'Local_data\

In [214]:
from langchain_community.vectorstores import FAISS
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
# Build a FAISS vector store from the chunks, embedding them with OpenAIEmbeddings.
db = FAISS.from_documents(pages, OpenAIEmbeddings())

In [215]:
# Persist the vector store to the "Local_vectorstore" folder.
db.save_local("Local_vectorstore")

# Reload the store that was just written.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored data; that is acceptable here because the folder was produced by the
# line above, but it must not be pointed at an untrusted folder.
db=FAISS.load_local("Local_vectorstore",OpenAIEmbeddings(),allow_dangerous_deserialization=True)

In [216]:
# Sample question used by the cells below.
Query="tell me more about the shyam sundar's project"

In [217]:

# Vector-store retriever over the FAISS index, configured with k=10.
faiss_retriever=db.as_retriever(search_kwargs={'k': 10})

# BM25 (keyword-based) retriever over the same chunks, also limited to 10 results.
Bm25_retriever = BM25Retriever.from_documents(pages)
Bm25_retriever.k = 10


In [218]:
# Build the multi-query chain for Query and run it once to inspect the generated variants.
generate_queries(Query).invoke({"question":Query})

["tell me more about the shyam sundar's project",
 '1. What additional information can you provide about the project led by Shyam Sundar?',
 '2. Can you elaborate further on the project associated with Shyam Sundar?',
 "3. Could you share more details about Shyam Sundar's project?",
 '4. What else can you tell me about the project that involves Shyam Sundar?']

In [219]:
# generate_queries is piped in as a function (not called here), so it runs on the
# chain input; faiss_retriever.map() then retrieves for each generated query,
# _get flattens the per-query result lists and get_unique_documents removes
# chunks with duplicate content.
map_chain=generate_queries | faiss_retriever.map() | _get | get_unique_documents

In [220]:
# Extract keywords from the query with the LLM, then feed them to the BM25 retriever.
key_chain=keyword_extractor() | Bm25_retriever

In [221]:
# Combine the multi-query FAISS chain and the keyword/BM25 chain with equal weights.
ensemble_retriever = EnsembleRetriever(
retrievers=[map_chain, key_chain], weights=[0.5, 0.5]
)

In [222]:
# Cross-encoder used to re-rank the documents returned by the ensemble retriever.
model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
# Keep only the 4 best-ranked documents after re-ranking.
compressor = CrossEncoderReranker(model=model, top_n=4)
# Retriever that runs the ensemble first and then applies the re-ranker to its results.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)

# Answer-generation prompt: expects the user question and the retrieved context.
final_prompt="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:"""

# Chat prompt template with the {question} and {context} slots defined above.
final_prompt_perspectives=ChatPromptTemplate.from_template(final_prompt)



In [223]:
       
# End-to-end chain. Input: a dict with a "query" key.
# Step 1 builds {"context": <documents retrieved for the query>, "question": <the query>}.
# Step 2 runs two branches on that dict: "response" generates the answer text,
# while "context" passes the retrieved documents through so they are returned too.
llm_chain3= ({"context": itemgetter("query") | compression_retriever,
            "question":itemgetter("query")}
            | 
            RunnableParallel({
                "response":  final_prompt_perspectives | ChatOpenAI(temperature=0) | StrOutputParser() ,
                "context": itemgetter("context")
            })
            )

In [224]:
llm_chain3

{
  context: RunnableLambda(itemgetter('query'))
           | ContextualCompressionRetriever(base_compressor=CrossEncoderReranker(model=HuggingFaceCrossEncoder(client=<sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder object at 0x000002101A7F38F0>, model_name='cross-encoder/ms-marco-MiniLM-L-6-v2', model_kwargs={}), top_n=4), base_retriever=EnsembleRetriever(retrievers=[RunnableLambda(generate_queries)
             | RunnableEach(bound=VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000021017604530>, search_kwargs={'k': 10}))
             | RunnableLambda(_get)
             | RunnableLambda(get_unique_documents), ChatPromptTemplate(input_variables=['query'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['query'], template='\n    You are an AI language model assistant. Your task is to help the user identify key terms in their query.\n\n    Please list the main keywor

In [225]:
# Run the full chain; s is a dict with the "response" text and the "context" documents.
s=llm_chain3.invoke({"query":Query})

In [226]:
s

{'response': "Shyam Sundar's project includes a LLM-Powered Coupon Recommender and PeopleCare Insurance Prediction. The LLM-Powered Coupon Recommender project involved developing a QA system for e-commerce with personalized coupon recommendations using OpenAI’s LLMs. The PeopleCare Insurance Prediction project expanded PeopleCare into vehicle insurance with a predictive model for effective customer targeting.",
 'context': [Document(page_content='Shyam Sundar\n5,kavimani steet, Pankajam colony, 3rd cross street, Madurai, India - 625009\n♂phone+91-9080765574 /envel⌢pemailshyamsundar.2022@gmail.com /linkedinshyamsundar007 /githubShyam-Sundar-7♂laptopPortfolio\nProjects\nLLM-Powered Coupon Recommender /github/video |Python, Streamlit, Langchain, OpenAI November 2023\n•Developed a QA system for e-commerce with personalized coupon recommendations using OpenAI’s LLMs.\n•Streamlined user interactions through a Streamlit interface and Langchain for real-world scenario simulations.\n•Incorporat

In [227]:
s["response"]

"Shyam Sundar's project includes a LLM-Powered Coupon Recommender and PeopleCare Insurance Prediction. The LLM-Powered Coupon Recommender project involved developing a QA system for e-commerce with personalized coupon recommendations using OpenAI’s LLMs. The PeopleCare Insurance Prediction project expanded PeopleCare into vehicle insurance with a predictive model for effective customer targeting."

In [228]:
def meta(s):
    """Render the metadata of each document in *s* as one text block.

    Every metadata entry becomes a ``"key : value \\n"`` line; the result
    holds one such string per document, in input order.
    """
    return [
        "".join(f"{key} : {value} \n" for key, value in document.metadata.items())
        for document in s
    ]

In [229]:
meta(s["context"])

['source : Local_data\\resume.pdf \npage : 0 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 4 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 43 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 125 \n']

In [230]:
# Show the raw metadata dict of every document that was used as context.
for i in s["context"]:
    print(i.metadata)

{'source': 'Local_data\\resume.pdf', 'page': 0}
{'source': 'Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf', 'page': 4}
{'source': 'Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf', 'page': 43}
{'source': 'Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf', 'page': 125}


In [231]:
# Format the metadata of the first context document as "key : value" lines and print it.
d=""
for i,y in s["context"][0].metadata.items():
    d=d+f"{i} : {y}\n"
print(d)

source : Local_data\resume.pdf
page : 0



In [232]:
type(meta(s["context"]))

list

In [233]:
s["context"][0].metadata["page"]

0

In [234]:
meta(s["context"])

['source : Local_data\\resume.pdf \npage : 0 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 4 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 43 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 125 \n']