In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter # type: ignore
import boto3
import os
from azure.storage.blob import BlobServiceClient
from tqdm import tqdm
import json
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.retrievers import BM25Retriever
from langchain.prompts import ChatPromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain.retrievers import EnsembleRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnableParallel
import numpy as np
from dotenv  import load_dotenv
load_dotenv()
import shutil

def load_file(file_name):
    """Load one document from disk and split it into overlapping text chunks.

    The loader is selected from the file extension, matched case-insensitively
    so that e.g. ``report.PDF`` is handled the same way as ``report.pdf``.
    Supported extensions: ``.pptx``, ``.pdf``, ``.docx`` and ``.html``.

    Args:
        file_name: Path of the file to load.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents`` (chunk size 1000,
        overlap 300). Files with any other extension yield an empty list.
    """
    # splitext only looks at the last path component, so dots in directory
    # names cannot be mistaken for a file extension.
    extension = os.path.splitext(file_name)[1].lower()
    if extension not in (".pptx", ".pdf", ".docx", ".html"):
        # Unsupported type: there is nothing to load, hence nothing to split.
        return []

    if extension == ".pptx":
        documents = UnstructuredPowerPointLoader(file_name).load()
    elif extension == ".pdf":
        documents = PyPDFLoader(file_name).load()
    elif extension == ".docx":
        documents = Docx2txtLoader(file_name).load()
    else:
        documents = UnstructuredHTMLLoader(file_name).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=300,
    )
    return text_splitter.split_documents(documents)


def file_to_chunks(folder):
    """Load and chunk every file found directly inside ``folder``.

    Each directory entry is passed to ``load_file`` and the resulting chunks
    are concatenated in ``os.listdir`` order.

    Args:
        folder: Directory whose files should be loaded.

    Returns:
        A flat list with the chunks of all files in ``folder``.

    Side effects:
        Unless ``folder`` is exactly ``"Local_data"``, the whole directory is
        deleted with ``shutil.rmtree`` once its files have been loaded.
    """
    pages = []
    for file_name in os.listdir(folder):
        # os.path.join uses the platform separator instead of a hard-coded
        # backslash, so the function also works outside Windows.
        pages.extend(load_file(os.path.join(folder, file_name)))
    if folder != "Local_data":
        shutil.rmtree(folder)
    return pages

def azure_data_download(AZURE_CONNECTION_STRING,CONTAINER_NAME):
    """Download every blob of an Azure Storage container into ``Azure_data``.

    Args:
        AZURE_CONNECTION_STRING: Connection string of the storage account.
        CONTAINER_NAME: Name of the container whose blobs are downloaded.

    Side effects:
        Creates the local ``Azure_data`` directory if needed and writes one
        file per blob, named after the blob. Existing files are overwritten.
    """
    blob_service_client = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
    container_client = blob_service_client.get_container_client(CONTAINER_NAME)

    download_dir = "Azure_data"
    os.makedirs(download_dir, exist_ok=True)

    for blob in container_client.list_blobs():
        # Build the path with the platform separator rather than a literal backslash.
        target_path = os.path.join(download_dir, blob.name)
        # Blob names may contain "/" (virtual folders); make sure the parent
        # directory exists before opening the file for writing.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        blob_client = container_client.get_blob_client(blob)
        with open(target_path, "wb") as file:
            data = blob_client.download_blob().readall()
            file.write(data)


def aws(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, BUCKET_NAME,object_name):
    """Download the files stored under one top-level "folder" of an S3 bucket.

    Args:
        AWS_ACCESS_KEY_ID: Access key id used to create the S3 client.
        AWS_SECRET_ACCESS_KEY: Secret access key used to create the S3 client.
        BUCKET_NAME: Bucket to read from.
        object_name: First path segment a key must have to be downloaded.

    Side effects:
        Creates the local ``S3_data`` directory if needed and saves every
        matching object there under its base name (the part after the last
        ``/``). Objects sharing a base name overwrite each other.
    """
    # Create an S3 client
    s3 = boto3.client('s3',
                    aws_access_key_id=AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

    os.makedirs("S3_data", exist_ok=True)

    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows continuation tokens so larger buckets are listed completely.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        for entry in page.get('Contents', []):
            key = entry['Key']
            key_parts = key.split('/')
            base_name = key_parts[-1]
            # Skip "directory" placeholder keys (ending in "/") and keys that
            # live outside the requested top-level prefix.
            if base_name != "" and key_parts[0] == object_name:
                file_path = os.path.join("S3_data", base_name)
                s3.download_file(BUCKET_NAME, key, file_path)


def l(h):
    """Render a chat history as one string, one value per line.

    Args:
        h: Iterable of mappings; only the values of each mapping are used.

    Returns:
        Every value formatted as ``"<value> \\n"`` and concatenated in order,
        or ``"No history found"`` when there are no values at all.
    """
    rendered_lines = [f"{value} \n" for turn in h for _, value in turn.items()]
    history_text = "".join(rendered_lines)
    return history_text if history_text != "" else "No history found"


def generate_queries(query):
    """Build a chain that expands a question into several alternative phrasings.

    Args:
        query: The original user question. It is always placed first in the
            list the chain produces.

    Returns:
        A runnable made of the multi-query prompt, ``ChatOpenAI(temperature=0)``
        and ``StrOutputParser``. When invoked with the question it yields a
        list of strings: ``query`` followed by the model's alternatives, one
        per non-empty output line.
    """

    # Multi Query: Different Perspectives
    template = """You are an AI language model assistant. Your task is to generate Four 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines. Original question: {question}"""
    prompt_perspectives = ChatPromptTemplate.from_template(template)

    def _to_query_list(llm_text):
        # The model may separate questions with blank lines; drop those so no
        # empty query string is handed to the retriever downstream.
        alternatives = [line.strip() for line in llm_text.split("\n") if line.strip()]
        return [query] + alternatives

    return (
        prompt_perspectives
        | ChatOpenAI(temperature=0)
        | StrOutputParser()
        | _to_query_list
    )

def keyword_extractor():
    """Build a chain that asks the chat model for the key terms of a query.

    Returns:
        A runnable composed of the keyword prompt (which expects a ``query``
        variable), ``ChatOpenAI(temperature=0)`` and ``StrOutputParser``.
    """
    keyword_template = """
    You are an AI language model assistant. Your task is to help the user identify key terms in their query.

    Please list the main keywords you want to extract from your query.

    query: {query}
    """
    keyword_prompt = ChatPromptTemplate.from_template(keyword_template)
    chat_model = ChatOpenAI(temperature=0)
    to_text = StrOutputParser()
    return keyword_prompt | chat_model | to_text

def _get(a):
    dd=[]
    for s in a:
        dd.extend(s)
    return dd

def get_unique_documents(doc_list):
    """Drop documents whose ``page_content`` has already been seen.

    Args:
        doc_list: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A list holding the first document for each distinct ``page_content``,
        in order of first appearance.
    """
    # Dicts keep insertion order, and setdefault leaves the first document
    # stored for a given content untouched.
    first_by_content = {}
    for document in doc_list:
        first_by_content.setdefault(document.page_content, document)
    return list(first_by_content.values())

def meta(s):
    """Format the metadata of each document as a printable string.

    Args:
        s: Iterable of objects exposing a ``metadata`` mapping.

    Returns:
        One string per document, made of ``"<key> : <value> \\n"`` for every
        metadata entry (an empty string when the metadata is empty).
    """
    return [
        "".join(f"{key} : {value} \n" for key, value in document.metadata.items())
        for document in s
    ]

In [2]:
# pages = file_to_chunks("Local_data")

In [3]:
# import pickle
# with open("pages.pkl", "wb") as f:
#     # pickle.dump(pages, f)

In [4]:
import pickle

# Restore the cached document chunks from pages.pkl instead of re-parsing the
# source files (presumably written by the commented-out pickle.dump cell — confirm).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("pages.pkl", "rb") as f:
    pages1 = pickle.load(f)


In [5]:
# pages==pages1

True

In [6]:
from langchain_community.vectorstores import FAISS
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
# db = FAISS.from_documents(pages, OpenAIEmbeddings())

In [7]:
# db.save_local("Local_vectorstore")

# Reload the FAISS index saved under "Local_vectorstore" rather than rebuilding it.
# allow_dangerous_deserialization=True opts in to loading pickled data, so the
# directory must come from a trusted source.
db=FAISS.load_local("Local_vectorstore",OpenAIEmbeddings(),allow_dangerous_deserialization=True)

In [8]:
# Example user question used by the cells below.
Query="tell me more about shyam sundar's education"

In [9]:
# Sanity check: show the original question followed by the LLM-generated rephrasings.
generate_queries(Query).invoke(Query)

["tell me more about shyam sundar's education",
 '1. What are the educational qualifications of Shyam Sundar?',
 "2. Can you provide details about Shyam Sundar's academic background?",
 '3. What is known about the educational history of Shyam Sundar?',
 '4. Could you share information regarding the schooling and higher education of Shyam Sundar?']

In [10]:

# Dense retriever: top-10 nearest neighbours from the FAISS index.
faiss_retriever=db.as_retriever(search_kwargs={'k': 10})

# Sparse retriever over the same chunks. `pages1` is the list loaded from
# pages.pkl; the name `pages` is only assigned in a commented-out cell and
# would raise NameError after Restart & Run All.
Bm25_retriever = BM25Retriever.from_documents(pages1)
Bm25_retriever.k = 10

# Keyword branch: the LLM's keyword list is used as the BM25 query.
key_chain=keyword_extractor() | Bm25_retriever
# Multi-query branch: rephrase the question, run every variant through FAISS,
# flatten the per-query result lists and drop duplicate chunks.
map_chain=generate_queries | faiss_retriever.map() | _get | get_unique_documents
# Combine both branches with equal weight.
ensemble_retriever = EnsembleRetriever(
retrievers=[map_chain, key_chain], weights=[0.5, 0.5]
)
# Re-rank the combined candidates with a cross-encoder and keep the best 4.
model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
compressor = CrossEncoderReranker(model=model, top_n=4)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)

final_prompt="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:"""

final_prompt_perspectives=ChatPromptTemplate.from_template(final_prompt)

# Input: {"query": <question>}.
# Output: {"response": <answer text>, "context": <re-ranked documents>} so the
# sources behind the answer can be inspected afterwards.
llm_chain3= ({"context": itemgetter("query") | compression_retriever,
            "question":itemgetter("query")}
            | 
            RunnableParallel({
                "response":  final_prompt_perspectives | ChatOpenAI(temperature=0) | StrOutputParser() ,
                "context": itemgetter("context")
            })
            )



In [11]:
# Run the full retrieve -> re-rank -> answer chain; `s` holds the "response" and "context" keys.
s=llm_chain3.invoke({"query":Query})

Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Forbidden"}\')')
Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Forbidden"}\')')
Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Forbidden"}\')')
Failed to batch ingest runs: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.smith.langchain.com', port=443): Read timed out. (read timeout=90.001)"))
Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/ba

In [12]:
s

{'response': 'Shyam Sundar holds an M.Tech in Modelling and Simulation from the Defence Institute of Advanced Technology in Pune with a GPA of 7.95 obtained in May 2023. He also has a B.Tech in Chemical Engineering from the National Institute of Technology in Tiruchirappalli with a GPA of 7.65 earned in May 2021. Additionally, he has taken relevant coursework in Data Structures, Machine Learning, Deep Learning, and other technical skills.',
 'context': [Document(page_content='Shyam Sundar\n5,kavimani steet, Pankajam colony, 3rd cross street, Madurai, India - 625009\n♂phone+91-9080765574 /envel⌢pemailshyamsundar.2022@gmail.com /linkedinshyamsundar007 /githubShyam-Sundar-7♂laptopPortfolio\nProjects\nLLM-Powered Coupon Recommender /github/video |Python, Streamlit, Langchain, OpenAI November 2023\n•Developed a QA system for e-commerce with personalized coupon recommendations using OpenAI’s LLMs.\n•Streamlined user interactions through a Streamlit interface and Langchain for real-world scen

In [13]:
s["response"]

'Shyam Sundar holds an M.Tech in Modelling and Simulation from the Defence Institute of Advanced Technology in Pune with a GPA of 7.95 obtained in May 2023. He also has a B.Tech in Chemical Engineering from the National Institute of Technology in Tiruchirappalli with a GPA of 7.65 earned in May 2021. Additionally, he has taken relevant coursework in Data Structures, Machine Learning, Deep Learning, and other technical skills.'

In [14]:
# List the metadata of each retrieved chunk that was passed to the model as context.
meta(s["context"])

['source : Local_data\\resume.pdf \npage : 0 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 4 \n',
 'source : Local_data\\Natu Lauchande - Machine Learning Engineering with MLflow_ Manage the end-to-end machine learning life cycle with MLflow (2021, Packt Publishing) - libgen.li.pdf \npage : 3 \n',
 'source : Local_data\\resume.pdf \npage : 0 \n']