In [17]:
## Converting docx into the txt format

import docx

# Load the resume and flatten its paragraphs into newline-terminated plain text.
# NOTE(review): only `doc.paragraphs` is read here; if the resume keeps content
# inside tables, that text would presumably need `doc.tables` too — confirm.
docx_file_path = 'Satthiyaraj_Resume.docx'
doc = docx.Document(docx_file_path)
text = "".join(para.text + "\n" for para in doc.paragraphs)

# Persist the plain text as UTF-8; a later cell reads this file back for chunking.
txt_file_path = 'output.txt'
with open(txt_file_path, 'w', encoding='utf-8') as out_file:
    out_file.write(text)

In [None]:
!pip install langchain
!pip install openai
!pip install tiktoken
!pip install langchain
!pip install python-docx
!pip install chromadb

In [19]:
import os
import getpass

# Read the key interactively with getpass (the typed value is masked in the
# cell output) and store it in the process environment as OPENAI_API_KEY.
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:··········


In [29]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


# Load the text produced by the conversion cell. That file was written as
# UTF-8, so it is read back with the same encoding rather than the platform
# default, and the context manager closes the handle once the read is done.
with open("output.txt", "r", encoding="utf-8") as source_file:
    full_text = source_file.read()

# Split the resume into chunks (target size 1000 characters, 10 of overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
texts = text_splitter.split_text(full_text)

# Embed the chunks with OpenAI embeddings and index them in a Chroma store;
# `retriever` is the store's retriever interface used by the following cells.
embeddings = OpenAIEmbeddings()
database = Chroma.from_texts(texts, embeddings)
retriever = database.as_retriever()

In [50]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


# Prompt for the question-answering chain. {context} and {question} are the two
# placeholders filled in when the chain runs. The text is spelled as adjacent
# literals so every newline in the prompt is explicit.
template = (
    "First analyze the document properly and Answer the question in the steps based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model constructed with the library's default arguments.
model = ChatOpenAI()


def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in order, separated by a blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Question-answering pipeline: the input is routed to the retriever (whose
# documents are flattened by format_docs) to fill "context" and passed through
# unchanged as "question"; the filled prompt goes to the chat model and the
# reply is run through StrOutputParser.
chain = (
    dict(context=retriever | format_docs, question=RunnablePassthrough())
    | prompt
    | model
    | StrOutputParser()
)

In [58]:
# Ask the chain a question; as the last expression in the cell, the returned
# answer is shown as the cell output.
chain.invoke("give me the details of best paper award")

'Based on the given context, the details of the best paper award are as follows:\n\n- The best paper award was received in the 2nd National Conference on Innovative & Emerging Trends in Engineering and Technology.\n- The conference was organized by Panimalar Institute of Technology.\n- The award was given on 14th May 2015.\n- The title of the paper for which the award was received is "Simulation of Unified Power Quality Conditioner for Mitigation of Voltage Sag and Swell".\n- The authors of the paper are S.Satthiyaraj and Dr.S.Sankar.\n- The paper was published in the International Journal of Applied Engineering Research (IJAER).\n- The ISSN of the journal is 0973-4562.\n- The paper is published in Volume 9, Number 22, in the year 2014.'

In [59]:
# Helper function for printing docs

def pretty_print_docs(docs):
    """Print documents as numbered sections separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Each section is headed ``Document <n>:`` (numbered from 1) followed by a
    blank line and the document text; sections are divided by a line of 100
    dashes. Nothing is returned.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = [
        f"Document {number}:\n\n" + doc.page_content
        for number, doc in enumerate(docs, start=1)
    ]
    print(divider.join(sections))

In [62]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Compression variant 1: build an LLMChainExtractor from a temperature-0
# completion model and use it as the compressor around the base retriever.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# Run a sample query and print whatever documents come back.
compressed_docs = compression_retriever.get_relevant_documents(
    "can you explain me about the document?"
)
pretty_print_docs(compressed_docs)



Document 1:

DECLARATION:
----------------------------------------------------------------------------------------------------
Document 2:

"I hereby declare that the details furnished above are true to the best of my knowledge and belief." Place: Puducherry


In [67]:
from langchain.retrievers.document_compressors import LLMChainFilter

# Compression variant 2: an LLMChainFilter built from the same `llm` replaces
# the extractor as the compressor around the base retriever.
_filter = LLMChainFilter.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=_filter,
)

# Run a sample query and print whatever documents come back.
compressed_docs = compression_retriever.get_relevant_documents(
    "How many international conferences has published by Dr S.Satthiyaraj"
)
pretty_print_docs(compressed_docs)






In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Compression variant 3: an EmbeddingsFilter with a similarity threshold of
# 0.25 is used as the compressor around the base retriever.
embeddings = OpenAIEmbeddings()
embeddings_filter = EmbeddingsFilter(
    similarity_threshold=0.25,
    embeddings=embeddings,
)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter,
)

# Run a sample query and print whatever documents come back.
compressed_docs = compression_retriever.get_relevant_documents(
    "tell me the total number of national conference has been published by Dr S.Satthiyaraj"
)
pretty_print_docs(compressed_docs)

In [80]:
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter

# Three stages, applied in list order: a ". "-separated splitter with chunk
# size 300 and no overlap, an embeddings-based redundancy filter, and an
# embeddings filter with a 0.6 similarity threshold.
splitter = CharacterTextSplitter(separator=". ", chunk_size=300, chunk_overlap=0)
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(similarity_threshold=0.6, embeddings=embeddings)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[
        splitter,
        redundant_filter,
        relevant_filter,
    ]
)

In [None]:
# Compression variant 4: the multi-stage pipeline_compressor is used as the
# compressor around the base retriever.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=pipeline_compressor,
)

# Run a sample query and print whatever documents come back.
compressed_docs = compression_retriever.get_relevant_documents(
    "tell me the national conferences has been published by Dr S.Satthiyaraj"
)
pretty_print_docs(compressed_docs)

In [82]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS

In [None]:
!pip install rank_bm25

In [None]:
!pip install faiss-gpu

In [87]:
# BM25 retriever built directly from the text chunks, with k set to 2.
bm25_retriever = BM25Retriever.from_texts(texts)
bm25_retriever.k = 2

# FAISS vector store over the same chunks, queried with k=2.
embedding = OpenAIEmbeddings()
faiss_vectorstore = FAISS.from_texts(texts, embedding)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs=dict(k=2))

# Combine both retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever, faiss_retriever],
)

In [None]:
# Query the combined BM25 + FAISS retriever and print the documents it returns.
docs = ensemble_retriever.get_relevant_documents("please tell me the qualification ")
pretty_print_docs(docs)