In [26]:
!pip -q install openai faiss-cpu langchain tiktoken pypdf > /dev/null

In [8]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

#compression classes and modules
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.document_compressors import EmbeddingsFilter

#pipeline classes
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline

In [4]:
# Load the PDF and split it into documents (the FAISS index is built in a
# later cell); the trailing expression shows how many documents were produced.
load_data = PyPDFLoader(file_path="/content/self-ask.pdf")
split_pages = load_data.load_and_split()

len(split_pages)

31

In [None]:
# Peek at the second loaded document to sanity-check the loader output.
split_pages[1]

In [31]:
# Re-chunk only the first two loaded documents into small pieces: at most 150
# characters each (length measured with `len`), overlapping by 15 characters.
doc_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=15,
    length_function=len,
)

# Flatten the per-document chunk lists into a single list of strings.
two_page_data = [
    chunk
    for page in split_pages[:2]
    for chunk in doc_splitter.split_text(page.page_content)
]

In [32]:
# Number of text chunks that will be embedded and indexed.
len(two_page_data)

77

In [21]:
import os
from getpass import getpass

# Never store the key in the notebook. Reuse a key that is already exported in
# the environment; otherwise prompt for it without echoing. The previous
# unconditional assignment of '' clobbered any existing key with an empty one.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [20]:
#helper function from langchain documentation
def pretty_print_docs(docs):
    """Print the text of each document under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Each entry is rendered as ``Document <n>:`` (1-based), a blank line, then
    the document text. Consecutive entries are separated by a rule of 100
    dashes on its own line. Nothing is returned; output goes to stdout.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append("Document " + str(number) + ":\n\n" + doc.page_content)
    print(divider.join(sections))

In [33]:
# Remove any index directory left over from a previous run before it is rebuilt.
!rm -fR two_page_db

In [34]:
# Embed every chunk with OpenAI embeddings and build a FAISS index from them.
index_db = FAISS.from_texts(texts=two_page_data, embedding=OpenAIEmbeddings())

# Persist the index to ./two_page_db, then expose it as the base retriever
# that the later cells query and wrap.
index_db.save_local("two_page_db")
retriever = index_db.as_retriever()

In [35]:
!zip -r two_page_db.zip /content/two_page_db/

updating: content/two_page_db/ (stored 0%)
updating: content/two_page_db/index.faiss (deflated 16%)
updating: content/two_page_db/index.pkl (deflated 56%)


In [36]:
# Baseline: query the plain FAISS retriever with no compression applied.
query = "What is compositionality Gap?"
docs = retriever.get_relevant_documents(query)
pretty_print_docs(docs)

Document 1:

We introduce the term compositionality gap to describe the fraction of compositional questions that
----------------------------------------------------------------------------------------------------
Document 2:

We next narrow the compositionality gap by using what we call elicitive prompts . Compositional
----------------------------------------------------------------------------------------------------
Document 3:

MEASURING AND NARROWING
THE COMPOSITIONALITY GAP IN LANGUAGE MODELS
----------------------------------------------------------------------------------------------------
Document 4:

the compositionality gap by reasoning explicitly instead of implicitly. We present


In [37]:
# LLM used by the LLM-based compressors in the following cells; constructed with temperature=0.
llm = OpenAI(temperature=0)

In [38]:
# Wrap the base retriever with an LLM-backed extractor so each retrieved
# chunk is post-processed by `llm` before being returned.
compressor = LLMChainExtractor.from_llm(llm=llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

In [40]:
# Same question as the baseline, now routed through the compression retriever.
query = "What is compositionality Gap?"
compressed_docs = compression_retriever.get_relevant_documents(query)

pretty_print_docs(compressed_docs)

Document 1:

We introduce the term compositionality gap to describe the fraction of compositional questions that
----------------------------------------------------------------------------------------------------
Document 2:

MEASURING AND NARROWING THE COMPOSITIONALITY GAP
----------------------------------------------------------------------------------------------------
Document 3:

the compositionality gap


In [None]:
# Alternative compressor: an LLM-backed filter built from the same `llm`.
# NOTE(review): this cell has no execution count, and `compression_retriever`
# is reassigned by the next cell before this variant is ever queried.
_filter = LLMChainFilter.from_llm(llm=llm)

compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=_filter,
)

In [51]:
# Embedding-based compressor: no LLM call, just an embeddings filter with a
# similarity threshold of 0.87.
embeddings_filter = EmbeddingsFilter(
    similarity_threshold=0.87,
    embeddings=OpenAIEmbeddings(),
)

compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=embeddings_filter,
)

In [52]:
# Run the same question through the embeddings-filter retriever.
query = "What is compositionality Gap?"
compressed_docs = compression_retriever.get_relevant_documents(query)
pretty_print_docs(compressed_docs)

Document 1:

We introduce the term compositionality gap to describe the fraction of compositional questions that


In [53]:
# First two pipeline stages: a sentence-ish splitter (separator ". ",
# chunk_size 300, no overlap) and an embeddings-based redundancy filter.
splitter = CharacterTextSplitter(
    separator=". ",
    chunk_size=300,
    chunk_overlap=0,
)
redundant_filter = EmbeddingsRedundantFilter(embeddings=OpenAIEmbeddings())

In [None]:
# Final pipeline stage: an embeddings filter with a looser threshold (0.76)
# than the standalone filter used earlier (0.87).
relevant_filter = EmbeddingsFilter(
    similarity_threshold=0.76,
    embeddings=OpenAIEmbeddings(),
)

# Transformers are listed in this order: splitter, redundancy filter,
# relevance filter.
stages = [splitter, redundant_filter, relevant_filter]
pipeline_compressor = DocumentCompressorPipeline(transformers=stages)


In [None]:
# Query through the full compressor pipeline.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=pipeline_compressor,
)

query = "What is compositionality Gap?"
# NOTE(review): the result is not displayed in the visible part of this cell;
# confirm whether a pretty_print_docs(compressed_docs) call follows.
compressed_docs = compression_retriever.get_relevant_documents(query)