# Project: Question-Answering on Private Documents

## Here is all the package needed

In [35]:
pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [36]:
pip install Cohere

Note: you may need to restart the kernel to use updated packages.


In [37]:
!pip install pypdf -q
#Once install you don't need to explicitly import pypdf after installing it, because it's already there ready to use

In [38]:
import pypdf

In [39]:
!pip install docx2txt -q
#Once install you don't need to explicitly import pypdf after installing it, because it's already there ready to use

In [40]:
import docx2txt

In [41]:
import os
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv(find_dotenv(), override=True)

# You can add other package here if needed


True

### Loading Documents

In [42]:
# The goal is to loading PDF, DOCX and TXT files as LangChain Documents
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.text import TextLoader
def load_document(file):

    name, extension = os.path.splitext(file) # check splitext()

    if extension == '.pdf':
        # Find how to load PDF document with https://python.langchain.com/docs/modules/data_connection/document_loaders/
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        # Find how to load DOCX document with https://python.langchain.com/docs/modules/data_connection/document_loaders/
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        # Find how to load TXT document with https://python.langchain.com/docs/modules/data_connection/document_loaders/
        loader= TextLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()

    # Verification
    if len(data) != 0:
        return data
    else:
        print('Document is empty')
        return None
  

#### Verification 

In [44]:
pdf_file="./documents/churchill_speech.pdf"
docx_file="./documents/churchill_speech.docx"
text_file="./documents/churchill_speech.txt"

print(load_document(pdf_file))
#print(load_document(docx_file))
#print(load_document(text_file))

Ignoring wrong pointing object 6 0 (offset 0)


Loading ./documents/churchill_speech.pdf
[Document(page_content="Winston Churchill Speech - We Shall Fight on the Beaches We Shall Fight on the Beaches June 4, 1940 House of Commons From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the second week of May, only a rapid retreat to Amiens and the south could have saved the BriGsh and French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact was not immediately realized. The French High Command hoped they would be able to close the gap, and the Armies of the north were under their orders. Moreover, a reGrement of this kind would have involved almost certainly the destrucGon of the ﬁne Belgian Army of over 20 divisions and the abandonment of the whole of Belgium. Therefore, when the force and scope of the German penetraGon were realized and when a new French Generalissimo, General Weygand, assumed command in place of General Gamelin, an eﬀort was made by the F

### Chunking Data

In [54]:
from langchain_text_splitters import CharacterTextSplitter
def chunk_data(data, chunk_size=256):
    # Find how to load a built-in document transformers that make it easy to split with https://python.langchain.com/docs/modules/data_connection/document_transformers/
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256, # maximum size of text chunk 
        chunk_overlap=10, # Specifies overlapping characters between chunks.
    )
    # Split the text
    chunks = text_splitter.create_documents([data])
    
    # Verification
    if len(chunks) != 0:
        print(f'Now you have {len(chunks)}')
        return chunks
    else:
        print('Document is not split')
        return None
    

#### Verification

In [55]:
chunks = chunk_data(str(load_document(text_file)))
print(type(chunks))

Now you have 88
<class 'list'>


### Embedding and Uploading to a Vector Database (Pinecone)

In [57]:
from langchain.embeddings import CohereEmbeddings

# Embeddings instance
embeddings = CohereEmbeddings()

# Take text chunk 
chunk = chunks[20]

# Embed the text into a vector 
vector = embeddings.embed_query(chunk.page_content)

# Print the chunk
print(chunk.page_content)

and had they not sought refuge in what was proved to be a fatal neutrality, the French and\nBritish Armies might well at the outset have saved not only Belgium but perhaps even Poland. Yet at\nthe last moment, when Belgium was already invaded, King


In [59]:
def insert_or_fetch_embeddings(index_name, chunks):
    from pinecone import Pinecone, PodSpec
    from langchain.vectorstores import Pinecone as Pinecone_langchain
    from langchain.embeddings import CohereEmbeddings

    # Find how to create embeddings instance 
    # embeddings = ?
    embeddings = CohereEmbeddings()

    # Initialize Pinecone  with API key 
    # pinecone = ?
    pinecone = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
    
    if index_name in pinecone.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        # Create index
        # TO DO
        # Index the text chunks into Pinecone
        # TO DO - use chunks/embeddings/index_name
        pinecone.create_index(index_name, dimension=4096, metric='cosine',spec=PodSpec(environment="gcp-starter"))
        vector_store = Pinecone_langchain.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        
    return vector_store
    

#### Verification

In [69]:
from pinecone import Pinecone, PodSpec
from langchain.vectorstores import Pinecone as Pinecone_langchain
from langchain.embeddings import CohereEmbeddings

index_name="ajnaki-test-api"

vectorstore = insert_or_fetch_embeddings(index_name, chunks)

Creating index ajnaki-test-api and embeddings ...

ForbiddenException: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'x-pinecone-api-version': '2024-04', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': 'c81b30bdd08e92e3b64297039991d3bf', 'Date': 'Wed, 01 May 2024 20:17:38 GMT', 'Server': 'Google Frontend', 'Content-Length': '142', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"FORBIDDEN","message":"Request failed. Your free plan supports 1 starter indexes. Use a different index type."},"status":403}


In [70]:
type(vectorstore)

NameError: name 'vectorstore' is not defined

In [71]:
# can be helpful
def delete_pinecone_index(index_name='all'):
    
    if index_name == 'all':
        indexes = pinecone.list_indexes().names()
        print('Deleting all indexes ... ')
        for index in indexes:
            pinecone.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('Ok')
    

### Asking and Getting Answers

In [72]:
def ask_and_get_answer(vector_store, q):
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatCohere
    from langchain_community.llms import Cohere


    # llm = ?
    llm = Cohere(temperature=0.75, cohere_api_key=os.environ.get('COHERE_API_KEY'))
    # retriever = ?
    # Create retriever from vector store
    retriever = vector_store.as_retriever(
        # specifies to use semantic similarity search against the Pinecone index
        search_type='similarity', 
        # Here we set k=10 to retrieve the top 10 most similar results
        search_kwargs={'k': 10})
    # chain = ?
    chain = RetrievalQA.from_chain_type(llm=llm, 
                                # builds a "stuff" chain that retrieves context for questions
                                # you can have more details with - https://chat.langchain.com/
                                chain_type="stuff", 
                                retriever=retriever)
    answer = chain.invoke(q)
    return answer

### Running Code in order to complete this project

In [74]:
data = load_document(text_file)

print(f'You have {len(data)} pages in your data')
#print(f'There are {len(data[10].page_content)} characters in the page')

You have 1 pages in your data


In [76]:
index_name = 'ajnaki-test-api'
vector_store = insert_or_fetch_embeddings(index_name)

TypeError: insert_or_fetch_embeddings() missing 1 required positional argument: 'chunks'

In [77]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

NameError: name 'vector_store' is not defined

In [None]:
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    
    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

    

In [None]:
delete_pinecone_index()