# Project: Question-Answering on Private Documents

In [None]:
# Environment setup: locate a .env file with find_dotenv() and load it.
# override=True is passed so values from the file take precedence over
# variables that are already set. The Pinecone helpers further down read
# PINECONE_API_KEY and PINECONE_ENV from os.environ.
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

In [2]:
pip install pypdf -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install docx2txt -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


In [5]:
def load_document(file):
    """Load a PDF or DOCX document with the matching LangChain loader.

    The loader is selected from the file extension. The comparison is
    case-insensitive, so ``Report.PDF`` and ``notes.DOCX`` are accepted.

    Args:
        file: Path or URL of the document. Only its extension is inspected
            here; the value is handed to the loader unchanged.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None``
        (after printing a message) when the extension is neither ``.pdf``
        nor ``.docx``.
    """
    import os

    # splitext keeps the extension exactly as typed, so normalise its case
    # before comparing.
    _, extension = os.path.splitext(file)
    extension = extension.lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    else:
        print('Document format is not supported.')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search text passed to the loader.
        lang: Wikipedia language code passed to the loader (default ``'en'``).
        load_max_docs: Forwarded to the loader's ``load_max_docs`` argument.

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    loader_options = dict(query=query, lang=lang, load_max_docs=load_max_docs)
    return WikipediaLoader(**loader_options).load()

In [6]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, as returned by the loader helpers.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. The
            default of 0 reproduces the previous hard-coded behaviour.

    Returns:
        The result of the splitter's ``split_documents(data)`` call.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

### Calculating Cost

In [7]:
def print_embedding_cost(texts, model="text-embedding-ada-002", price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for ``texts``.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to select the tiktoken encoding.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is the
            value previously hard-coded here.
            NOTE(review): confirm it against the provider's current pricing.

    Returns:
        None. The two summary lines are printed to stdout.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a throwaway list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f"Total tokens: {total_tokens}")
    print(f"Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}")

### Embedding and Uploading to a Vector Database (Pinecone)

In [8]:
# Same environment setup as the first cell of the notebook, repeated here
# (presumably so the Pinecone section can be run on its own — confirm, or
# drop the duplicate). The helpers below read PINECONE_API_KEY and
# PINECONE_ENV from os.environ.
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [9]:
def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is. Otherwise a new index is
    created and populated with embeddings of ``chunks``.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the module-level variable ``chunks`` is used, so existing
            calls that pass only ``index_name`` keep working.

    Returns:
        The LangChain ``Pinecone`` vector store bound to ``index_name``.

    Raises:
        ValueError: If the index does not exist and no chunks are available.
            This is raised before any index is created, so a failed call does
            not leave an empty index behind.
    """
    import os
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()

    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
        return vector_store

    # Backward compatibility: earlier callers relied on a notebook-global
    # `chunks`, so fall back to it when nothing was passed explicitly.
    if chunks is None:
        chunks = globals().get('chunks')
    if chunks is None:
        raise ValueError(
            f'Index {index_name} does not exist and no chunks were provided to populate it.'
        )

    print(f'Creating index {index_name} and embeddings ...', end='')
    # dimension=1536 is presumably the vector size of the default
    # OpenAIEmbeddings model — confirm if the embedding model is changed.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Ok')

    return vector_store

In [10]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default value ``'all'``
            is a sentinel meaning "delete every index returned by
            ``pinecone.list_indexes()``".

    Returns:
        None. Progress messages are printed to stdout.
    """
    import pinecone
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    # Single-index case first; everything below handles the 'all' sentinel.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('Ok')
        return

    existing = pinecone.list_indexes()
    print('Deleting all indexes ... ')
    for name in existing:
        pinecone.delete_index(name)
    print('Ok')

### Asking and Getting Answers

In [None]:
def ask_and_get_answer(vector_store, q):
    """Answer ``q`` with a RetrievalQA chain built on ``vector_store``.

    The chain uses ChatOpenAI ("gpt-3.5-turbo", temperature 1), the "stuff"
    chain type, and a similarity retriever configured with ``k=3``.

    Args:
        vector_store: Object providing ``as_retriever(...)``.
        q: The question text.

    Returns:
        The value returned by the chain's ``run(q)``.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
    )
    return qa_chain.run(q)

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask ``question`` through a ConversationalRetrievalChain and record the turn.

    Args:
        vector_store: Object providing ``as_retriever(...)``.
        question: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            A list passed in is appended to in place. When omitted, a fresh
            list is created for this call, so separate conversations no
            longer share history through a mutable default argument.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result mapping (its
        ``"answer"`` entry is what gets recorded) and the updated history.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A `[]` default would be created once and shared by every call.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=0.1)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})
    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))

    return result, chat_history

### Running Code

In [11]:
# Load a PDF straight from a URL; load_document picks the PDF loader because
# the URL ends in ".pdf".
data = load_document('https://www.govinfo.gov/content/pkg/CDOC-110hdoc50/pdf/CDOC-110hdoc50.pdf')
# print(data[1].page_content)
# print(data[10].metadata)

# data[20] is a hard-coded sample index: this assumes the document yields at
# least 21 entries.
print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page.')

Loading https://www.govinfo.gov/content/pkg/CDOC-110hdoc50/pdf/CDOC-110hdoc50.pdf
You have 85 pages in your data
There are 3683 characters in the page.


In [12]:
# Load a local .docx file. This rebinds `data`, replacing whatever an earlier
# cell stored under that name.
data = load_document('files/proposal.docx')
print(data[0].page_content)

Loading files/proposal.docx
Your Name

123 Bakersville

Bread County, 8888

youremail@gmail.com

www.yourwebsite.com







Tuesday, 5 May, 2020

Client’s Name

Owner

Company Name





Dear Name,

Thanks for discussing your business with me yesterday. I’ve compiled 3 options I believe will help achieve the business goals we’ve discussed.

Please review my proposal and let me know if you have any questions or comments. I will contact you next week Tuesday if I haven’t heard from you by then.



Regards,

Your Name





















PROJECT OVERVIEW

COMPANY (replace with the company name) would like to improve their existing website to focus on generating more free consultations which will result in more paying clients.

You recognize the importance of a conversion-centered website and thus the reason for a professional redesign with this purpose in mind.

Over 50% of your website visitors are viewing the website on their mobile device. The website will primarily be focused on a cle

In [13]:
# Load Wikipedia content for "Generative AI" (default language 'en'). This
# rebinds `data` again, so the chunking cell below works on this content.
data = load_from_wikipedia('Generative AI')
print(data[0].page_content)

Generative artificial intelligence (also generative AI or GenAI) is artificial intelligence capable of generating text, images, or other media, using generative models. Generative AI models learn the patterns and structure of their input training data and then generate new data that has similar characteristics.In the early 2020s, advances in transformer-based deep neural networks enabled a number of generative AI systems notable for accepting natural language prompts as input. These include large language model chatbots such as ChatGPT, Copilot, Bard, and LLaMA, and text-to-image artificial intelligence art systems such as Stable Diffusion, Midjourney, and DALL-E.Generative AI has uses across a wide range of industries, including art, writing, script writing, software development, product design, healthcare, finance, gaming, marketing, and fashion. Investment in generative AI surged during the early 2020s, with large companies such as Microsoft, Google, and Baidu as well as numerous sm

In [14]:
# Split the currently loaded `data` with the default chunk size.
# The global name `chunks` matters: insert_or_fetch_embeddings(index_name) is
# called later without passing documents and picks this variable up.
chunks = chunk_data(data)
print(len(chunks))
# chunks[5] is an arbitrary sample; assumes at least 6 chunks were produced.
print(chunks[5].page_content)

37
== History ==


In [15]:
# Print the token count and estimated embedding cost of the current chunks.
print_embedding_cost(chunks)

Total tokens: 1642
Embedding Cost in USD: 0.000657


In [16]:
# Destructive: with no argument this deletes every Pinecone index returned by
# pinecone.list_indexes() for the configured API key and environment.
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes ... 
Ok


In [17]:
# Create (or reuse) the 'askdocument' index. When the index has to be created,
# it is populated from the global `chunks` defined by the chunking cell.
index_name = 'askdocument'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index askdocument and embeddings ...Ok


In [19]:
# Ask a single question against the current vector store and print the answer.
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

The document appears to be discussing the history and development of the concept of artificial beings with human-like intelligence, including its exploration in myth, fiction, and philosophy throughout history, and how it became a scientific topic with Alan Turing's paper in 1950. It also mentions the raised philosophical and ethical arguments regarding the nature of the human mind and the consequences of creating such beings.


In [None]:
import time

# Interactive question loop: keeps prompting until the user types "quit" or
# "exit" (case-insensitive). Each other input is answered from the current
# `vector_store` via ask_and_get_answer.
print('Write Quit or Exit to quit')
i = 1
while True:
    q = input(f'Question #{i}: ')
    i += 1

    if q.lower() not in ['quit', 'exit']:
        answer = ask_and_get_answer(vector_store, q)
        print(f'\nAnswer: {answer}')
        print(f'\n {"-" * 50} \n')
        continue

    print('Quitting... bye bye!')
    time.sleep(2)
    break

Write Quit or Exit to quit
Question #1: How many amendments are in the us constitution?

Answer: The passage provided does not contain any information related to the number of amendments in the US Constitution. I'm sorry, but I don't know the answer to your question.

 -------------------------------------------------- 

Question #2: How many amendments are in the U.S. constitution?

Answer: There are 27 amendments in the U.S. Constitution.

 -------------------------------------------------- 



In [23]:
# Destructive: deletes every Pinecone index again before the next example.
# After this, a `vector_store` created earlier refers to a deleted index.
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [24]:
# Second example: load the 'ChatGPT' article with lang='ro', re-chunk it and
# build a new index. `data`, `chunks`, `index_name` and `vector_store` are all
# rebound here, so later cells query this index.
data = load_from_wikipedia('ChatGPT', 'ro')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index chatgpt and embeddings ...Ok


In [25]:
# Ask a question in Romanian ("What is ChatGPT?") against the current
# vector store and print the answer.
q = 'Ce este ChatGPT?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

ChatGPT este un sistem de asistență virtuală bazat pe inteligența artificială dezvoltat de OpenAI. Este o versiune îmbunătățită a modelului GPT-3, care a fost ajustată și optimizată pentru a oferi răspunsuri mai precise și mai relevante la întrebările și solicitările utilizatorilor. Este concepută pentru a simula o conversație reală cu un om și poate fi utilizată într-o varietate de scopuri, inclusiv pentru educație și suport tehnic.


In [None]:
# asking with memory
# Start a new conversation with an explicit, empty history list; the function
# returns the updated history, which is rebound to `chat_history` here.
# NOTE(review): if the cells are run top to bottom, `vector_store` at this
# point is the index built from the 'ChatGPT' (lang='ro') article, which looks
# unrelated to this question — confirm which index was intended.
chat_history = []
question = 'How many amendments are in the U.S constitution?'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)