<a href="https://colab.research.google.com/github/RaminParker/langchain_chrash_course/blob/main/Question_a_pdf_book.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Credits to this [YouTube video](https://www.youtube.com/watch?v=h0DHDp1FbmQ). Code is [here](https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/Ask%20A%20Book%20Questions.ipynb).

A possible app visualization is shown [here](https://youtu.be/ih9PBGVVOO4?t=121)

In [None]:
!pip -q install langchain openai tiktoken

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load your API keys

In [None]:
# Mount your Google Drive on your Google Colab runtime
from google.colab import drive
drive.mount('/content/drive/')


def _read_token(path):
    """Return the API token stored in the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the token has been read. Surrounding whitespace (typically the trailing
    newline a text editor appends) is stripped so it does not become part of
    the key that is sent to the API.
    """
    with open(path, "r") as token_file:
        return token_file.read().strip()


# OpenAI key
path_to_key = "/content/drive/MyDrive/Colab Notebooks/configs/openai/token.txt"
token_oai = _read_token(path_to_key)

# Pinecone key
path_to_key = "/content/drive/MyDrive/Colab Notebooks/configs/pinecone/token.txt"
PINECONE_API_KEY = _read_token(path_to_key)

In [None]:
# Settings consumed by the OpenAI and Pinecone clients further down.
OPENAI_API_KEY = token_oai
PINECONE_API_ENV = "us-west1-gcp-free"  # Pinecone environment name

In [None]:
import os

# Expose the OpenAI key as an environment variable as well -- presumably so
# that clients which look the key up in the environment can find it.
os.environ.update({"OPENAI_API_KEY": token_oai})

# Load your data

In [None]:
#!pip install unstructured
!pip install unstructured[local-inference]

In [None]:
!pip install pdfminer

In [None]:
# Loader for the TeamBank 2021 report PDF stored on the mounted Google Drive.
loader = UnstructuredPDFLoader(
    "/content/drive/MyDrive/Colab Notebooks/datasets/pdf/TeamBank_GB_2021.pdf"
)
# Alternative: load the PDF from its URL instead of from Drive.
# loader = OnlinePDFLoader("https://www.teambank.de/geschaeftsbericht2021/img/TeamBank_GB_2021.pdf")

In [None]:
# Run the loader configured above and keep the resulting documents in `data`.
data = loader.load()

In [None]:
# Sanity check: how many documents were loaded ...
print(f'You have {len(data)} document(s) in your data')
# ... and how long the text of the first document is.
print(f'There are {len(data[0].page_content)} characters in your document')


# Chunk your data up into smaller documents

In [None]:
# Break the loaded document(s) into chunks of size 1000 with no overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(data)

In [None]:
# Report how many chunks the splitter produced.
print(f'Now you have {len(texts)} documents')

In [None]:
# Peek at one chunk (index 3) to sanity-check the result of the split.
texts[3]

# Create embeddings of your documents to get ready for semantic search

In [None]:
!pip install pinecone-client

In [None]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

The [Pinecone](https://www.pinecone.io/) vector database makes it easy to build high-performance vector search applications. It is developer-friendly, fully managed, and easily scalable without infrastructure hassles.

In [None]:
# OpenAI embedding wrapper, authenticated with the key loaded earlier; it is
# handed to the Pinecone vector store when the index is populated.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

Create a Pinecone index first; see this [video tutorial](https://youtu.be/h0DHDp1FbmQ?t=393).

In [None]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchaintest" # put in the name of your pinecone index here

In [None]:
# Build the vector store from the plain text of every chunk, using the
# embedding wrapper and the Pinecone index configured above.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [None]:
# Example question (German: "How do we delight our customers?").
query = "Wie begeistern wir unsere Kunden?"
# Look up the stored chunks that are most similar to the question.
docs = docsearch.similarity_search(query=query, include_metadata=True)

In [None]:
# Display the documents returned by the similarity search above.
docs

# Query those docs to get your answer back

In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [None]:
# Completion model with temperature 0, authenticated with the key loaded earlier.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
# Question-answering chain of type "stuff" built on top of that model.
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [None]:
# Question (German: "How do we delight our customers?").
query = "Wie begeistern wir unsere Kunden?"
# Retrieve the chunks most similar to the question ...
docs = docsearch.similarity_search(query=query, include_metadata=True)
# ... and let the QA chain answer the question from those chunks.
chain.run(question=query, input_documents=docs)

In [None]:
# Question (German: "How is the overall economic development?").
query = "Wie ist die gesamtwirtschaftliche Entwicklung?"
# Retrieve the chunks most similar to the question ...
docs = docsearch.similarity_search(query=query, include_metadata=True)
# ... and let the QA chain answer the question from those chunks.
chain.run(question=query, input_documents=docs)

In [None]:
# Question (German: "What are the goals of TeamBank").
query = "Was sind die Ziele der TeamBank"
# Retrieve the chunks most similar to the question ...
docs = docsearch.similarity_search(query=query, include_metadata=True)
# ... and let the QA chain answer the question from those chunks.
chain.run(question=query, input_documents=docs)

In [None]:
# Question (German: "What is TeamBank's net commission income?").
query = "Wie ist das Provisionsergebnis der TeamBank?"
# Retrieve the chunks most similar to the question ...
docs = docsearch.similarity_search(query=query, include_metadata=True)
# ... and let the QA chain answer the question from those chunks.
chain.run(question=query, input_documents=docs)