### LangChain intro - get SOURCES when asking a LLM

This notebook accompanies the [YouTube video](https://www.youtube.com/watch?v=v-TPkKR2Ltk) "LangChain intro - get SOURCES when asking a LLM". It introduces the reader to indexing a set of PDFs so that an LLM can be queried with questions about them and return the sources in addition to the answers.

In [None]:
_ = !pip install langchain
_ = !pip install unstructured
_ = !pip install openai
_ = !pip install pybind11
_ = !pip install chromadb
_ = !pip install Cython
_ = !pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI"
_ = !pip install unstructured[local-inference]
_ = !pip install 'git+https://github.com/facebookresearch/detectron2.git'
# on Mac
#_ = !CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git'
_ = !pip install layoutparser[layoutmodels,tesseract]
_ = !pip install pytesseract

In [None]:
import itertools
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import VectorDBQAWithSourcesChain
from langchain import OpenAI

In [None]:
import os
from getpass import getpass

# Never paste the key into the notebook itself: a saved notebook would leak it.
# A key that is already present in the environment is kept as-is; otherwise it is
# requested interactively (the input is not echoed and not stored in the notebook).
# An API key can be created after signing up at https://openai.com/
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# detectron2 has to be told explicitly (through its configuration) when it runs on the CPU.
# NOTE(review): `cfg` is not referenced by any later cell visible in this notebook -
# confirm that the PDF loading step actually picks this setting up.
from detectron2.config import get_cfg

cfg = get_cfg()
cfg.MODEL.DEVICE = "cpu"  # a GPU is recommended when one is available

In [None]:
# make and populate folder with pdf files to search on
!mkdir input_pdfs
!ls input_pdfs/

In [None]:
text_folder = 'input_pdfs'

# Only pick up entries ending in .pdf: os.listdir() returns everything in the folder,
# including hidden files such as .DS_Store. Sorting makes the load order reproducible,
# because os.listdir() returns names in arbitrary order.
pdf_names = sorted(fn for fn in os.listdir(text_folder) if fn.lower().endswith('.pdf'))

# Fail here with a clear message instead of with an IndexError further down.
if not pdf_names:
    raise FileNotFoundError(f"No PDF files found in '{text_folder}' - add some before continuing.")

# One loader per PDF file.
loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in pdf_names]

In [None]:
# Run every loader; this is the slow part of the notebook.
# `documents` holds one load() result per loader at this point.
documents = [loader.load() for loader in loaders]

In [None]:
# Flatten the per-loader results into one flat list of documents.
# Items that are not lists/tuples are passed through unchanged, so re-running this
# cell on an already flattened list does not try to iterate over the documents themselves.
documents = list(itertools.chain.from_iterable(
    item if isinstance(item, (list, tuple)) else [item] for item in documents
))

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, no overlap between chunks).
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
docs = text_splitter.split_documents(documents)

In [None]:
# look at content of a document
# (the first chunk produced by the text splitter is shown as the cell output)
docs[0]

In [None]:
# Embeddings object, created with default settings; it is handed to the vector store
# together with the chunks.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment variable - confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Build the vector store from the chunks and the embeddings.
# persist_directory is optional: it names the local folder in which the store is
# kept so that it can be re-used later.
vector_store = Chroma.from_documents(
    docs,
    embeddings,
    persist_directory='chroma_db_folder',
)

In [None]:
# Question-answering chain with sources, backed by the vector store built above.
# The OpenAI LLM is created with temperature=0; the chain type is "stuff".
chain = VectorDBQAWithSourcesChain.from_chain_type(
    OpenAI(temperature=0),
    chain_type="stuff",
    vectorstore=vector_store,
)

In [None]:
# Query the chain; the result is displayed as the cell output.
question = "What are recent developments in the field of photovoltaics"
chain({"question": question}, return_only_outputs=True)

In [None]:
# Query the chain; the result is displayed as the cell output.
question = "How are indoor photovoltaic applications characterized?"
chain({"question": question}, return_only_outputs=True)

In [None]:
# Open-ended follow-up on the same topic as the previous question.
question = "Tell me more about indoor photovoltaic applications"
chain({"question": question}, return_only_outputs=True)

In [None]:
# Query the chain; the result is displayed as the cell output.
question = "Tell me about perovskite solar cell devices"
chain({"question": question}, return_only_outputs=True)

In [None]:
# Question unrelated to photovoltaics - presumably a check of how the chain
# responds when the indexed PDFs cannot answer it.
question = "Who is the Prime Minister of India?"
chain({"question": question}, return_only_outputs=True)

In [None]:
# Another question unrelated to photovoltaics - presumably a check of how the chain
# responds when the indexed PDFs cannot answer it.
question = "Who is the President of the United States of America"
chain({"question": question}, return_only_outputs=True)