# LangChain: Q&A over Documents

In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Read the local .env file (located with find_dotenv); the return value is
# deliberately discarded.
_ = load_dotenv(
    find_dotenv(),
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain import HuggingFacePipeline

from IPython.display import display, Markdown

In [None]:
# Build a vector index directly from the catalog CSV and ask it one question.
file = "OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file_path=file)

# The index is backed by DocArrayInMemorySearch and uses a HuggingFaceEmbeddings
# instance created with its default arguments.
index = VectorstoreIndexCreator(
    embedding=HuggingFaceEmbeddings(),
    vectorstore_cls=DocArrayInMemorySearch,
).from_loaders([loader])

query = (
    "Please list all your shirts with sun protection "
    "in a table in markdown and summarize each one."
)

# No `llm` argument is passed, so the answer comes from the library's default
# model. NOTE(review): presumably an OpenAI model that needs the API key loaded
# from .env — confirm.
response = index.query(query)

# Render the answer as Markdown in the notebook output.
display(Markdown(response))

In [None]:
# Load the catalog CSV into a list of documents and inspect the first one.
file = "OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file_path=file)
documents = loader.load()

# Bare expression: shown as the cell's output when run in a notebook.
documents[0]

In [None]:
# Create the embedding model and sanity-check it on one sentence.
import torch

# Alternative backend, kept for reference:
# embeddings = OpenAIEmbeddings()
model_name = "sentence-transformers/all-mpnet-base-v2"

# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell also runs on machines without CUDA instead of raising at model load.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Embed a sample query and print the vector length plus its first five values.
embed = embeddings.embed_query("Hi my name is Harrison")
print(len(embed))
print(embed[:5])

In [None]:
# Optional pre-processing step, currently disabled: split the documents into
# chunks before indexing.
# text_splitter = CharacterTextSplitter(chunk_size=700, chunk_overlap=0)
# docs = text_splitter.split_documents(documents)

# Build the in-memory vector store from the loaded documents using the
# embedding model created above.
db = DocArrayInMemorySearch.from_documents(documents, embeddings)

In [None]:
# Run a similarity search against the vector store for a free-text request.
query = "Please suggest a shirt with sunblocking"
docs = db.similarity_search(query)

# Bare expression: shown as the cell's output when run in a notebook.
docs

In [None]:
# Load a local Hugging Face model as the LLM used by the chains below.
import torch

# task supports "text-generation", "text2text-generation", "summarization"
llm = HuggingFacePipeline.from_model_id(
    model_id="facebook/opt-125m",
    task="text-generation",
    # Device 0 is the first CUDA GPU; -1 selects the CPU. Fall back to the CPU
    # when no GPU is present so the cell does not fail on CPU-only machines.
    device=0 if torch.cuda.is_available() else -1,
    model_kwargs={"temperature": 0, "is_decoder": True, "max_length": 100},
)

In [None]:
# llm = ChatOpenAI(temperature = 0.0)

# qdocs = "".join([docs[i].page_content for i in range(len(docs))])
# response = llm.call_as_llm(f"{qdocs} Question: Please list all your \
# shirts with sun protection in a table in markdown and summarize each one.") 

# display(Markdown(response))

In [None]:
# Wrap the vector store as a retriever and fetch the documents it considers
# relevant to the question.
retriever = db.as_retriever()
docs = retriever.get_relevant_documents(
    "List all shirts with sun protection in a table in markdown and summarize each one."
)

# Bare expression: shown as the cell's output when run in a notebook.
docs

In [None]:
# Assemble a retrieval QA chain from the local LLM and the retriever.
# Other chain types the original author noted: map_reduce, refine, map_rerank.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

query = (
    "Please list all your shirts with sun protection in a table "
    "in markdown and summarize each one."
)

# Run the chain on the question and render the answer as Markdown.
response = qa_stuff.run(query)
display(Markdown(response))

In [None]:
# Rebuild the index, this time with the explicitly configured embedding model,
# reusing the `loader` created in an earlier cell.
index = VectorstoreIndexCreator(
    embedding=embeddings,
    vectorstore_cls=DocArrayInMemorySearch,
).from_loaders([loader])

# Ask the same `query` as the previous cell, passing the local LLM explicitly.
response = index.query(query, llm=llm)
display(Markdown(response))