# LangChain: Q&A over Documents

In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Read the local .env file located by find_dotenv(); the result is bound to
# `_` only so that it does not show up as cell output.
_ = load_dotenv(dotenv_path=find_dotenv())

In [1]:
import torch
from IPython.display import display, Markdown

from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch

In [None]:
# One-shot pipeline: build an in-memory vector index directly from the catalog
# CSV, ask it a single question and render the answer as Markdown.
file = "OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file_path=file)

index_creator = VectorstoreIndexCreator(
    embedding=HuggingFaceEmbeddings(),
    vectorstore_cls=DocArrayInMemorySearch,
)
index = index_creator.from_loaders([loader])

query = (
    "Please list all your shirts with sun protection "
    "in a table in markdown and summarize each one."
)

# NOTE(review): no `llm` is passed here, unlike the later
# `index.query(query, llm=llm)` call — presumably this falls back to the
# library's default LLM; confirm that is intended.
response = index.query(query)
display(Markdown(response))

In [2]:
# Load the catalog CSV through a CSVLoader and keep the resulting documents;
# the trailing expression displays the first one as the cell output.
file = "OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file)
documents = loader.load()
documents[0]

Document(page_content=": 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regular shoe size. For half sizes not offered, order up to next whole size. \n\nSpecs: Approx. weight: 1 lb.1 oz. per pair. \n\nConstruction: Soft canvas material for a broken-in feel and look. Comfortable EVA innersole with Cleansport NXT® antimicrobial odor control. Vintage hunt, fish and camping motif on innersole. Moderate arch contour of innersole. EVA foam midsole for cushioning and support. Chain-tread-inspired molded rubber outsole with modified chain-tread pattern. Imported. \n\nQuestions? Please contact us for any inquiries.", metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 0})

In [3]:
# Build the embedding model used by the vector stores below.
# embeddings = OpenAIEmbeddings()
model_name = "sentence-transformers/all-mpnet-base-v2"
# Use the GPU when CUDA is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without a CUDA device.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Sanity check: embed one short sentence, then print the vector length and its
# first five components.
embed = embeddings.embed_query("Hi my name is Harrison")
print(len(embed))
print(embed[:5])

768
[0.0333668515086174, -0.004217796493321657, 0.0055653126910328865, -0.007028068881481886, 0.010666920803487301]


In [4]:
# Split the loaded documents with a 700-character target and no overlap. The
# warnings this cell emits show that some chunks still end up longer than 700.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=700)
docs = text_splitter.split_documents(documents)

# Embed every chunk and store it in an in-memory vector store.
db = DocArrayInMemorySearch.from_documents(documents=docs, embedding=embeddings)

Created a chunk of size 797, which is longer than the specified 700
Created a chunk of size 720, which is longer than the specified 700
Created a chunk of size 741, which is longer than the specified 700
Created a chunk of size 938, which is longer than the specified 700
Created a chunk of size 853, which is longer than the specified 700
Created a chunk of size 777, which is longer than the specified 700
Created a chunk of size 913, which is longer than the specified 700
Created a chunk of size 705, which is longer than the specified 700
Created a chunk of size 871, which is longer than the specified 700
Created a chunk of size 706, which is longer than the specified 700
Created a chunk of size 779, which is longer than the specified 700
Created a chunk of size 1005, which is longer than the specified 700
Created a chunk of size 792, which is longer than the specified 700
Created a chunk of size 829, which is longer than the specified 700
Created a chunk of size 705, which is longer th

In [5]:
# Query the vector store directly for the chunks most similar to the request.
# Note that this rebinds `docs` (previously the split chunks) to the search hits.
query = "Please suggest a shirt with sunblocking"
docs = db.similarity_search(query=query)
docs

[Document(page_content=': 255\nname: Sun Shield Shirt by\ndescription: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. \n\nSize & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.\n\nFabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.\n\nAdditional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.', metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 255}),
 Document(page_content=": 679\nname: Women's Tropical Tee, Sleeveless\ndescription: Our five-star sleeveless button-up shirt has a fit to flatter and SunSmart™ protection to block the sun’s harmful UV rays. Size & Fit: Slightly Fitted: Softly shapes the body. Falls at hip. Fabric & Care: Shell: 71% nylon, 29% polyester. Cape lining: 100% polyester. Built-in SunSmart™ U

In [6]:
# Local Hugging Face pipeline used as the LLM for the QA chains below.
# task supports "text-generation", "text2text-generation", "summarization"
# NOTE(review): roberta-large is loaded here as a decoder (is_decoder=True) for
# text generation — presumably a model trained for generation would answer
# better; confirm the model choice.
llm = HuggingFacePipeline.from_model_id(
    model_id="roberta-large",
    task="text-generation",
    # Run on GPU 0 when CUDA is available; -1 selects the CPU so the notebook
    # still works on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
    model_kwargs={"temperature": 0, "is_decoder": True, "max_length": 1000},
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# llm = ChatOpenAI(temperature = 0.0)

# qdocs = "".join([docs[i].page_content for i in range(len(docs))])
# response = llm.call_as_llm(f"{qdocs} Question: Please list all your \
# shirts with sun protection in a table in markdown and summarize each one.") 

# display(Markdown(response))

In [7]:
# Wrap the vector store in a retriever and preview what it returns for the
# question that the QA chain will be asked.
retriever = db.as_retriever()
retrieval_query = (
    "List all shirts with sun protection in a table "
    "in markdown and summarize each one."
)
docs = retriever.get_relevant_documents(retrieval_query)
docs

[Document(page_content=": 679\nname: Women's Tropical Tee, Sleeveless\ndescription: Our five-star sleeveless button-up shirt has a fit to flatter and SunSmart™ protection to block the sun’s harmful UV rays. Size & Fit: Slightly Fitted: Softly shapes the body. Falls at hip. Fabric & Care: Shell: 71% nylon, 29% polyester. Cape lining: 100% polyester. Built-in SunSmart™ UPF 50+ rated – the highest rated sun protection possible. Machine wash and dry. Additional Features: Updated design with smoother buttons. Wrinkle resistant. Low-profile pockets and side shaping offer a more flattering fit. Front and back cape venting. Two front pockets, tool tabs and eyewear loop. Imported. Sun Protection That Won't Wear Off: Our high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun's harmful rays.", metadata={'source': 'OutdoorClothingCatalog_1000.csv', 'row': 679}),
 Document(page_content=': 255\nname: Sun Shield Shirt by\ndescription: "Block the sun, not the fun – our high-p

In [None]:
# Question to answer from the retrieved catalog entries.
query = (
    "Please list all your shirts with sun protection in a table "
    "in markdown and summarize each one."
)

# "stuff" chain type is used here; other options: map_reduce, refine, map_rerank.
qa_stuff = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    verbose=True,
)

response = qa_stuff.run(query)
display(Markdown(response))

In [None]:
# Same flow in one step: build an index from the loader with the custom
# embeddings and splitter settings, then query it with the local LLM.
# Relies on `loader`, `embeddings`, `query` and `llm` from earlier cells.
splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=700)
index_creator = VectorstoreIndexCreator(
    embedding=embeddings,
    text_splitter=splitter,
    vectorstore_cls=DocArrayInMemorySearch,
)
index = index_creator.from_loaders([loader])

response = index.query(query, llm=llm)
display(Markdown(response))

: 