# LangChain: Q&A over Documents (using azure)

* choose the good chain { https://python.langchain.com/v0.1/docs/modules/chains/ }
----> langchain chain is { from langchain.chains import RetrievalQA } : "This chain first does a retrieval step to fetch relevant documents, then passes those documents into an LLM to generate a response."
* if AZURE, instanciate llm { from langchain.chat_models import AzureChatOpenAI }

* load document { from langchain.document_loaders import CSVLoader }
    -> docs = CSVLoader(file_path="...").load()
* embeddings { from langchain.embeddings import AzureOpenAIEmbeddings }
             { from langchain.embeddings import OpenAIEmbeddings }
    -> embeddings = OpenAIEmbeddings() { can call embeddings.embed_query("...) }
* set up a retreiver (vectostore) to fetch document { from langchain.vectorstores import DocArrayInMemorySearch }
    -> db = DocArrayInMemorySearch.from_documents(docs,embeddings)
    -> retriever = db.as_retriever()
* instanciate LLM
    -> llm = ChatOpenAI(temperature = 0.0, model=llm_model)
* all together
----> qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True)
====> qa_stuff.run("...)

In [109]:
# load dataset Luca
import pandas as pd
data = pd.read_csv("data/gutenberg_ebooks.csv")
data.shape

(19929, 17)

In [110]:
# filter for books in english
data = data.loc[data.Language == 'English']
data.shape
# take out books without summary
data = data.loc[~data.Summary.isnull()]
# randomly select 5000 books
import random
rdm_idxs = [random.randint(0,data.shape[0]) for i in range(5000)]
data_to_use = data.iloc[rdm_idxs,:]
data_to_use.shape

(5000, 17)

In [111]:
# load data
data = pd.read_csv("data/gutenberg_ebooks.csv")
data.shape

(19929, 17)

In [112]:
# filter data to thoose with a summary
data = data.loc[~data.Summary.isnull()]
data.shape

(19110, 17)

In [113]:
from langchain_community.document_loaders import DataFrameLoader

loader = DataFrameLoader(data, page_content_column="Summary")
docs = loader.load()

In [114]:
import os
# from openai import AzureOpenAI

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY_4")
azure_openai_api_endpoint = os.getenv("AZURE_OPENAI_API_ENDPOINT_4")
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME_4")

In [115]:
# embeddings
from langchain.embeddings import AzureOpenAIEmbeddings 
# from langchain.embeddings import OpenAIEmbeddings 
# embeddings = OpenAIEmbeddings() 
# embeddings.embed_query(docs[0].page_content)


# USE AZURE embeddings
embeddings = AzureOpenAIEmbeddings() 
embeddings.embed_query(docs[0].page_content)



[0.00461031464982022,
 0.0013799411694363094,
 -0.011305062468682152,
 -0.008636457195676834,
 -0.0015193459607296793,
 0.027641973427659377,
 -0.008218242123304772,
 0.015480566342971154,
 -0.039458186720140336,
 -0.04036099595223066,
 0.00837756255001334,
 0.0282526975390905,
 -0.00031594263720756914,
 -0.015467290339237395,
 -0.010820464583673473,
 0.0097848869323421,
 0.045804420116047004,
 -0.007156111067521973,
 0.0014338775539495374,
 -0.03146564690485814,
 -0.004248526085776407,
 -0.0080257312216005,
 -0.01028276082108454,
 -0.0050783163660084426,
 -0.0005352146865118395,
 -0.005496530507057901,
 0.025570816262351425,
 -0.03189049765079057,
 0.02075139500766778,
 -0.015772652394952957,
 -0.01777742674762429,
 -0.0203398193341465,
 -0.014033413018118508,
 -0.030323854704398454,
 -0.021016928120859455,
 -0.008112029436821664,
 -0.010355782101249341,
 -0.007468112056427021,
 0.020525690836999986,
 0.001176642525168089,
 0.014152902639657981,
 -0.0067080242882569955,
 -0.0064258954

In [116]:
# set up a retreiver (vectostore) to fetch document 
from langchain.vectorstores import DocArrayInMemorySearch 
db = DocArrayInMemorySearch.from_documents(docs,embeddings)
retriever = db.as_retriever()

In [None]:
## LCEL

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


qa_chain = (
    {
        "context": vectorstore.as_retriever() | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

qa_chain.invoke("What are autonomous agents?")

In [117]:
# instanciate LLM
from langchain.chat_models import AzureChatOpenAI

llm = AzureChatOpenAI(api_key=azure_openai_api_key,
                      azure_deployment=deployment_name,
                      api_version="2023-12-01-preview",
                      azure_endpoint=azure_openai_api_endpoint,
                      temperature=0.9
                      )


In [118]:
# make request
from langchain.chains import RetrievalQA
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True)

qa_stuff.run("what are 5 differents topics of the ensemble of documents?")

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "what are 5 differents topics of the ensemble of documents?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "what are 5 differents topics of the ensemble of documents?",
  "context": "\"The Elson Readers, Book 5\" by William H. Elson and Christine M. Keck is an educational reader designed for fifth-grade students, likely written in the early 20th century. This anthology emphasizes quality and variety in children's literature, incorporating American and British classics, notable poems, adventure stories, and ethical themes suitable for classroom use. The overarching goal appears to be enriching children's appreciation for literature through a curated select

'The ensemble of documents covers the following topics:\n\n1. **Children\'s Literature and Education** - "The Elson Readers, Book 5" focuses on enriching children\'s appreciation for literature through a curated selection of engaging texts.\n\n2. **Classical Music Analysis** - "Symphony No. 5 in C minor Opus 67" explores the structure, significance, and elements of Beethoven\'s Fifth Symphony.\n\n3. **Philosophical Dialogue and Religion** - "Colloquium heptaplomeres de rerum sublimium arcanis abditis" delves into existential questions through a dialogue among characters with diverse religious and philosophical perspectives.\n\n4. **Norwegian Cultural and Literary History** - "Gesammelte Werke in fünf Bänden — 1. Band" by Bjørnstjerne Bjørnson presents a collection of poetry and narratives reflecting themes in Norwegian cultural history.\n\n5. **Ethical and Moral Development in Literature** - "The Elson Readers, Book 5" also emphasizes narratives that encourage imagination and moral dev

In [119]:
# check the content of the summary
data_to_use.loc[data_to_use.Title == 'The Lost City','Summary'].values

array([], dtype=object)

In [120]:
qa_stuff.run("do quoi parle le livre 'The Lost City'")

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "do quoi parle le livre 'The Lost City'"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "do quoi parle le livre 'The Lost City'",
  "context": "\"The Lost City\" by Jos. E. Badger, Jr. is a novel written during the late 19th century. The story revolves around Professor Phaeton Featherwit and his two nephews, Waldo and Bruno, as they embark on thrilling adventures that include the exploration of a mysterious city within the Olympic Mountains and the unraveling of nature's marvelous phenomena, including being caught inside a tornado.  The opening of the book introduces the main characters engaging in light banter while observing an ominous storm brewing in t

'Le livre "The Lost City" de Jos. E. Badger, Jr. raconte l\'histoire de l\'exploration d\'une ville mystérieuse dans les Montagnes Olympiques par le Professeur Phaeton Featherwit et ses deux neveux, Waldo et Bruno. Ils se lancent dans des aventures palpitantes, notamment en étant pris à l\'intérieur d\'une tornade. Le récit mêle humour, intrigue scientifique et aventure alors que les personnages utilisent leur machine volante pour naviguer dans des terrains inconnus et potentiellement dangereux.'

In [121]:
qa_stuff.run("who are the personnage and the author of the book 'the lost city'")

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "who are the personnage and the author of the book 'the lost city'"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "who are the personnage and the author of the book 'the lost city'",
  "context": "\"The Lost City\" by Jos. E. Badger, Jr. is a novel written during the late 19th century. The story revolves around Professor Phaeton Featherwit and his two nephews, Waldo and Bruno, as they embark on thrilling adventures that include the exploration of a mysterious city within the Olympic Mountains and the unraveling of nature's marvelous phenomena, including being caught inside a tornado.  The opening of the book introduces the main characters engaging in ligh

'The characters in "The Lost City" are Professor Phaeton Featherwit and his two nephews, Waldo and Bruno. The author of the book is Jos. E. Badger, Jr.'

# LangChain: Evaluation

* create 5 pairs of QA with lanchain { from langchain.evaluation.qa import QAGenerateChain }
----> examples = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model)).  
                  apply_and_parse( [{"doc": t} for t in data[:5]] )
====> QAGenerateChain.from_llm(ChatOpenAI(model=llm_model)).prompt.template :: shwos the prompt used to generate questions and answers
* instanciate choosen chain
    -> qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    chain_type_kwargs = {"document_separator": "<<<<>>>>>"})

* manual evaluation with langchain.debug = True ( ca.tokens )
----> qa.run(examples[0]["query"]) -- generate context by itself --
* generate examples
----> predictions = qa.apply(examples)
* instanciate QA tester { from langchain.evaluation.qa import QAEvalChain }
====> predictions = QAEvalChain.from_llm(llm).evaluate(examples, predictions)

In [122]:
# generate examples
from langchain.evaluation.qa import QAGenerateChain
examples = QAGenerateChain.from_llm(llm).apply_and_parse( [{"doc": t} for t in docs[:5]] )
examples

[32;1m[1;3m[chain/start][0m [1m[chain:QAGenerateChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[llm/start][0m [1m[chain:QAGenerateChain > llm:AzureChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: You are a teacher coming up with questions to ask on a quiz. \nGiven the following document, please generate a question and answer based on that document.\n\nExample Format:\n<Begin Document>\n...\n<End Document>\nQUESTION: question here\nANSWER: answer here\n\nThese questions should be detailed and be based explicitly on information in the document. Begin!\n\n<Begin Document>\npage_content='\"L'Assommoir\" by Émile Zola is a novel written during the late 19th century, an era characterized by the realism movement in literature. The book explores the struggles of Gervaise, a laundress trying to build a life for herself and her children amidst the oppressive and often brutal conditions of working-class Paris. The story highlights themes of poverty,



[36;1m[1;3m[llm/end][0m [1m[chain:QAGenerateChain > llm:AzureChatOpenAI] [12.40s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "QUESTION: What are the main themes explored in Émile Zola's \"L'Assommoir,\" and how do they relate to the struggles of the protagonist, Gervaise?\n\nANSWER: The main themes explored in \"L'Assommoir\" include poverty, domestic strife, and the impact of alcoholism on individuals and families. These themes relate to the struggles of the protagonist, Gervaise, as she attempts to build a life for herself and her children in the oppressive conditions of working-class Paris. The narrative highlights her emotional turmoil, the challenges of single motherhood, and the complexities of her relationship with her unfaithful and irresponsible partner, Lantier. These themes underscore the difficulties Gervaise faces in her quest for survival and hope in a challenging social environment.",
        "generation_info": {
          "fi



[{'qa_pairs': {'query': 'What are the main themes explored in Émile Zola\'s "L\'Assommoir," and how do they relate to the struggles of the protagonist, Gervaise?',
   'answer': 'The main themes explored in "L\'Assommoir" include poverty, domestic strife, and the impact of alcoholism on individuals and families. These themes relate to the struggles of the protagonist, Gervaise, as she attempts to build a life for herself and her children in the oppressive conditions of working-class Paris. The narrative highlights her emotional turmoil, the challenges of single motherhood, and the complexities of her relationship with her unfaithful and irresponsible partner, Lantier. These themes underscore the difficulties Gervaise faces in her quest for survival and hope in a challenging social environment.'}},
 {'qa_pairs': {'query': "What are some of the thematic explorations found in Alfred Lord Tennyson's early poems as discussed in John Churton Collins' critical collection?",
   'answer': "The t

#### manual debug

In [123]:
import langchain
langchain.debug = True #( ca.tokens )

In [124]:
qa_stuff.run(examples[0]['qa_pairs']['query'])

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What are the main themes explored in Émile Zola's \"L'Assommoir,\" and how do they relate to the struggles of the protagonist, Gervaise?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What are the main themes explored in Émile Zola's \"L'Assommoir,\" and how do they relate to the struggles of the protagonist, Gervaise?",
  "context": "\"L'Assommoir\" by Émile Zola is a novel written during the late 19th century, known for its vivid exploration of working-class life in Paris. The story centers around Gervaise, a washerwoman, who grapples with love, abandonment, and the harsh realities of poverty amidst the struggles of family life and societal expectatio

'"L\'Assommoir" by Émile Zola explores several main themes:\n\n1. **Poverty**: The novel vividly depicts the harsh conditions of working-class life in Paris. Gervaise\'s struggle to provide for her family amidst economic hardship is central to the narrative.\n\n2. **Alcoholism**: Zola critiques the destructive impact of alcoholism on individuals and families. This theme is intertwined with Gervaise\'s experiences as she witnesses the devastation it brings.\n\n3. **Domestic Strife**: Gervaise faces personal and emotional turmoil through her relationships, particularly with her partner Lantier, highlighting issues of love, betrayal, and abandonment.\n\n4. **Societal Critique**: The novel reflects on societal expectations and class struggles, portraying the systemic issues that hinder the protagonist’s attempts to improve her life.\n\nThese themes are deeply connected to Gervaise\'s struggles as she navigates love, family, and societal pressures while battling despair and holding onto hop

In [125]:
langchain.debuf = False

#### automatic debug

In [126]:
ex = [a['qa_pairs'] for a in examples]

In [127]:
predictions = qa_stuff.apply(ex)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What are the main themes explored in Émile Zola's \"L'Assommoir,\" and how do they relate to the struggles of the protagonist, Gervaise?",
  "answer": "The main themes explored in \"L'Assommoir\" include poverty, domestic strife, and the impact of alcoholism on individuals and families. These themes relate to the struggles of the protagonist, Gervaise, as she attempts to build a life for herself and her children in the oppressive conditions of working-class Paris. The narrative highlights her emotional turmoil, the challenges of single motherhood, and the complexities of her relationship with her unfaithful and irresponsible partner, Lantier. These themes underscore the difficulties Gervaise faces in her quest for survival and hope in a challenging social environment."
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:


In [128]:
# instanciate QA tester 
from langchain.evaluation.qa import QAEvalChain 
predictions = QAEvalChain.from_llm(llm).evaluate(ex, predictions)

predictions

[32;1m[1;3m[chain/start][0m [1m[chain:QAEvalChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "query": "What are the main themes explored in Émile Zola's \"L'Assommoir,\" and how do they relate to the struggles of the protagonist, Gervaise?",
      "answer": "The main themes explored in \"L'Assommoir\" include poverty, domestic strife, and the impact of alcoholism on individuals and families. These themes relate to the struggles of the protagonist, Gervaise, as she attempts to build a life for herself and her children in the oppressive conditions of working-class Paris. The narrative highlights her emotional turmoil, the challenges of single motherhood, and the complexities of her relationship with her unfaithful and irresponsible partner, Lantier. These themes underscore the difficulties Gervaise faces in her quest for survival and hope in a challenging social environment.",
      "result": "The main themes in Émile Zola's \"L'Assommoir\" include poverty, d



[{'results': 'CORRECT'},
 {'results': 'CORRECT'},
 {'results': 'CORRECT'},
 {'results': 'CORRECT'},
 {'results': 'CORRECT'}]

In [129]:
examples[0]['qa_pairs']

{'query': 'What are the main themes explored in Émile Zola\'s "L\'Assommoir," and how do they relate to the struggles of the protagonist, Gervaise?',
 'answer': 'The main themes explored in "L\'Assommoir" include poverty, domestic strife, and the impact of alcoholism on individuals and families. These themes relate to the struggles of the protagonist, Gervaise, as she attempts to build a life for herself and her children in the oppressive conditions of working-class Paris. The narrative highlights her emotional turmoil, the challenges of single motherhood, and the complexities of her relationship with her unfaithful and irresponsible partner, Lantier. These themes underscore the difficulties Gervaise faces in her quest for survival and hope in a challenging social environment.'}