In [None]:
!pip install langchain
!pip install openai
!pip install tiktoken
!pip install faiss-gpu
!pip install langchain_experimental
!pip install "langchain[docarray]"
!pip install pylcs
!pip3 install pypdf

In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import  PyPDFLoader
from langchain.vectorstores import  FAISS
from langchain.text_splitter import  RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings  # Remove HuggingFaceInstructEmbeddings if not used
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import LLMChain
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from time import monotonic


import pylcs
import textwrap
import os
import re
import copy
import tiktoken
import ast
import PyPDF2
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [3]:
# Prompt the user for their OpenAI API key
api_key = input("Please enter your OpenAI API key: ")

In [4]:
# Set the API key as an environment variable
os.environ["OPENAI_API_KEY"] = api_key
# Optionally, check that the environment variable was set correctly
print("OPENAI_API_KEY has been set!")

OPENAI_API_KEY has been set!


In [5]:
hp_pdf_path ="Harry_Potter_Book_1_The_Sorcerers_Stone.pdf"

In [7]:
def split_into_chapters(book_path):
    with open(book_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        documents = pdf_reader.pages
        text = " ".join([doc.extract_text() for doc in documents])

        # Adjust the pattern as needed based on the chapter titles' formatting
        chapters = re.split(r'(CHAPTER\s[A-Z]+(?:\s[A-Z]+)*)', text)

        # Create Document objects with only chapter metadata
        chapter_docs = []
        chapter_num = 1
        for i in range(1, len(chapters), 2):
            chapter_text = chapters[i] + chapters[i + 1]
            doc = Document(page_content=chapter_text, metadata={"chapter": chapter_num})
            chapter_docs.append(doc)
            chapter_num += 1

    return chapter_docs

In [8]:
chapters = split_into_chapters(hp_pdf_path) # with additional metadata of chapter

In [9]:
print(len(chapters))

17


In [10]:
def replace_t_with_space(list_of_documents):
    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace('\t', ' ')
    return list_of_documents

In [11]:
chapters = replace_t_with_space(chapters)

In [12]:
summarization_prompt_template = """Write an extensive summary of about of the following:

{text}

SUMMARY:"""

summarization_prompt = PromptTemplate(template=summarization_prompt_template, input_variables=["text"])

In [13]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [14]:
def replace_double_lines_with_one_line(text):
  cleaned_text = re.sub(r'\n\n', '\n', text)
  return cleaned_text

In [15]:
def create_chapter_summary(chapter):
  chapter_txt = chapter.page_content
  model_name = "gpt-3.5-turbo-0125"
  llm = ChatOpenAI(temperature=0, model_name=model_name)

  gpt_35_turbo_max_tokens = 16000
  verbose = False

  num_tokens = num_tokens_from_string(chapter_txt, model_name)

  if num_tokens < gpt_35_turbo_max_tokens:
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=summarization_prompt, verbose=verbose)
  else:
    chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=summarization_prompt, combine_prompt=summarization_prompt, verbose=verbose)

  start_time = monotonic()
  doc_chapter = Document(page_content=chapter_txt)
  summary = chain.invoke([doc_chapter])


  print(f"Chain type: {chain.__class__.__name__}")
  print(f"Run time: {monotonic() - start_time}")
  summary = replace_double_lines_with_one_line(summary["output_text"])
  doc_summary = Document(page_content=summary, metadata=chapter.metadata)
  return doc_summary

In [18]:
chapter_summaries = []
for chapter in chapters:
    chapter_summaries.append(create_chapter_summary(chapter))

Chain type: StuffDocumentsChain
Run time: 5.765999999945052
Chain type: StuffDocumentsChain
Run time: 5.077999999979511
Chain type: StuffDocumentsChain
Run time: 9.577999999979511
Chain type: StuffDocumentsChain
Run time: 5.10999999998603
Chain type: StuffDocumentsChain
Run time: 5.60999999998603
Chain type: StuffDocumentsChain
Run time: 5.968999999924563
Chain type: StuffDocumentsChain
Run time: 5.75
Chain type: StuffDocumentsChain
Run time: 7.765999999945052
Chain type: StuffDocumentsChain
Run time: 5.125
Chain type: StuffDocumentsChain
Run time: 7.109000000054948
Chain type: StuffDocumentsChain
Run time: 7.468999999924563
Chain type: StuffDocumentsChain
Run time: 6.655999999959022
Chain type: StuffDocumentsChain
Run time: 7.547000000020489
Chain type: StuffDocumentsChain
Run time: 8.780999999959022
Chain type: StuffDocumentsChain
Run time: 6.420999999972992
Chain type: StuffDocumentsChain
Run time: 7.26500000001397
Chain type: StuffDocumentsChain
Run time: 5.422000000020489


In [19]:
def encode_book(path,chunk_size=1000,chunk_overlap=200):
    loader = PyPDFLoader(path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len)
    texts = text_splitter.split_documents(documents)
    cleaned_texts = copy.deepcopy(texts)
    for text in cleaned_texts:
        text.page_content = text.page_content.replace('\t', ' ')  # Replace tab characters with spaces
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(cleaned_texts, embeddings)
    return vectorstore

In [20]:
def encode_chapter_summaries(chapter_summaries):
  embeddings = OpenAIEmbeddings()
  chapter_summaries_vectorstore = FAISS.from_documents(chapter_summaries, embeddings)
  return chapter_summaries_vectorstore

In [21]:
modifier_prompt_template = """
You are an expert on analyzing and understanding books based solely on their textual content. You have no prior knowledge about the specific book in the question.
If you ever read the related book before, FORGET ANYTHING about it.
You have access to two vector stores:

• One containing all the book content divided into chunks of 1000 characters with 200 character overlap.
• Another containing summaries of each chapter, approximately 250 tokens each.

Given a user's question about the book, your task is to generate a list of no more than 3 sub-questions that can be used as queries to retrieve relevant information from the vector stores based on semantic similarity. These sub-questions should be designed to collectively cover all the information needed to answer the original question comprehensively, without relying on any prior knowledge of the book's plot, characters, or events.

When generating the sub-questions, consider the following:


1.No Pre-knowledge: you're unaware of the book's details like plot or characters. The sub-questions must not present any prior knowledge of the book.
2.Directly Derived: Create sub-questions strictly from the user's question, aiming to retrieve book information without presupposing specific plot details or events.
3.Break Down: Decompose the user's question into finer, detailed sub-questions using only the textual content provided.
4.Key Concepts: Identify essential information needed from the original question and form targeted sub-questions to gather this data.
5.Self-contained Queries: Each sub-question should stand alone for effective vector store querying through semantic similarity.
6.Logical Sequence: Arrange sub-questions in a logical order that collectively provides a thorough answer.
7.Efficiency: Ensure sub-questions are unique and focused to avoid redundant searches and streamline information retrieval.

Output your response as a Python list, where each item is a self-contained sub-question that can be used as a standalone query for vector retrieval.

User's question: {question}
"""
modifier_prompt = PromptTemplate(
input_variables=["question"],
template=modifier_prompt_template,
)

llm = ChatOpenAI(temperature=0, model_name="gpt-4-1106-preview", max_tokens=4000)

question_modifier_llm_chain = LLMChain(llm=llm, prompt=modifier_prompt)

In [22]:
def modify_question(question):
    result = question_modifier_llm_chain.invoke(input=question)
    modified_questions_str = result['text']
    print("Original Output:", modified_questions_str)

    # Remove the Markdown code block syntax and the variable assignment part
    clean_str = modified_questions_str.replace("```python", "").replace("```", "").strip()
    clean_str = clean_str.replace("sub_questions = ", "").strip()

    # Dedent the string to remove unexpected indents
    clean_str = textwrap.dedent(clean_str)

    try:
        modified_questions = ast.literal_eval(clean_str)
    except SyntaxError as e:
        print(f"Syntax error during ast.literal_eval: {e}")
        modified_questions = []  # Default to an empty list in case of error

    return modified_questions

In [23]:
question = "how did harry beat quirrell?"
modified_question = modify_question(question)

Original Output: ```python
[
    "What methods or strategies does Harry use to confront adversaries?",
    "What are the specific events leading to the defeat of Quirrell?",
    "What unique abilities or tools does Harry possess that aid in overcoming challenges?"
]
```


In [23]:
def create_context_per_question(question, multi_query_retriever, multi_query_retriever_chapter_summaries):
    # Retrieve relevant documents based on the question
    docs = multi_query_retriever.get_relevant_documents(question)
    docs_summaries = multi_query_retriever_chapter_summaries.get_relevant_documents(question)

    # Concatenate the relevant content with citation information
    context = " ".join(doc.page_content for doc in docs)

    # context = " ".join(f"{doc.page_content} (Chapter {doc.metadata['chapter']}, Page {doc.metadata['page']})" for doc in docs)
    context_summaries = "".join(f"{doc.page_content} (Chapter {doc.metadata['chapter']})" for doc in docs_summaries)

    return context, context_summaries

In [24]:
def is_similarity_ratio_lower_than_th(large_string, short_string, th):
  lcs = pylcs.lcs_sequence_length(large_string,short_string)
  if lcs / len(short_string) < th:
    return True
  return False

In [25]:
def answer_question_pipline(question,retriever,chapter_summaries_retriever, answer_from_context_llm_chain, multi_query_retriver_llm):
        multi_query_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=multi_query_retriver_llm)
        multi_query_retriever_chapter_summaries = MultiQueryRetriever.from_llm(retriever=chapter_summaries_retriever, llm=multi_query_retriver_llm)

        modified_questions = modify_question(question)
        all_context_book = ""
        all_context_summaries = ""
        similarity_th = 0.5


        for modified_question in modified_questions:
            curr_question_relevant_context, curr_question_relevant_summaries_context = create_context_per_question(modified_question, multi_query_retriever, multi_query_retriever_chapter_summaries)

            if is_similarity_ratio_lower_than_th(all_context_book, curr_question_relevant_context, similarity_th):
                all_context_book += curr_question_relevant_context
            if is_similarity_ratio_lower_than_th(all_context_summaries, curr_question_relevant_summaries_context, similarity_th):
                all_context_summaries += curr_question_relevant_summaries_context

        all_context = all_context_book + all_context_summaries

        input_data = {
        "context": all_context,  # Your context string
        "question": question  # Your question string
        }

        # Execute the chain and get the response
        result = answer_from_context_llm_chain.invoke(input=input_data)
        return result, all_context_book, all_context_summaries


In [102]:
def chat_with_data(retriever,chapter_summaries_retriever, answer_from_context_llm_chain, multi_query_retriver_llm):
    print("You can start chatting with me about Harry Potter. Type 'exit' to stop.")
    while True:
        # Prompt the user for a question
        question = input("What's your question? \n")

        # Check if the user wants to exit
        if question.lower() == 'exit':
            print("Exiting chat. Goodbye!")
            break

        # Execute the chain and get the response
        result, _, _ = answer_question_pipline(question,retriever,chapter_summaries_retriever, answer_from_context_llm_chain, multi_query_retriver_llm)

        # Print the response

        print("Answer:")
        wrapped_result = textwrap.fill(result['text'], width=120)
        print(wrapped_result)
        print("-" * 80)  # Print a separator line for readability

In [26]:
vector_store = encode_book(hp_pdf_path,chunk_size=1000,chunk_overlap=200)

  warn_deprecated(


In [27]:
chapter_summaries_vector_store = encode_chapter_summaries(chapter_summaries)

In [28]:
retriever=vector_store.as_retriever(search_kwargs={"k": 4})

In [29]:
chapter_summaries_retriever = chapter_summaries_vector_store.as_retriever(search_kwargs={"k": 4})

In [30]:
agent_answer_prompt_template = """
Based solely on the information provided in this context, and without using any information outside of this context, please answer the following question as concisely and as shortly as possible. You can rephrase the question for better fitting to the context.

Context:{context}
Question:{question}

**If the answer cannot be derived from the context, or if it requires knowledge from outside sources, simply answer: "I don't know".**

Please cite specific parts of the context in your answer to demonstrate how it supports your response.
If the chapter number of the relevant context appears, specify it in your answer.
"""
agent_answer_prompt = PromptTemplate(
input_variables=["context", "question"],
template=agent_answer_prompt_template,
)

multi_query_retriver_llm = ChatOpenAI(temperature=0, model_name="gpt-4-1106-preview", max_tokens=4000)

answer_from_context_llm_chain = LLMChain(llm=llm, prompt=agent_answer_prompt)

In [79]:
question = 'how did harry beat quirrell?'

In [103]:
result, all_context_book, all_context_summaries = answer_question_pipline(question,retriever,chapter_summaries_retriever, answer_from_context_llm_chain, multi_query_retriver_llm)

Original Output: ```python
[
    "Which character in the book has an owl as a pet?",
    "What are the names of the pets mentioned in the story?",
    "How is Harry's pet owl referred to or addressed in the text?"
]
```


In [98]:
all_context



In [107]:
wrapped_all_context_book = textwrap.fill(all_context_book, width=120)
print(f' conetxt book: {wrapped_all_context_book} \n')

wrapped_all_context_summaries = textwrap.fill(all_context_summaries, width=120)
print(f' context summaries: {wrapped_all_context_summaries} \n')

wrapped_result = textwrap.fill(result['text'], width=120)
print(f' answer: {wrapped_result}')

 conetxt book: without further clues.       Neither Neville nor Hermione showed the slightest interest in what lay underneath the dog
and the trapdoor. All Neville cared about was never going near the dog again.       Hermione was now refusing to speak
to Harry and Ron, but she was such a bossy know-it-all that they saw this as an added bonus. All they really wanted now
was a way of getting back at Malfoy, and to their great delight, just such a thing arrived in the mail about a week
later.       As the owls flooded into the Great Hall as usual, everyone’s attention was caught at once by a long, thin
package carried by six large screech owls. Harry was just as interested as everyone else to see what was in this large
parcel, and was amazed when the owls soared down and dropped it right in front of him, knocking his bacon to the floor.
They had hardly fluttered out of the way when another owl dropped a letter on top of the parcel. toads went outta
fashion years ago, yeh’d be laughed at 

## Model Evaluation


In [32]:
questions = [
    "Who gave Harry Potter his first broomstick?",
    "What is the name of the three-headed dog guarding the Sorcerer's Stone?",
    "Which house did the Sorting Hat initially consider for Harry?",
    "What is the name of Harry's owl?"
]
#     "How did Harry and his friends get past Fluffy?",
#     "What is the Mirror of Erised?",
#     "Who tried to steal the Sorcerer's Stone?",
#     "How did Harry defeat Quirrell/Voldemort?",
#     "What is Harry's parent's secret weapon against Voldemort?",
# ]

ground_truth_answers = [
    "Professor McGonagall",
    "Fluffy",
    "Slytherin",
    "Hedwig",
    # "They played music to put Fluffy to sleep.",
    # "A magical mirror that shows the 'deepest, most desperate desire of our hearts.'",
    # "Professor Quirrell, possessed by Voldemort",
    # "Harry's mother's love protected him, causing Quirrell/Voldemort pain when they touched him.",
    # "Love",
]

In [33]:
generated_answers = []
retrieved_documents = []
for question in questions:
    result, all_context_book, all_context_summaries = answer_question_pipline(question, retriever, chapter_summaries_retriever, answer_from_context_llm_chain, multi_query_retriver_llm)
    generated_answers.append(result['text'])
    retrieved_documents.append(all_context_book + all_context_summaries)


Original Output: ```python
[
    "Which character presents Harry Potter with a broomstick for the first time?",
    "In which part of the book does Harry Potter receive his initial broomstick?",
    "What are the circumstances surrounding Harry Potter obtaining his first broomstick?"
]
```
Original Output: ```python
[
    "What creature is guarding an important object in the story?",
    "How many heads does the guard creature have?",
    "What is the name given to the creature with multiple heads?"
]
```
Original Output: ```python
[
    "What are the characteristics of the different houses mentioned by the Sorting Hat?",
    "How does the Sorting Hat determine the house for new students?",
    "What interactions or discussions does Harry have with the Sorting Hat about house placement?"
]
```
Original Output: ```python
[
    "Which character in the book has an owl as a pet?",
    "What are the names of any pets mentioned in the story?",
    "How is Harry's pet owl referred to or addre

In [119]:
print(retrieved_documents)



In [34]:
generated_answers

['Harry Potter received his first broomstick from Professor McGonagall. This is supported by the context provided from Chapter Ten of "Harry Potter and the Sorcerer\'s Stone," which states: "Harry receives a Nimbus Two Thousand broomstick as a gift from Professor McGonagall and is thrilled."',
 'The name of the three-headed dog guarding the Sorcerer\'s Stone is Fluffy. This is supported by the context in Chapter Sixteen, where it is mentioned: "Harry becomes suspicious and rushes to confront Hagrid. Hagrid inadvertently reveals that he told the stranger how to get past Fluffy, the three-headed dog guarding the Sorcerer\'s Stone."',
 'The Sorting Hat initially considered Slytherin for Harry. This is supported by the context in Chapter Seven, where the Sorting Hat\'s song mentions, "Or perhaps in Slytherin / You’ll make your real friends, / Those cunning folk use any means / To achieve their ends." This suggests that the Sorting Hat saw potential in Harry for Slytherin\'s defining traits

In [36]:


# Prepare data for Ragas evaluation
data_samples = {
    'question': questions,  # Replace with your list of questions
    'answer': generated_answers,  # Replace with your list of generated answers
    'contexts': retrieved_documents,  # Your retrieved_documents list
    'ground_truth': ground_truth_answers  # Replace with your list of ground truth answers
}

# Convert contexts to list of strings (if necessary)
data_samples['contexts'] = [list(context) for context in data_samples['contexts']]

dataset = Dataset.from_dict(data_samples)

# Evaluate using Ragas with the specified metrics
metrics = [
    answer_correctness,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity
]
llm = ChatOpenAI(temperature=0, model_name="gpt-4-1106-preview", max_tokens=4000)
score = evaluate(dataset, metrics=metrics, llm=llm)

# Print results and explanations
results_df = score.to_pandas()
print(results_df)


Evaluating:  30%|███       | 6/20 [00:09<00:24,  1.78s/it]No statements were generated from the answer.
Evaluating: 100%|██████████| 20/20 [00:49<00:00,  2.46s/it]


                                            question  \
0        Who gave Harry Potter his first broomstick?   
1  What is the name of the three-headed dog guard...   
2  Which house did the Sorting Hat initially cons...   
3                   What is the name of Harry's owl?   

                                              answer  \
0  Harry Potter received his first broomstick fro...   
1  The name of the three-headed dog guarding the ...   
2  The Sorting Hat initially considered Slytherin...   
3  The name of Harry's owl is not provided in the...   

                                            contexts          ground_truth  \
0  [H, P,  , 1,  , -,  , H, a, r, r, y,  , P, o, ...  Professor McGonagall   
1  [c, l, i, m, b, e, d,  , b, a, c, k,  , i, n, ...                Fluffy   
2  [S, l, y, t, h, e, r, i, n, .,  , E, a, c, h, ...             Slytherin   
3  [w, i, t, h, o, u, t,  , f, u, r, t, h, e, r, ...                Hedwig   

   answer_correctness  faithfulness  answer_rel

In [42]:
import pandas as pd
def analyse_metric_results(results_df):
    for metric_name, metric_value in results_df.items():
        print(f"\n**{metric_name.upper()}**")

        # Extract the numerical value from the Series object
        if isinstance(metric_value, pd.Series):
            metric_value = metric_value.values[0]  # Assuming the value is at index 0

        if metric_name == "faithfulness":
            print("Measures how well the generated answer is supported by the retrieved documents.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates better faithfulness.
        elif metric_name == "answer_relevancy":
            print("Measures how relevant the generated answer is to the question.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates better relevance.
        elif metric_name == "context_precision":
            print("Measures the proportion of retrieved documents that are actually relevant.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates better precision (avoiding irrelevant documents).
        elif metric_name == "context_relevancy":
            print("Measures how relevant the retrieved documents are to the question.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates better relevance of retrieved documents.
        elif metric_name == "context_recall":
            print("Measures the proportion of relevant documents that are successfully retrieved.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates better recall (finding all relevant documents).
        elif metric_name == "context_entity_recall":
            print("Measures the proportion of relevant entities mentioned in the question that are also found in the retrieved documents.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates better recall of relevant entities.
        elif metric_name == "answer_similarity":
            print("Measures the semantic similarity between the generated answer and the ground truth answer.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates closer semantic meaning between the answers.
        elif metric_name == "answer_correctness":
            print("Measures whether the generated answer is factually correct.")
            print(f"Score: {metric_value:.4f}")
            # Interpretation: Higher score indicates better correctness.

In [43]:
analyse_metric_results(results_df)


**QUESTION**

**ANSWER**

**CONTEXTS**

**GROUND_TRUTH**

**ANSWER_CORRECTNESS**
Measures whether the generated answer is factually correct.
Score: 0.5879

**FAITHFULNESS**
Measures how well the generated answer is supported by the retrieved documents.
Score: 1.0000

**ANSWER_RELEVANCY**
Measures how relevant the generated answer is to the question.
Score: 1.0000

**CONTEXT_RECALL**
Measures the proportion of relevant documents that are successfully retrieved.
Score: 1.0000

**ANSWER_SIMILARITY**
Measures the semantic similarity between the generated answer and the ground truth answer.
Score: 0.8516


In [None]:
chat_with_data(retriever,chapter_summaries_retriever, answer_from_context_llm_chain, multi_query_retriver_llm)

You can start chatting with me about Harry Potter. Type 'exit' to stop.
