In [1]:
## Getting to Main directory
import os
os.chdir("../")

In [2]:
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv()
gemini_api_key=os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))



### Data ingestion - Taking pdf documents and Cleaning and Transforming  Data into vector index

In [4]:
from langchain_community.document_loaders import DirectoryLoader

In [75]:
loader = DirectoryLoader('Data2', glob="**/*.pdf")

In [61]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("Data/[Data] Think Like a Data Scientist (2017).pdf")
pages = loader.load_and_split()

incorrect startxref pointer(1)


In [9]:
# docs = loader.load()

In [10]:
# pages

In [80]:
len(pages)

325

In [81]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# text_chunks = text_splitter.split_documents(docs)
text_chunks = text_splitter.split_documents(pages)

In [82]:
len(text_chunks)

1042

In [83]:
# Clean up our Documents' content
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    :param content: Text input.
    
    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters
    unwanted_patterns = [
        "\\n", "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

# Call function
cleaned_docs = []
for d in text_chunks: 
    cleaned_text = clean_up_text(d.page_content)
    d.page_content = cleaned_text
    cleaned_docs.append(d)

In [85]:
text_chunks[2].page_content

'Think Likea Data ScientistTACKLE THE DATA SCIENCE PROCESS STEP-BY-STEPBRIAN GODSEYMANNINGSHELTER ISLAND'

In [86]:
cleaned_docs[0].metadata

{'source': 'Data/[Data] Think Like a Data Scientist (2017).pdf', 'page': 0}

### Configuring Gemini model and GeminiEmbedding

In [87]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings


In [88]:
genai.configure(api_key=gemini_api_key)
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Setting Tempreture to 0.3 for getting low risk results

model = ChatGoogleGenerativeAI(model="gemini-1.0-pro-latest",api_key=gemini_api_key,temperature=0.3)

### Using FAISS as our vector database taking the index to save them locally

In [89]:
vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)
vector_store.save_local("faiss_index")

### Applying proper prompts to get best output

In [90]:
prompt_template = """
Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
Context:\n {context}?\n
Question: \n{question}\n

Answer:
"""

# model = ChatGoogleGenerativeAI(model="gemini-pro",
#                             temperature=0.3)

prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])

prompt
# return chain

PromptTemplate(input_variables=['context', 'question'], template='\nAnswer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in\nprovided context just say, "answer is not available in the context", don\'t provide the wrong answer\n\n\nContext:\n {context}?\n\nQuestion: \n{question}\n\n\nAnswer:\n')

In [91]:
from langchain.prompts.chat import ChatPromptTemplate
qa_prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
qa_prompt


ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='\nAnswer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in\nprovided context just say, "answer is not available in the context", don\'t provide the wrong answer\n\n\nContext:\n {context}?\n\nQuestion: \n{question}\n\n\nAnswer:\n'))])

In [92]:
chain = load_qa_chain(llm=model, chain_type="stuff", verbose=True, prompt=qa_prompt)


In [72]:
user_question = "Tell me about the phrase ` Priorities: knowledge first, technology second, opinions third` "

In [93]:
new_db = FAISS.load_local("faiss_index", embeddings,allow_dangerous_deserialization=True)
docs = new_db.similarity_search(user_question)


In [94]:
docs[0]

Document(metadata={'source': 'Data/[Data] Think Like a Data Scientist (2017).pdf', 'page': 35}, page_content='12 CHAPTER 1Philosophies of data science1.6 Priorities: knowledge first, technology second, opinions thirdThis section title is an adage of mine. I use it to help settle disputes in the never-endingbattle between the various concerns of ever y data science project—for example, software versus statistics, changing business need versus project timeli ne, data quality versus accuracy of results. Each individual concern pushes and pulls on the others as aproject progresses, and we’re forced to make choices whenever two of them disagreeon a course of action. I’ve developed a simple framework to help with that. Knowledge, technology, and opinions are typically what you have at the beginningof any project; they are the three things that turn data into answers. Knowledge is whatyou know for a fact. Technology is the set of tools you have at your disposal. Opinions arethose little almos

In [95]:
model

ChatGoogleGenerativeAI(model='models/gemini-1.0-pro-latest', google_api_key=SecretStr('**********'), temperature=0.3, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000002B6C3FCCAF0>, async_client=<google.ai.generativelanguage_v1beta.services.generative_service.async_client.GenerativeServiceAsyncClient object at 0x000002B6C3FDC220>, default_metadata=())

In [96]:
response = chain(
        {"input_documents":docs, "question": user_question}
        , return_only_outputs=True)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mHuman: 
Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
provided context just say, "answer is not available in the context", don't provide the wrong answer


Context:
 12 CHAPTER 1Philosophies of data science1.6 Priorities: knowledge first, technology second, opinions thirdThis section title is an adage of mine. I use it to help settle disputes in the never-endingbattle between the various concerns of ever y data science project—for example, software versus statistics, changing business need versus project timeli ne, data quality versus accuracy of results. Each individual concern pushes and pulls on the others as aproject progresses, and we’re forced to make choices whenever two of them disagreeon a course of action. I’ve developed a simple framework to help with that.

In [97]:
print(response["output_text"])

This phrase is an adage that helps settle disputes in the never-ending battle between the various concerns of every data science project. It establishes a hierarchy for thought processes so that less-important things don't steamroll more-important ones. In practice, the hierarchy looks like this:

1. Knowledge first: Get to know your problem, your data, your approach, and your goal before you do anything else, and keep those at the forefront of your mind.
2. Technology second: Software is a tool that serves you. It both enables and constrains you. It shouldn't dictate your approach to the problem except in extenuating circumstances.
3. Opinions third: Opinions, intuition, and wishful thinking are to be used only as guides toward theories that can be proven correct and not as the focus of any project.


In [42]:
eval_questions = [
    "How many baby birds did the mother bird have?",
    "What happened to the mother bird during the storm?",
    "Where did the baby birds end up after the storm?",
    "What happened to one of the baby birds near a cave?",
    "Where did the other baby bird land?",
    "Who came to the forest one day?",
    "What did the king see under a tree near a cave?",
    "What did the king hear from the big, brown bird?",
    "Where did the king go after leaving the cave?",
    "Who welcomed the king at the ashram?",
    "What did the rishi say about the two birds?",
    "What did the king tell the rishi about the birds?",
    "What did the rishi say to the king about the birds?",
    "What did the rishi compare the birds' behavior to?",
    "Which moral best summarizes the story?"
]

eval_answers = [
    "The mother bird had two baby birds.",
    "The mother bird was killed by a heavy branch during the storm.",
    "The baby birds ended up separated from each other after the storm.",
    "One of the baby birds came down near a cave where a gang of robbers lived.",
    "The other baby bird landed outside a rishi’s ashram a little distance away.",
    "The king of the country came to the forest one day.",
    "The king saw a big, brown bird under a tree near a cave.",
    "The king heard the big, brown bird urging someone to take his jewels and horse quickly.",
    "After leaving the cave, the king came to a clearing that looked like an ashram.",
    "The king was welcomed at the ashram by a big, brown bird.",
    "The rishi said that one bird had made friends with robbers while the other welcomed people to the ashram.",
    "The king told the rishi the story of the two birds and their different behaviors.",
    "The rishi said that one is known by the company one keeps.",
    "The rishi compared the birds' behavior to the influence of companionship.",
    "The moral 'One is known by the company one keeps' best summarizes the story."
]


In [44]:
examples = [
    {"query": q, "ground_truth": [eval_answers[i]]}
    for i, q in enumerate(eval_questions)
]

In [45]:
examples

[{'query': 'How many baby birds did the mother bird have?',
  'ground_truth': ['The mother bird had two baby birds.']},
 {'query': 'What happened to the mother bird during the storm?',
  'ground_truth': ['The mother bird was killed by a heavy branch during the storm.']},
 {'query': 'Where did the baby birds end up after the storm?',
  'ground_truth': ['The baby birds ended up separated from each other after the storm.']},
 {'query': 'What happened to one of the baby birds near a cave?',
  'ground_truth': ['One of the baby birds came down near a cave where a gang of robbers lived.']},
 {'query': 'Where did the other baby bird land?',
  'ground_truth': ['The other baby bird landed outside a rishi’s ashram a little distance away.']},
 {'query': 'Who came to the forest one day?',
  'ground_truth': ['The king of the country came to the forest one day.']},
 {'query': 'What did the king see under a tree near a cave?',
  'ground_truth': ['The king saw a big, brown bird under a tree near a cave

In [46]:
user_question = eval_questions[5]

In [47]:
user_question

'Who came to the forest one day?'

In [48]:
new_db = FAISS.load_local("faiss_index",embeddings,allow_dangerous_deserialization=True)
docs = new_db.similarity_search(user_question)

docs

[Document(page_content='would the crocodile tell his wife?Rationalised 2023-24', metadata={'source': 'Data\\fepw105.pdf'}),
 Document(page_content='Rationalised 2023-24', metadata={'source': 'Data\\fepw106.pdf'}),
 Document(page_content='Are you interested in music? Do you like classical music? Name a few distinguished Indian musicians.Rationalised 2023-24', metadata={'source': 'Data\\fepw104.pdf'}),
 Document(page_content='Days passed and the baby birds became big birds. One day, the king of the country came to the forest to hunt. He saw a deer and rode after it. It ran deep into the forest followed by the king. Soon the king lost his way and didn’t know where he was.He rode on for a long time till he came to the other side of the forest. Very tired by now, he got off his horse and sat down under a tree that stood near a cave. Suddenly he heard a voice cry out, “Quick! Hurry up! There’s someone under the tree. Come and take his jewels and his horse. Hurry, or else he’ll slip away.” Th

In [49]:
response = chain(
        {"input_documents":docs, "question": user_question})



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mHuman: 
Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
provided context just say, "answer is not available in the context", don't provide the wrong answer


Context:
 would the crocodile tell his wife?Rationalised 2023-24

Rationalised 2023-24

Are you interested in music? Do you like classical music? Name a few distinguished Indian musicians.Rationalised 2023-24

Days passed and the baby birds became big birds. One day, the king of the country came to the forest to hunt. He saw a deer and rode after it. It ran deep into the forest followed by the king. Soon the king lost his way and didn’t know where he was.He rode on for a long time till he came to the other side of the forest. Very tired by now, he got off his horse and sat down under a tree that stood near a cave. 

In [50]:
response["input_documents"][0].page_content

'would the crocodile tell his wife?Rationalised 2023-24'

In [51]:
print(response["output_text"])

## Who Came to the Forest? 

**The king of the country** came to the forest one day to hunt. 



In [52]:

import nest_asyncio

nest_asyncio.apply()


In [53]:
response

{'input_documents': [Document(page_content='would the crocodile tell his wife?Rationalised 2023-24', metadata={'source': 'Data\\fepw105.pdf'}),
  Document(page_content='Rationalised 2023-24', metadata={'source': 'Data\\fepw106.pdf'}),
  Document(page_content='Are you interested in music? Do you like classical music? Name a few distinguished Indian musicians.Rationalised 2023-24', metadata={'source': 'Data\\fepw104.pdf'}),
  Document(page_content='Days passed and the baby birds became big birds. One day, the king of the country came to the forest to hunt. He saw a deer and rode after it. It ran deep into the forest followed by the king. Soon the king lost his way and didn’t know where he was.He rode on for a long time till he came to the other side of the forest. Very tired by now, he got off his horse and sat down under a tree that stood near a cave. Suddenly he heard a voice cry out, “Quick! Hurry up! There’s someone under the tree. Come and take his jewels and his horse. Hurry, or el

## RAG model Evaluation using RAGAS; Currently RAGAS is not supporting Gemini, but I do hope it would be available in future. Regardless of that, Integrating Open AI with RAGAS won't be a problem if we already have access to it's api key

In [55]:
from ragas.langchain.evalchain  import RagasEvaluatorChain
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# create evaluation chains
# faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)
# answer_rel_chain = RagasEvaluatorChain(metric=answer_relevancy)
# context_rel_chain = RagasEvaluatorChain(metric=context_precision)
# context_recall_chain = RagasEvaluatorChain(metric=context_recall)

In [56]:
import pandas as pd

In [57]:
# Create a DataFrame
test_df = pd.DataFrame({
    'question': eval_questions,
    'ground_truth': eval_answers
})

# Print the DataFrame
print(test_df)

                                             question  \
0       How many baby birds did the mother bird have?   
1   What happened to the mother bird during the st...   
2    Where did the baby birds end up after the storm?   
3   What happened to one of the baby birds near a ...   
4                 Where did the other baby bird land?   
5                     Who came to the forest one day?   
6     What did the king see under a tree near a cave?   
7    What did the king hear from the big, brown bird?   
8       Where did the king go after leaving the cave?   
9                Who welcomed the king at the ashram?   
10        What did the rishi say about the two birds?   
11  What did the king tell the rishi about the birds?   
12  What did the rishi say to the king about the b...   
13  What did the rishi compare the birds' behavior...   
14             Which moral best summarizes the story?   

                                         ground_truth  
0                 The mother bi

In [58]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [59]:
test_questions = test_questions[:2]
test_groundtruths = test_groundtruths[:2]

In [60]:
test_questions

['How many baby birds did the mother bird have?',
 'What happened to the mother bird during the storm?']

In [61]:
answers = []
contexts = []

for question in test_questions:
      new_db = FAISS.load_local("faiss_index",embeddings,allow_dangerous_deserialization=True)
      docs = new_db.similarity_search(user_question)

      response = chain(
            {"input_documents":docs, "question": question})
      # response = retrieval_augmented_qa_chain.invoke({"question" : question})
      answers.append(response["output_text"])
      contexts.append([context.page_content for context in response["input_documents"]])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mHuman: 
Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
provided context just say, "answer is not available in the context", don't provide the wrong answer


Context:
 would the crocodile tell his wife?Rationalised 2023-24

Rationalised 2023-24

Are you interested in music? Do you like classical music? Name a few distinguished Indian musicians.Rationalised 2023-24

Days passed and the baby birds became big birds. One day, the king of the country came to the forest to hunt. He saw a deer and rode after it. It ran deep into the forest followed by the king. Soon the king lost his way and didn’t know where he was.He rode on for a long time till he came to the other side of the forest. Very tired by now, he got off his horse and sat down under a tree that stood near a cave. 

In [62]:
test_groundtruths

['The mother bird had two baby birds.',
 'The mother bird was killed by a heavy branch during the storm.']

In [67]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [80]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truths" : test_groundtruths
})

In [88]:
from datasets import Dataset, Features, Sequence, Value

# Convert each ground truth string into a list containing that string
test_groundtruths_list = [[gt] for gt in test_groundtruths]

# Define the features with the correct type for "ground_truths"
features = Features({
    "question": Value("string"),
    "answer": Value("string"),
    "contexts": Value("string"),
    "ground_truths": Sequence(Sequence(Value("string")))
})

# Create the dataset
response_dataset = Dataset.from_dict({
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": test_groundtruths_list
})

# Now, you should be able to use the dataset without encountering the ValueError


In [89]:
response_dataset[0]

{'question': 'How many baby birds did the mother bird have?',
 'answer': '## Analyzing the Context for Baby Bird Count:\n\nThe provided text describes a story involving a king, a deer, and birds but **does not specify the number of baby birds the mother bird had**. Therefore, based on the context provided, the answer is: \n\n**Answer is not available in the context** \n',
 'contexts': ['would the crocodile tell his wife?Rationalised 2023-24',
  'Rationalised 2023-24',
  'Are you interested in music? Do you like classical music? Name a few distinguished Indian musicians.Rationalised 2023-24',
  'Days passed and the baby birds became big birds. One day, the king of the country came to the forest to hunt. He saw a deer and rode after it. It ran deep into the forest followed by the king. Soon the king lost his way and didn’t know where he was.He rode on for a long time till he came to the other side of the forest. Very tired by now, he got off his horse and sat down under a tree that stood

In [90]:
response_dataset[0]

{'question': 'How many baby birds did the mother bird have?',
 'answer': '## Analyzing the Context for Baby Bird Count:\n\nThe provided text describes a story involving a king, a deer, and birds but **does not specify the number of baby birds the mother bird had**. Therefore, based on the context provided, the answer is: \n\n**Answer is not available in the context** \n',
 'contexts': ['would the crocodile tell his wife?Rationalised 2023-24',
  'Rationalised 2023-24',
  'Are you interested in music? Do you like classical music? Name a few distinguished Indian musicians.Rationalised 2023-24',
  'Days passed and the baby birds became big birds. One day, the king of the country came to the forest to hunt. He saw a deer and rode after it. It ran deep into the forest followed by the king. Soon the king lost his way and didn’t know where he was.He rode on for a long time till he came to the other side of the forest. Very tired by now, he got off his horse and sat down under a tree that stood

In [None]:

advanced_retrieval_results = evaluate(response_dataset, metrics)