In [1]:
#Imports
import config 
import shutil
import requests
from urllib.parse import urlparse
import config
import sys
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from langchain_core.language_models import BaseChatModel
import json
import datasets
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models import ChatHuggingFace
import os
import random
import time
from datasets import Dataset, DatasetDict

# Read every notebook setting from config.py up front.
pdfPath = config.pdfPath
hf_token = config.hf_token
model_id = config.model_id
title = config.title
file_path = config.file_path

# Fail fast on the first setting that was left as None in config.py.
# The checks run in the order listed, each with its own error message.
_required_settings = (
    (pdfPath, "pdfPath is None. Please set the  pdf path in config.py."),
    (hf_token, "hf_token is None. Please set the huggingFace token in config.py."),
    (model_id, "model_id is None. Please set the model_id in config.py."),
    (title, "title is None. Please set the title of the Pdf in config.py."),
    (file_path, "file_path is None. Please set the local file_path in config.py."),
)
for _setting_value, _error_message in _required_settings:
    if _setting_value is None:
        raise ValueError(_error_message)

# Expose the token through the environment rather than passing it to each client.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = hf_token

# Build the Hugging Face Hub text-generation LLM for the configured model_id.
# No token is passed explicitly here; HUGGINGFACEHUB_API_TOKEN is set in the
# environment earlier in this cell.
_generation_kwargs = {
    "max_new_tokens": 512,
    "top_k": 30,
    "temperature": 0.1,
    "repetition_penalty": 1.03,
}
llm = HuggingFaceHub(
    repo_id=model_id,
    task="text-generation",
    model_kwargs=_generation_kwargs,
)

# Wrap the LLM in the chat interface used by every agent below.
# NOTE(review): the meaning of token=False is not visible here — confirm against the ChatHuggingFace docs.
chat_model = ChatHuggingFace(llm=llm, token=False)

  warn_deprecated(
  warn_deprecated(
                    repo_id was transferred to model_kwargs.
                    Please confirm that repo_id is what you intended.
                    task was transferred to model_kwargs.
                    Please confirm that task is what you intended.
                    huggingfacehub_api_token was transferred to model_kwargs.
                    Please confirm that huggingfacehub_api_token is what you intended.


1- Loading the PDF File

In [2]:
# Create the loader for the PDF configured in config.py (nothing is read yet in this cell).
loader = PyPDFLoader(file_path=pdfPath)

2- Splitting the PDF Text Into Chunks (Splits)

In [3]:
# Splitter configuration: chunk_size=2000 with chunk_overlap=200, trying the
# separators in the order given (paragraph break, newline, period, space,
# then the empty string). add_start_index=True is passed so the splitter
# records a start index for each chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    add_start_index=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

# Load the PDF and split it in one step. A failure is re-raised with context
# instead of only being printed: if it were swallowed, `langchain_docs` would
# stay undefined and the next cell would fail with an unrelated NameError.
try:
    langchain_docs = loader.load_and_split(text_splitter=text_splitter)
except Exception as e:
    raise RuntimeError(
        f"Failed to load and split the PDF at {pdfPath!r}: {e}"
    ) from e

In [4]:
# Report how many splits were produced from the loaded document.
print("Number of Splits are : {}".format(len(langchain_docs)))

Number of Splits are : 37


3- Splits: Each Split Serves as the Context Given to the LLM for Generating a Question and Answer

In [5]:
# Pick one split at random; it is the context used to generate the question/answer pair.
# No random seed is set in this cell, so the selected split can differ between runs.
context = random.choice(langchain_docs)
# Show the raw text of the selected split.
print(context.page_content)

12
January 2023
Motor Vehicle Accident Claims Fund
If you don’t have private medical insurance 
and you have exhausted all of the benefits 
available to you under the Section B 
portion of your automobile policy, you 
may be able to obtain coverage for 
further medical expenses by making an 
application to the Motor Vehicle Accident 
Claims Fund. This is a fund set up by the 
Government of Alberta to help people 
injured in motor vehicle accidents who 
need medical treatment but don’t have 
any other form of insurance coverage for 
medical expenses available to them.
If the accident was caused by someone 
else’s negligence, any expenses not 
covered by one or more of the sources 
noted above can be claimed in a personal 
injury action brought against the at-fault 
party or parties.
3. WAGE REPLACEMENT/DISABILITY 
BENEFITS
Not being able to work for a period of 
time because of injuries sustained in 
an accident, can cause you extreme 
financial hardship. Fortunately, there 
are several

4- Question Answer Generation LLM Agent (Agent1)

In [6]:
from langchain.prompts import ChatPromptTemplate

# Prompt asking the model for one factoid question/answer pair about {context}.
QA_generation_prompt = """
Your task is to develop factoid questions and  answers from  a given context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)



Now here is the context.

Context: {context}\n
Output:::"""

# Agent 1: prompt template piped into the chat model.
QA_generation_prompt = ChatPromptTemplate.from_template(QA_generation_prompt)
QA_generation_agent = QA_generation_prompt | chat_model
output_QA_couple = QA_generation_agent.invoke({"context": context.page_content}).content

# The parsing below reads the piece after the SECOND "Factoid question: " /
# "Answer: " marker (index 2).
# NOTE(review): presumably the returned text echoes the prompt, whose format
# section contains each marker once — confirm; if the prompt is not echoed,
# these lookups raise IndexError.
question = output_QA_couple.split("Factoid question: ")[2].split("Answer: ")[0].strip()
answer = output_QA_couple.split("Answer: ")[2]

# Keep only the text before any further "Factoid question:" the model produced.
# partition() returns the whole string when the marker is absent; the previous
# find()-based slice used index -1 in that case and dropped the last character.
answer = answer.partition("Factoid question:")[0].strip()

# Record built up by the following cells (critique scores are added later).
datapoint = {
    "context": context.page_content,
    "question": question,
    "answer": answer,
    "source_doc": context.metadata["source"],
}
print(f"Context: {context.page_content}\n\n")
print(f"question Generated : {question}")
print(f"answer Generated : {answer}")

Context: 12
January 2023
Motor Vehicle Accident Claims Fund
If you don’t have private medical insurance 
and you have exhausted all of the benefits 
available to you under the Section B 
portion of your automobile policy, you 
may be able to obtain coverage for 
further medical expenses by making an 
application to the Motor Vehicle Accident 
Claims Fund. This is a fund set up by the 
Government of Alberta to help people 
injured in motor vehicle accidents who 
need medical treatment but don’t have 
any other form of insurance coverage for 
medical expenses available to them.
If the accident was caused by someone 
else’s negligence, any expenses not 
covered by one or more of the sources 
noted above can be claimed in a personal 
injury action brought against the at-fault 
party or parties.
3. WAGE REPLACEMENT/DISABILITY 
BENEFITS
Not being able to work for a period of 
time because of injuries sustained in 
an accident, can cause you extreme 
financial hardship. Fortunately, there 
ar

5- Groundedness Score Generation LLM Agent

In [7]:
# Prompt asking the model to rate (1-5) how unambiguously {question} can be
# answered from {context}, with a written rationale.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, 
and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_groundedness_critique_prompt = ChatPromptTemplate.from_template(question_groundedness_critique_prompt)
question_groundedness_critique_agent = question_groundedness_critique_prompt | chat_model

# Run the critique on the question/context pair generated above.
_groundedness_inputs = {"context": context.page_content, "question": question}
question_groundedness_evaluation = question_groundedness_critique_agent.invoke(_groundedness_inputs).content

# Split once on the rating marker. The score is the first character of piece 2;
# the rationale is whatever follows "Evaluation: " inside piece 1.
# These lookups raise IndexError/ValueError if the output does not have that shape.
_groundedness_pieces = question_groundedness_evaluation.split("Total rating: ")
groundedness_score = int(_groundedness_pieces[2][0])
groundedness_eval = _groundedness_pieces[1].split("Evaluation: ")[1]

# Attach the critique to the datapoint record.
datapoint["groundedness_score"] = groundedness_score
datapoint["groundedness_eval"] = groundedness_eval

print(f"Context: {context.page_content}\n\n")
print(f"question: {question}\n\n")
print(f"groundedness_score: {groundedness_score}")
print(f"groundedness_eval: {groundedness_eval}")

Context: 12
January 2023
Motor Vehicle Accident Claims Fund
If you don’t have private medical insurance 
and you have exhausted all of the benefits 
available to you under the Section B 
portion of your automobile policy, you 
may be able to obtain coverage for 
further medical expenses by making an 
application to the Motor Vehicle Accident 
Claims Fund. This is a fund set up by the 
Government of Alberta to help people 
injured in motor vehicle accidents who 
need medical treatment but don’t have 
any other form of insurance coverage for 
medical expenses available to them.
If the accident was caused by someone 
else’s negligence, any expenses not 
covered by one or more of the sources 
noted above can be claimed in a personal 
injury action brought against the at-fault 
party or parties.
3. WAGE REPLACEMENT/DISABILITY 
BENEFITS
Not being able to work for a period of 
time because of injuries sustained in 
an accident, can cause you extreme 
financial hardship. Fortunately, there 
ar

6 - Relevance Score Generation LLM Agent

In [8]:
# Step 6: rate how useful the generated question is for the document {title}.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to  {title}\n
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here is the question.

Question: {question}\n
Answer::: """
question_relevance_critique_prompt = ChatPromptTemplate.from_template(question_relevance_critique_prompt)
question_relevance_critique_agent = question_relevance_critique_prompt | chat_model

# Run the critique for the configured title and the generated question.
_relevance_inputs = {"title": title, "question": question}
question_relevance_evaluation = question_relevance_critique_agent.invoke(_relevance_inputs).content

# Split once on the rating marker. The score is the first character of piece 2;
# the rationale is whatever follows "Evaluation: " inside piece 1.
# These lookups raise IndexError/ValueError if the output does not have that shape.
_relevance_pieces = question_relevance_evaluation.split("Total rating: ")
relevance_score = int(_relevance_pieces[2][0])
relevance_eval = _relevance_pieces[1].split("Evaluation: ")[1]

# Attach the critique to the datapoint record.
datapoint["relevance_score"] = relevance_score
datapoint["relevance_eval"] = relevance_eval

print(f"question: {question}")
print(f"relevance_score: {relevance_score}")
print(f"relevance_eval : {relevance_eval}")

question: Who can provide compensation for loss of income resulting from injuries sustained in a motor vehicle accident?

relevance_score: 5
relevance_eval : This question is useful for the Car Accident Legal Guide as it addresses an important aspect of car accident claims - compensation for loss of income. The answer explains who is generally responsible for providing such compensation and offers additional information about potential sources of compensation. This knowledge can help users understand their options and potential next steps in pursuing a claim.




7- Displaying the Generated Datapoint

In [9]:
# Wrap the single datapoint in a list so pandas builds a one-row DataFrame,
# then render it with the notebook's rich display.
ls = [datapoint]
df = pd.DataFrame(data=ls)
display(df)

Unnamed: 0,context,question,answer,source_doc,groundedness_score,groundedness_eval,relevance_score,relevance_eval
0,12\nJanuary 2023\nMotor Vehicle Accident Claim...,Who can provide compensation for loss of incom...,The first place to look for coverage for any l...,law.pdf,5,"Based on the provided context, the answer to t...",5,This question is useful for the Car Accident L...


In [10]:
# Convert the DataFrame to a datasets.Dataset and upload it to the Hub
# under the repo name "QAGeniusPresentation".
dataset = Dataset.from_pandas(df)
dataset.push_to_hub(repo_id="QAGeniusPresentation")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/555 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Ubaidbhat/QAGeniusPresentation/commit/788496406fe686de3b6c82c16be25ba9ef2ba8ca', commit_message='Upload dataset', commit_description='', oid='788496406fe686de3b6c82c16be25ba9ef2ba8ca', pr_url=None, pr_revision=None, pr_num=None)