In [42]:
#Imports
import config 
import shutil
import requests
from urllib.parse import urlparse
import config
import sys
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from langchain_core.language_models import BaseChatModel
import json
import datasets
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models import ChatHuggingFace
import os
import random
import time

In [43]:
pdfPath = config.pdfPath
hf_token = config.hf_token
model_id = config.model_id
title = config.title
file_path = config.file_path
##ensuring the config.py variables are set
if pdfPath is None:
    raise ValueError("pdfPath is None. Please set the  pdf path in config.py.")

if hf_token is None:
    raise ValueError("hf_token is None. Please set the huggingFace token in config.py.")

if model_id is None:
    raise ValueError("model_id is None. Please set the model_id in config.py.")

if title is None:
    raise ValueError("title is None. Please set the title of the Pdf in config.py.")

if file_path is None:
    raise ValueError("file_path is None. Please set the local file_path in config.py.")


os.environ['HUGGINGFACEHUB_API_TOKEN'] = hf_token

In [44]:
#loading the hugginFace LLM
llm = HuggingFaceHub(
    repo_id= model_id,
    task="text-generation",
    # huggingfacehub_api_token = hf_token,
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)
chat_model = ChatHuggingFace(llm=llm, token = False)

                    repo_id was transferred to model_kwargs.
                    Please confirm that repo_id is what you intended.
                    task was transferred to model_kwargs.
                    Please confirm that task is what you intended.
                    huggingfacehub_api_token was transferred to model_kwargs.
                    Please confirm that huggingfacehub_api_token is what you intended.


In [46]:
##1- Text File Loading
loader = PyPDFLoader(pdfPath)

In [None]:
##STEP 1 and 2
try:
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,  
        chunk_overlap=200,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    langchain_docs = loader.load_and_split(text_splitter=text_splitter)
except Exception as e:
    print("An error occurred:", e)

In [8]:
##step 3 ------> splits
print(len(langchain_docs))

37


In [18]:
context = random.choice(langchain_docs)
print(context)

page_content='5\nJanuary 2023\nThird party liability coverage is mandatory for all drivers in Alberta. Each \nautomobile must be insured for third party liability to a minimum of $200,000.\nWhat is Third Party Liability Insurance?\nThird party liability insurance is designed to protect you (the insured) from financial \nruin should you be responsible for an accident, and cause injury to others or \ndamage to their property. Liability coverage is mandatory to ensure that accident \nvictims have access to funds to compensate them for losses suffered in accidents. \nHow it Works\nWhen you buy third party liability insurance, if you are responsible for an accident \nthat causes loss (personal injury or property damage), your insurance company \nagrees to defend you against claims and to pay amounts you might owe to the \nother party, up to the limit of the policy. Many people choose to top up their \ninsurance policies above the provincial minimum to protect themselves from \ndamages claim

In [21]:
##step 4 ---------> 
##Question Answer Generation LLM Agent
from langchain.prompts import ChatPromptTemplate
QA_generation_prompt = """
Your task is to develop factoid questions and  answers from  a given context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)



Now here is the context.

Context: {context}\n
Output:::"""

QA_generation_prompt = ChatPromptTemplate.from_template(QA_generation_prompt)
QA_generation_agent = QA_generation_prompt | chat_model
output_QA_couple = QA_generation_agent.invoke({"context": context.page_content}).content
print("Output from the LLM to the INPUT prompt-----------------")
print(output_QA_couple)


<s>[INST] 
Your task is to develop factoid questions and  answers from  a given context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)



Now here is the context.

Context: page_content='5\nJanuary 2023\nThird party liability coverage is mandatory for all drivers in Alberta. Each \nautomobile must be insured for third party liability to a minimum of $200,000.\nWhat is Third Party Liability Insurance?\nThird party liability insurance is designed to protect you (the insured) from financial \nruin should you be responsible for an accident, and cause injury to others or \ndam

In [32]:
#Exatracting the question and Answer from the Model Output
question = output_QA_couple.split("Factoid question: ")[2].split("Answer: ")[0]
answer = output_QA_couple.split("Answer: ")[2]
datapoint ={ "context": context.page_content,
             "question": question,
             "answer": answer,
             "source_doc": context.metadata["source"],
            }
print(f"question Generated : {question}")
print(f"answer Generated : {answer}")

question Generated : What is the minimum third party liability coverage required for drivers in Alberta?

answer Generated : The minimum third party liability coverage required for drivers in Alberta is $200,000.


In [23]:
##STEP 5--------------
## groundedness_agent
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_groundedness_critique_prompt = ChatPromptTemplate.from_template(
    question_groundedness_critique_prompt
)
question_groundedness_critique_agent = question_groundedness_critique_prompt | chat_model

# Critique the generated QA couple
question_groundedness_evaluation = question_groundedness_critique_agent.invoke(
    {"context": context.page_content, "question": question}
).content
print(question_groundedness_evaluation)

<s>[INST] 
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here are the question and context.

Question: What is the minimum third party liability coverage required for drivers in Alberta?


Context: 5
January 2023
Third party liability coverage is mandatory for all drivers in Alberta. Each 
automobile must be insured for third party liability to a minimum of $200,000.
What is Third Party Liability Insurance?
Third party liability insurance is designed to protect you (the insured) from financial 
ruin should you be responsible for an accident, 

In [33]:
##Extracting the Evaluation and Evaluation Score, this can cause error 
groundedness_score = int(question_groundedness_evaluation.split("Total rating: ")[2][0])
groundedness_eval = question_groundedness_evaluation.split("Total rating: ")[1].split(
            "Evaluation: "
    )[1]
datapoint.update(
            {
                "groundedness_score": groundedness_score,
                "groundedness_eval": groundedness_eval,
            }
        )
print(f"groundedness_score: {groundedness_score}")
print(f"groundedness_eval: {groundedness_eval}")

groundedness_score: 5
groundedness_eval: (The context clearly and unambiguously answers the question.)




In [35]:
# step 6----------
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to  {title}\n
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here is the question.

Question: {question}\n
Answer::: """
question_relevance_critique_prompt = ChatPromptTemplate.from_template(
    question_relevance_critique_prompt
)
question_relevance_critique_agent = question_relevance_critique_prompt | chat_model
question_relevance_evaluation = question_relevance_critique_agent.invoke(
        {"title": title, "question": question}
    ).content
print(question_relevance_evaluation)

<s>[INST] 
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to  Car Accident Legal Guide

Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating)
Total rating: (your rating)

Now here is the question.

Question: What is the minimum third party liability coverage required for drivers in Alberta?


Answer:::  [/INST] Evaluation: This question is extremely useful for a Car Accident Legal Guide as it relates to the legal requirements for drivers in Alberta. Knowing the minimum third-party liability coverage required by law is essential information for drivers to ensure they are in compliance with the law and to understand the level of coverage they have in the event of an accident. This information is also important for victims of car accidents to det

In [36]:
#extracting the relevance_score and eval
relevance_score = int(question_relevance_evaluation.split("Total rating: ")[2][0])
relevance_eval = question_relevance_evaluation.split("Total rating: ")[1].split(
            "Evaluation: "
)[1]
datapoint.update(
            {

                "relevance_score": relevance_score,
                "relevance_eval": relevance_eval,
            }
        )
print(f"relevance_score: {relevance_score}")
print(f"relevance_eval : {relevance_eval}")

relevance_score: 5
relevance_eval : This question is extremely useful for a Car Accident Legal Guide as it relates to the legal requirements for drivers in Alberta. Knowing the minimum third-party liability coverage required by law is essential information for drivers to ensure they are in compliance with the law and to understand the level of coverage they have in the event of an accident. This information is also important for victims of car accidents to determine if the at-fault driver has sufficient insurance coverage to compensate them for their injuries and damages.




In [38]:
generated_questions = pd.DataFrame.from_dict(list(datapoint))

In [41]:
generated_questions

Unnamed: 0,0
0,context
1,question
2,answer
3,source_doc
4,groundedness_score
5,groundedness_eval
6,relevance_score
7,relevance_eval
