Ragas

In [4]:
from ragas import SingleTurnSample,EvaluationDataset,evaluate
from ragas.metrics import (
    Faithfulness,ResponseRelevancy,LLMContextPrecisionWithReference,LLMContextRecall,NoiseSensitivity,ContextEntityRecall
)

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [25]:
import asyncio
import nest_asyncio

def run_async_function(cornjob):
    """Helper Function To Run Async Functions in Jupyter Notebooks"""
    try:
        loop=asyncio.get_event_loop()
        if loop.is_running():
            nest_asyncio.apply()
            return loop.run_until_complete(cornjob)
        else:
            return asyncio.run(cornjob)
    except RuntimeError:
        return asyncio.run(cornjob)

Generator

Faithfullness

In [38]:
test_context=[
    "LLMs are trained on large text datasets and do not have consciousness or personal beliefs."
]

test_response="No, LLMs do not have personal beliefs and only generate text based on learned patterns."



Generate Claims

In [8]:
prompt_template=ChatPromptTemplate.from_template(
    """Given the following response,extract all factual claims as a numbered list.
    Each claim should be single, verifiable statement
    
    Response:{test_response}
    Extract all Factual Claims
    """
)

llm=ChatOpenAI(model='gpt-3.5-turbo')

chain=prompt_template|llm|StrOutputParser()

result=chain.invoke({"test_response":test_response})
print(result)

1. LLMs do not have personal beliefs.
2. LLMs only generate text based on learned patterns.


Match Claims

In [None]:
claims=[
    "LLMs do not have personal beliefs.",
    "LLMs only generate text based on learned patterns.",
    "Today is a Sunny day"
]
prompt_template=ChatPromptTemplate.from_template("""
Given the Claim and Context,verify if the claim is Supported by the context

Claim:{Claim}
Context:{Context}

Answer with:
 - "SUPPORTED" if the context supports the claim
 - "NOT SUPPORTED" if the context doesn't support the claim

 Also give a Brief Explanation
 Verdict: 
"""
)
llm=ChatOpenAI(model='gpt-3.5-turbo')
chain=prompt_template|llm|StrOutputParser()

verification_results=[]

for claim in claims:
    result=chain.invoke(
        {"Claim":claim,"Context":test_context[0]}
    )
    is_supported="SUPPORTED" in result.upper() and "NOT SUPPORTED" not in result.upper()
    verification_results.append(
        {
            "claim":claim,
            "is_supported":is_supported,
            "Explanation":result
        }
    )

print(verification_results)




[{'claim': 'LLMs do not have personal beliefs.', 'is_supported': True, 'Explanation': 'SUPPORTED\n\nExplanation: The context clearly states that LLMs do not have consciousness or personal beliefs, meaning they do not possess the ability to hold personal beliefs. Therefore, the claim that LLMs do not have personal beliefs is supported by the context.'}, {'claim': 'LLMs only generate text based on learned patterns.', 'is_supported': True, 'Explanation': 'SUPPORTED\n\nExplanation: The context mentions that LLMs do not have consciousness or personal beliefs, indicating that they generate text based solely on learned patterns rather than personal experiences or opinions. This supports the claim that LLMs only generate text based on learned patterns.'}, {'claim': 'Today is a Sunny day', 'is_supported': False, 'Explanation': 'NOT SUPPORTED\n\nExplanation: The context provided is about LLMs being trained on text datasets and not having consciousness or personal beliefs. It does not mention any

Calculate Faithfullness

In [22]:
supported_claims=sum(1 for r in verification_results if r["is_supported"])
total_claims=len(verification_results)

faithfilness_score=supported_claims/total_claims
print(f"{faithfilness_score:.2f}")

0.67


Ragas Implementation

In [40]:
failthfulness_sample=SingleTurnSample(
    user_input="Do Large Language Models have personal beliefs?",
    response=test_response,
    retrieved_contexts=test_context
)

faithfulness_metric=Faithfulness(llm=llm)
faithfulness_score=run_async_function(faithfulness_metric.single_turn_ascore(failthfulness_sample))
print(faithfilness_score)



0.6666666666666666


In [49]:
faithfulness_examples = [
    {
        "name": "Perfect Faithfulness (No hallucinations)",
        "response": "The first Super Bowl was played on January 15, 1967 at the Los Angeles Memorial Coliseum.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum."]
    },
    {
        "name": "Partial Faithfulness (Some hallucinations)",
        "response": "The first Super Bowl was on January 15, 1967. The Green Bay Packers won 35-10 with Bart Starr as MVP.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967."]
    },
    {
        "name": "Zero Faithfulness (Complete hallucination)",
        "response": "The first Super Bowl was held in Miami in 1970 and attracted over 100,000 spectators.",
        "context": ["The First AFL-NFL World Championship Game was played on January 15, 1967, at the Los Angeles Memorial Coliseum."]
    }
]

faithfulness_metric=Faithfulness(llm=llm)
for items in faithfulness_examples:
    faithfulness_sample=SingleTurnSample(
        user_input="Tell me about the first Super Bowl",
        response=items["response"],
        retrieved_contexts=items["context"]
    )

    faithfulness_score=run_async_function(faithfulness_metric.single_turn_ascore(faithfulness_sample))

    print(items["name"])
    print(f"{faithfulness_score:.2f}")
    print('*'*50)

Perfect Faithfulness (No hallucinations)
0.50
**************************************************
Partial Faithfulness (Some hallucinations)
0.00
**************************************************
Zero Faithfulness (Complete hallucination)
0.00
**************************************************


Answer Relevance

Hypothetical Question Generation

In [69]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

question="Do Large Language Models have personal beliefs?"
llm_response="No, LLMs do not have personal beliefs and only generate text based on learned patterns."

prompt_template=ChatPromptTemplate.from_template(
    """
From the Answer given generate Hypothecial questions which are Questions to the answer.Strictly ensure that to all the questions generated
they should all have the same answer i.e the answer given

Answer:{Answer}

Construct 3 questions in a numbered manner like
1.
2.
3.
"""
)


llm=ChatOpenAI(model='gpt-3.5-turbo')
chain=prompt_template|llm|StrOutputParser()

result=chain.invoke({"Answer":llm_response})



Cosine SImilarity Formula

In [None]:
import numpy as np
from numpy.linalg import norm
def calculate_cosine_similarity(vec1,vec2):
    vec1=np.array(vec1)
    vec2=np.array(vec2)
    num=np.dot(vec1,vec2)
    dem=norm(vec1)*norm(vec2)
    return num/dem




0.0


Calculate Similarities Of the Original Question to the Questions generated

In [None]:
from langchain_openai import OpenAIEmbeddings
generated_questions=[
    "Do LLMs possess personal beliefs when generating text?",
    "Are LLMs only capable of generating text based on learned patterns?",
    "Can LLMs form their own personal opinions when generating text?"
]
embeddings=OpenAIEmbeddings(model='text-embedding-3-small')
original_embedding=embeddings.embed_query(question)

similarity_index=[]
for items in generated_questions:
    sim=embeddings.embed_query(items)
    cosine=calculate_cosine_similarity(original_embedding,sim)
    similarity_index.append(cosine)





[np.float64(0.5985796899687075), np.float64(0.4147325779462339), np.float64(0.46830935807152213)]


Mean of the Similarities

In [68]:
import numpy as np

a=np.mean(similarity_index)
print(a)

0.49387387532882115


Using the RAGS

In [95]:
response_relevancy_sample=SingleTurnSample(
    user_input=question,
    response=llm_response,
    retrieved_contexts=test_context
)

response_relevancy_metric=ResponseRelevancy(
    llm=llm,embeddings=embeddings
)

response_relevancy_score=run_async_function(response_relevancy_metric.single_turn_ascore(response_relevancy_sample))

print(response_relevancy_score)

0.5850632477874373


Retriever

Context Precision

In [87]:
query="What is an LLM?"
response="An LLM is an AI model trained on large text datasets to generate and understand human language."

chunks=[
    "The Eiffel Tower is located in Paris and was completed in 1889.",
    "Python lists are mutable data structures used to store collections of items.",
    "Large Language Models are trained on massive text data and can generate human-like responses.",
    "LLMs are commonly used for tasks like question answering and text generation."
]



In [83]:
prompt_template=ChatPromptTemplate.from_template(
"""
For the Given Question and Answer verify if the retrieved chunk is relevant or not
Question:{Question},
Answer:{Answer}
Chunk:{Chunk}
If the chunk is relevant give RELEVANT else give IRRELEVANT
"""
)

llm=ChatOpenAI(model='gpt-3.5-turbo')
chain=prompt_template|llm|StrOutputParser()
relevancy_matrix=[]
for chunk in chunks:
    result=chain.invoke(
        {
            "Question":query,
            "Answer":response,
            "Chunk":chunk
        }
    )
    is_relevant="RELEVANT" in result.upper() and "IRRELEVANT" not in result.upper()
    relevancy_matrix.append(is_relevant)

print(relevancy_matrix)

[False, False, True, True]


Precision count

In [84]:

precision_good=[]
relevant_count=0
for k,relevancy in enumerate(relevancy_matrix,1):
    if relevancy:
        relevant_count+=1
        precision_at_k=relevant_count/k
        precision_good.append(precision_at_k)



total_relevancy_matrix=sum(relevancy_matrix)

total_context_precision=sum(precision_good)/sum(relevancy_matrix) if total_relevancy_matrix>0 else 0
print(total_context_precision)

0.41666666666666663


Ragas

In [89]:
bad_sample=[
    "The Eiffel Tower is located in Paris and was completed in 1889.",
    "Python lists are mutable data structures used to store collections of items.",
    "Large Language Models are trained on massive text data and can generate human-like responses.",
    "LLMs are commonly used for tasks like question answering and text generation."
]

good_sample=[
    "Large Language Models are trained on massive text data and can generate human-like responses.",
    "LLMs are commonly used for tasks like question answering and text generation.",
    "The Eiffel Tower is located in Paris and was completed in 1889.",
    "Python lists are mutable data structures used to store collections of items."
]

context_precision_sample_good=SingleTurnSample(
    user_input=query,
    reference=response,
    retrieved_contexts=good_sample
)
context_precision_sample_bad=SingleTurnSample(
    user_input=query,
    reference=response,
    retrieved_contexts=bad_sample
)

context_precision_metric=LLMContextPrecisionWithReference(llm=llm)
good_result=run_async_function(context_precision_metric.single_turn_ascore(context_precision_sample_good))
bad_result=run_async_function(context_precision_metric.single_turn_ascore(context_precision_sample_bad))
print(good_result)
print(bad_result)

0.99999999995
0.4166666666458333


Conext Recall

In [91]:
query="Tell me about Eiffel Tower"
recall_reference="The Eifel Tower is Located in Paris.It was buit in 1889.It is 330 meters tall"

recall_context=[
    "The Eifel Tower is a Landmark Located in Paris,France",
    "The Tower was completed in 1889or the World's Fair"
]
recall_claims=[
    "The Eifel Tower is Located in Paris",
    "It was buit in 1889",
    "It is 330 meters tall"
]
prompt_template=ChatPromptTemplate.from_template(
    """
    Check if the the Following Claims can be attributed to the Context
    Claim:{Claim}
    Context:{Context}

    if the Claim is supported by the Context gives "YES" if not then gine "NO"
    """
)

chain=prompt_template|llm|StrOutputParser()
combined_context="\n".join(recall_context)
claims=[]
for claim in recall_claims:
    result=chain.invoke({
        "Claim":claim,
        "Context":combined_context
    })
    is_supported="YES" in result.upper() and "NO" not in result.upper()
    claims.append(is_supported)
    print(f"Claim is {claim} and is {is_supported}")


Claim is The Eifel Tower is Located in Paris and is True
Claim is It was buit in 1889 and is True
Claim is It is 330 meters tall and is False


Ragas

In [93]:
context_recall_sample=SingleTurnSample(
    user_input=query,
    reference=recall_reference,
    retrieved_contexts=recall_context
)

context_recall_metric=LLMContextRecall(llm=llm)
context_recall_score=run_async_function(context_recall_metric.single_turn_ascore(context_recall_sample))

print(context_recall_score)

0.6666666666666666


Context Entity Recall

In [117]:
entity_reference = "William Shakespeare wrote many famous plays such as Hamlet and Romeo and Juliet in England during the late 16th and early 17th century."
entity_context = [
    "William Shakespeare was an English playwright known for his influential works in literature."
]

prompt_template=ChatPromptTemplate.from_template(
"""
Extract all named entities from the following Text.
Include: PERSON,NAME,DATE,LOCATION and other proper nouns
Text:{Text}
List each entity on a new line with its type:
"""
)

chain = prompt_template|llm|StrOutputParser()
reference_result=chain.invoke({"Text":entity_reference})
context_result=chain.invoke({"Text":entity_context[0]})
print(reference_result)
print('@'*50)
print(context_result)

PERSON: William Shakespeare
LOCATION: England
LOCATION: 16th
LOCATION: 17th century
PLAY: Hamlet
PLAY: Romeo and Juliet
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
PERSON: William Shakespeare
LOCATION: English
DATE: N/A
WORK_OF_ART: literature


In [115]:
reference_entities = {
    "William Shakespeare": "PERSON",
    "Hamlet": "PERSON",
    "Romeo": "PERSON",
    "Juliet": "PERSON",
    "England": "LOCATION",
    "late 16th and early 17th century": "DATE"
}

context_entities = {
    "William Shakespeare": "PERSON",
    "English": "LOCATION",
    "playwright": "PERSON"
}


common_entities_reference_context=reference_entities.keys() and context_entities.keys()
common_entities_reference_context_size=len(common_entities_reference_context)
reference_entities_size=len(reference_entities.keys())
context_entity_recall=common_entities_reference_context_size/reference_entities_size

print(context_entity_recall)




0.5


Ragas

In [119]:
context_entity_recall_sample=SingleTurnSample(
    reference=entity_reference,
    retrieved_contexts=entity_context
)

context_entity_recall_metric=ContextEntityRecall(llm=llm)
context_entity_recall=run_async_function(context_entity_recall_metric.single_turn_ascore(context_entity_recall_sample))
print(context_entity_recall)

0.1999999996


Noise Reduction

In [121]:
noise_question = "What is LIC known for?"
noise_response = "LIC is the largest insurance company in India, known for its vast portfolio. LIC contributes to financial stability."
noise_reference = "LIC is the largest insurance company in India, established in 1956. It is known for managing a large portfolio of investments."

noise_contexts = [
    "LIC was established in 1956 following nationalization.",      
    "LIC is the largest insurance company with huge investments.",     
    "LIC manages substantial funds for financial stability.",           
    "The Indian economy is one of the fastest-growing economies..."     
]

In [122]:
from langchain_core.prompts import ChatMessagePromptTemplate

prompt_template=ChatPromptTemplate.from_template(
"""
For the reference given Below verify if the a given context directly support the reference or not

reference:{reference}
context:{context}

return TRUE if the context directly supports the reference or FALSE if it doesnot directly supports the reference
"""
)
chain=prompt_template|llm|StrOutputParser()
for context in noise_contexts:
    result=chain.invoke(
        {
            "reference":noise_reference,
            "context":context
        }
    )
    print(f"Context{context} {result}")

ContextLIC was established in 1956 following nationalization. True
ContextLIC is the largest insurance company with huge investments. TRUE
ContextLIC manages substantial funds for financial stability. TRUE
ContextThe Indian economy is one of the fastest-growing economies... FALSE


In [130]:
response_claims = [
    ("LIC is the largest insurance company in India", True, "Matches reference"),
    ("LIC is known for its vast portfolio", True, "Matches reference (portfolio)"),
    ("LIC contributes to financial stability", False, "NOT in reference - possible hallucination from noise!")
]

incorrect_count = 0
for claim in response_claims:
    if not claim[1]:
        incorrect_count+=1

noise=incorrect_count/len(response_claims)
print(noise)

0.3333333333333333


RAGAS

In [133]:
noise_sample=SingleTurnSample(
    user_input="What is LIC known for?",
    response="LIC is the largest insurance company in India, known for its vast portfolio. LIC contributes to financial stability.",
    reference="LIC is the largest insurance company in India, established in 1956. It is known for managing a large portfolio of investments.",
    retrieved_contexts=noise_contexts
)

noise_metric=NoiseSensitivity(llm=llm)
noise=run_async_function(noise_metric.single_turn_ascore(noise_sample))

print(noise)

0.3333333333333333
