In [None]:
#12.1.1 Criteria Evaluators

from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains import LLMChain
from langchain.evaluation import load_evaluator
from langchain.evaluation import EvaluatorType
import os

# Take the OpenAI key from the environment rather than hardcoding it in the
# notebook. OPENAI_API_KEY is deliberately not reassigned here, so a key that
# is already exported is never overwritten with a blank value.
api_key = os.environ.get('OPENAI_API_KEY', '')

# System prompt shared by every question sent through `chain`.
template = """You are a Helpful Assistant that explains everyhing being asked"""

system_message_prompt = SystemMessagePromptTemplate.from_template(template)
# The human turn is the raw question, substituted into {text}.
human_template = "{text}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Notebook-wide answer generator: later cells call chain.run(question).
chain = LLMChain(
    llm=ChatOpenAI(openai_api_key=api_key),
    prompt=chat_prompt,
)

def evaluate(criteria, sentences):
    """Answer each prompt with `chain` and grade the answer against a criterion.

    Args:
        criteria: Criterion passed to ``load_evaluator`` for the
            ``EvaluatorType.CRITERIA`` evaluator (e.g. ``'conciseness'``).
        sentences: Iterable of prompt strings; each one is answered and graded
            independently.

    Returns:
        None. For every prompt the function prints the prompt, the answer, the
        evaluator's reasoning, and the evaluator's ``value`` and ``score``.
    """
    def _one_sentence_per_line(text):
        # Split on '.' and keep every non-blank piece. Slicing off the last
        # piece instead would lose the final sentence whenever the text does
        # not end with a period (and lose everything if it has no period).
        pieces = text.replace('\n', '').split('.')
        return '\n'.join(piece for piece in pieces if piece.strip())

    evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria=criteria)

    for prompt in sentences:
        prediction = chain.run(prompt)
        eval_result = evaluator.evaluate_strings(
            prediction=prediction,
            input=prompt,
        )
        print('\nPROMPT : ', prompt)
        print('RESULT :\n', _one_sentence_per_line(prediction))
        print('REASON :\n', _one_sentence_per_line(eval_result['reasoning']))
        print('VALUE : ', eval_result['value'])
        print('SCORE : ', eval_result['score'])

# Demo runs: one (criterion, question) pair per call, in the order listed.
for criterion, question in (
    ('conciseness', 'Explain road not taken by Robert Frost'),
    ('creativity', 'What would happen if everyone becomes immortal?'),
):
    evaluate(criterion, [question])

Installing collected packages: mypy-extensions, marshmallow, jsonpointer, h11, typing-inspect, langsmith, jsonpatch, httpcore, langchain-core, httpx, dataclasses-json, openai, langchain
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0mSuccessfully installed dataclasses-json-0.6.3 h11-0.14.0 httpcore-1.0.2 httpx-0.25.2 jsonpatch-1.33 jsonpointer-2.4 langchain-0.0.348 langchain-core-0.0.12 langsmith-0.0.69 marshmallow-3.20.1 mypy-extensions-1.0.0 openai-1.3.8 typing-inspect-0.9.0

PROMPT :  Explain road not taken by Robert Frost
RESULT :
 "The Road Not Taken" is a famous poem written by Robert Frost
 It was first published in 1916 as part of Frost's collection of poems titled "Mountain Interval
" The poem is often interpreted as an expl

In [None]:
def evaluate(criteria, sentences, ground_truth):
    """Answer each prompt with `chain` and grade it against a reference answer.

    This definition replaces the earlier two-argument ``evaluate`` in the
    notebook namespace once the cell has run.

    Args:
        criteria: Criterion passed to ``load_evaluator`` for the
            ``"labeled_criteria"`` evaluator (e.g. ``'correctness'``).
        sentences: Sequence of prompt strings to answer and grade.
        ground_truth: Reference answers, indexed in step with ``sentences``
            (``ground_truth[i]`` is the reference for ``sentences[i]``).

    Returns:
        None. For every prompt the function prints the prompt, the answer, the
        evaluator's reasoning, and the evaluator's ``value`` and ``score``.

    Raises:
        IndexError: If ``ground_truth`` has fewer entries than ``sentences``.
    """
    def _one_sentence_per_line(text):
        # Split on '.' and keep every non-blank piece. Slicing off the last
        # piece instead would lose the final sentence whenever the text does
        # not end with a period (and lose everything if it has no period).
        pieces = text.replace('\n', '').split('.')
        return '\n'.join(piece for piece in pieces if piece.strip())

    evaluator = load_evaluator("labeled_criteria", criteria=criteria)

    for index, prompt in enumerate(sentences):
        prediction = chain.run(prompt)
        eval_result = evaluator.evaluate_strings(
            prediction=prediction,
            input=prompt,
            reference=ground_truth[index],
        )
        print('\nPROMPT : ', prompt)
        print('RESULT :\n', _one_sentence_per_line(prediction))
        print('REASON :\n', _one_sentence_per_line(eval_result['reasoning']))
        print('VALUE : ', eval_result['value'])
        print('SCORE : ', eval_result['score'])

# Grade one factual question against its expected answer.
evaluate(
    criteria='correctness',
    sentences=['Is Hockey a sports?'],
    ground_truth=['Yes'],
)

In [None]:
#12.1.2 Custom Evaluators

# Rubric for the scoring evaluator: one named metric whose text spells out
# what each score band from 0 to 10 means.
score_criteria = dict(
    custom_metric="""

0: Completely incorrect
1-3: Partially incorrect or minimally relevant
4-6: Mix of correct and incorrect information, with notable errors
7-8: Mostly accurate, with minor errors or omissions
9: Nearly flawless, with very minor errors or negligible missing details
10: Flawless, meeting or exceeding expectations"""
)
prompt = 'How does a life of a general human looks like? starting from being a baby to being old'

# GPT-4 acts as the grader; the answer itself still comes from `chain`.
grader_llm = ChatOpenAI(model="gpt-4", openai_api_key=api_key)
evaluator = load_evaluator(
    "labeled_score_string",
    criteria=score_criteria,
    llm=grader_llm,
)
prediction = chain.run(prompt)

# Hand-written reference answer the prediction is scored against.
# The backslash continuations sit inside the string literal, so each
# continuation line's leading spaces are part of the text.
reference_answer = "A human life follows a general trajectory from infancy to old age.\
    In infancy, rapid physical and cognitive development occurs, followed by early childhood where formal education begins.\
    Adolescence brings puberty and identity formation, leading to young adulthood marked by career and relationship pursuits.\
    Middle adulthood sees professional and family responsibilities, while late adulthood involves retirement,\
    reflection, and potential health challenges. Old age is characterized by increased dependency,\
    wisdom-sharing, and end-of-life considerations.\
    Individual experiences vary based on factors like culture and personal choices,\
    and improvements in healthcare can influence life trajectories."

eval_result = evaluator.evaluate_strings(
    prediction=prediction,
    reference=reference_answer,
    input=prompt,
)

# Break after every '.' so each sentence starts on its own line.
print('LLMs answer:', prediction.replace('.', '\n'))
print(eval_result['reasoning'].replace('.', '\n'))

LLMs answer: The life of a general human typically follows a series of stages and experiences, starting from being a baby and progressing through childhood, adolescence, adulthood, and finally old age
 Let's explore these stages in more detail:

1
 Babyhood: This stage begins at birth and lasts until around two years old
 Babies are completely dependent on their caregivers for their basic needs, such as feeding, diaper changes, and comfort
 They gradually develop motor skills, learn to communicate through sounds and gestures, and form attachments to their primary caregivers


2
 Childhood: Childhood spans from around two years old to adolescence
 During this stage, children grow physically, develop more advanced motor skills, and acquire language and cognitive abilities
 They begin to explore their surroundings, attend school, and develop social skills through interactions with peers
 Childhood is often characterized by curiosity, imagination, playfulness, and rapid psychological and e

In [None]:
# Two free-form criteria for the reference-free scoring evaluator.
custom_criteria = dict(
    Humor="The assistant's answer should have a sense of humor",
    Impact="What kind of lasting impression does the text leave on the reader?",
)
prompt = 'Tell a joke I can crack infront of teenagers'

evaluator = load_evaluator(
    "score_string",
    criteria=custom_criteria,
    llm=ChatOpenAI(openai_api_key=api_key),
)
prediction = chain.run(prompt)

# No reference answer here: the grader scores the joke on the criteria alone.
eval_result = evaluator.evaluate_strings(
    input=prompt,
    prediction=prediction,
)

# Break after every '.' so each sentence starts on its own line.
print('LLMs answer:', prediction.replace('.', '\n'))
print(eval_result['reasoning'].replace('.', '\n'))



LLMs answer: Sure, here's a joke that teenagers might enjoy:

Why don't scientists trust atoms?

Because they make up everything!
Explanation:
The assistant's response demonstrates a good sense of humor by providing a pun-style joke that teenagers might find amusing
 The joke plays on the double meaning of "make up" to create a humorous twist
 It leaves a light and playful impression on the reader


Rating: [[9]]


In [None]:
#12.2 Comparison Evaluators

from langchain.llms import HuggingFaceHub
from langchain.evaluation.comparison import (
    LabeledPairwiseStringEvalChain,
    PairwiseStringEvalChain,
)

# Judge model used to compare the two candidate answers.
llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", huggingfacehub_api_token='', model_kwargs={"temperature": 0})

# A reference answer is supplied below, so the labeled variant is used:
# the unlabeled PairwiseStringEvalChain does not take a reference into
# account when grading.
# NOTE(review): this rebinds the notebook-level `chain` that earlier cells
# call as chain.run(prompt); re-running those cells after this one will hit
# the comparison chain instead of the answer-generating chain.
chain = LabeledPairwiseStringEvalChain.from_llm(llm=llm)
result = chain.evaluate_string_pairs(
    input="What is 2+3?",
    prediction="five is the answer",
    prediction_b="If I add 2+3, I might get 6.",
    reference="5",
)
print(result)



In [None]:
#12.3 Trajectory Evaluators

from langchain.evaluation import load_evaluator

# Build the agent-trajectory evaluator on top of the `llm` created in the
# comparison section.
# NOTE(review): that `llm` is a HuggingFaceHub completion model; the
# trajectory evaluator may require a chat model — TODO confirm.
evaluator = load_evaluator("trajectory",llm=llm)

# NOTE(review): `result` is last assigned from chain.evaluate_string_pairs(...)
# in the comparison section, not from an agent run. The keys read below
# ("output", "input", "intermediate_steps") presumably belong to an agent
# executor result produced with intermediate steps returned — TODO confirm
# and supply that agent run before this cell.
evaluation_result = evaluator.evaluate_agent_trajectory(
    prediction=result["output"],
    input=result["input"],
    agent_trajectory=result["intermediate_steps"],
)
# Bare last expression so the notebook renders the evaluation result.
evaluation_result