In [1]:
import os
from getpass import getpass

# Never hardcode the key in the notebook, and never clobber a key that is
# already exported with an empty string: reuse OPENAI_API_KEY when it is set,
# otherwise ask for it interactively without echoing it into the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

## I. String Evaluators

### 1. Criteria Evaluation

In [2]:
from langchain.evaluation import Criteria

List of available criteria:

In [3]:
# Enumerate every member of the Criteria enum and print them as one list.
print([criterion for criterion in Criteria])

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

#### a. Conciseness

In [4]:
from langchain.evaluation import load_evaluator

In [7]:
# Criteria evaluator configured to grade a single criterion: conciseness.
evaluator = load_evaluator(
    "criteria",
    criteria="conciseness",
)

In [8]:
# A one-line question paired with a multi-sentence biography as the answer.
eval_result = evaluator.evaluate_strings(
    input="Who is the president of United States?",
    prediction=(
        "Joe Biden is an American politician who is the 46th and current "
        "president of the United States. Born in Scranton, Pennsylvania on "
        "November 20, 1942, Biden moved with his family to Delaware in 1953. "
        "He graduated from the University of Delaware before earning his law "
        "degree from Syracuse University. He was elected to the New Castle "
        "County Council in 1970 and to the U.S. Senate in 1972."
    ),
)
print(eval_result)

{'reasoning': 'The criterion for this task is conciseness. The question asked is "Who is the president of United States?" The answer provided does correctly identify the current president of the United States as Joe Biden. However, the submission goes beyond simply identifying the president and provides additional information about Biden\'s life, education, and political career. While this information is accurate and relevant to Biden, it is not necessary to answer the question asked. Therefore, the submission is not concise and to the point, and does not meet the criterion.\n\nN', 'value': 'N', 'score': 0}


#### b. Correctness

In [5]:
# Labeled variant of the criteria evaluator: a reference is passed alongside
# the prediction when judging correctness.
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
)

# The reference describes a hypothetical scenario that disagrees with the
# prediction.
eval_result = evaluator.evaluate_strings(
    reference=(
        "In a hypothetical future, lunar scientists discovered an astonishing "
        "phenomenon—a subterranean river beneath the Moon's surface"
    ),
    prediction="There is no evidence of river on the Moon",
    input="Is there any river on the moon?",
)
print(eval_result)

{'reasoning': "The criterion for this task is the correctness of the submission. This means the submitted answer should be accurate, factual, and correct in response to the given input.\n\nThe input asks if there is any river on the moon. The submitted answer states that there is no evidence of a river on the Moon. \n\nThe reference information talks about a hypothetical future where lunar scientists discovered a subterranean river beneath the Moon's surface. However, this is a hypothetical scenario and not a factual statement. \n\nTherefore, the submission is correct as it aligns with the current known facts about the Moon. The hypothetical scenario in the reference does not invalidate the correctness of the submission.\n\nSo, the submission meets the criteria.\n\nY", 'value': 'Y', 'score': 1}


#### c. Custom Criteria

In [17]:
from langchain.evaluation import EvaluatorType

In [14]:
# Custom rubric: each key names a criterion and each value is the yes/no
# question that describes it.
custom_criteria = dict(
    numeric="Does the output contain numeric information?",
    mathematical="Does the output contain mathematical information?",
)

In [15]:
# Request given to the model.
prompt = "Tell me a joke"

# Answer to be graded. The text starts and ends with a newline and has one
# blank line between the set-up and the punchline.
output = (
    "\n"
    "Why did the mathematician break up with his girlfriend?\n"
    "\n"
    'Because she had too many "irrational" issues!\n'
)

In [18]:
# Grade the joke against the whole custom rubric in one evaluation.
eval_chain = load_evaluator(EvaluatorType.CRITERIA, criteria=custom_criteria)
eval_result = eval_chain.evaluate_strings(input=prompt, prediction=output)

# Banner and result go on separate lines.
print(
    "===================== Multi-criteria evaluation =====================",
    eval_result,
    sep="\n",
)

{'reasoning': 'First, let\'s assess the first criterion: Does the output contain numeric information? The joke does not contain any numbers, percentages, or any other form of numeric data. Therefore, the submission does not meet the first criterion.\n\nNext, let\'s assess the second criterion: Does the output contain mathematical information? The joke does mention a mathematician and the term "irrational" which is a mathematical term. Therefore, the submission does meet the second criterion.\n\nHowever, for the submission to meet all criteria, it must satisfy both. Since it does not meet the first criterion, the submission does not meet all the criteria.\n\nN', 'value': 'N', 'score': 0}


### 2. Embedding Distance

In [9]:
# Load the evaluator named "embedding_distance"; the following cells call it
# with a prediction and a reference string.
evaluator = load_evaluator("embedding_distance")

In [None]:
# NOTE: this cell has no execution count and repeats the call made in the
# following cell.
evaluator.evaluate_strings(
    reference="Total return is 4.25 Cr",
    prediction="Total Profit is 04.25 Cr",
)

In [6]:
# Compare two phrasings of the same figure ("Profit"/"return", "04.25"/"4.25").
evaluator.evaluate_strings(
    reference="Total return is 4.25 Cr",
    prediction="Total Profit is 04.25 Cr",
)

{'score': 0.06977808876949698}

**List of distance metrics**

In [23]:
from langchain.evaluation import EmbeddingDistance
# Display every member of the EmbeddingDistance enum.
[metric for metric in EmbeddingDistance]

[<EmbeddingDistance.COSINE: 'cosine'>,
 <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,
 <EmbeddingDistance.MANHATTAN: 'manhattan'>,
 <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,
 <EmbeddingDistance.HAMMING: 'hamming'>]

### 3. Exact Match

In [34]:
from langchain.evaluation import ExactMatchStringEvaluator
# Exact-match evaluator built with its default settings; the next cell feeds it
# two strings that differ only in letter case.
exact_match_evaluator = ExactMatchStringEvaluator()

In [35]:
# The two strings differ only in the capitalisation of "science".
exact_match_evaluator.evaluate_strings(
    reference="Data science",
    prediction="Data Science",
)

{'score': 0}

### 4. Evaluating Structured Output: JSON Evaluators

#### a. JsonValidityEvaluator

In [38]:
from langchain.evaluation import JsonValidityEvaluator

# Validity check on a well-formed JSON object; only a prediction is passed.
evaluator = JsonValidityEvaluator()
prediction = '{"name": "Hari", "Exp": 5, "Location": "Pune"}'
result = evaluator.evaluate_strings(
    prediction=prediction,
)
print(result)

{'score': 1}


In [39]:
# Same payload, but with a trailing comma before the closing brace.
prediction = '{"name": "Hari", "Exp": 5, "Location": "Pune",}'
result = evaluator.evaluate_strings(
    prediction=prediction,
)
print(result)

{'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes: line 1 column 47 (char 46)'}


#### b. JsonEqualityEvaluator

In [40]:
from langchain.evaluation import JsonEqualityEvaluator

# Equality check between two JSON documents whose "Exp" values differ
# (2 versus 2.5).
evaluator = JsonEqualityEvaluator()
result = evaluator.evaluate_strings(
    reference='{"Exp": 2.5}',
    prediction='{"Exp": 2}',
)
print(result)

{'score': False}


#### c. JsonEditDistanceEvaluator

In [41]:
from langchain.evaluation import JsonEditDistanceEvaluator

# Edit distance between two JSON documents that differ by a single character
# in the "name" value ("Ram" versus "Rama").
evaluator = JsonEditDistanceEvaluator()
result = evaluator.evaluate_strings(
    reference='{"name": "Rama", "Exp": 2}',
    prediction='{"name": "Ram", "Exp": 2}',
)
print(result)

{'score': 0.043478260869565216}


#### d. JsonSchemaEvaluator

In [44]:
from langchain.evaluation import JsonSchemaEvaluator

# Schema evaluator built with its default settings; the next cell passes the
# JSON Schema to validate against as the `reference` argument.
evaluator = JsonSchemaEvaluator()

In [47]:
# The schema requires "Exp" to be an integer of at least 5, while the
# prediction carries "Exp": 4.
result = evaluator.evaluate_strings(
    reference=(
        '{"type": "object", "properties": {'
        '"name": {"type": "string"},'
        '"Exp": {"type": "integer", "minimum": 5}'
        "}}"
    ),
    prediction='{"name": "Rama", "Exp": 4}',
)
print(result)

{'score': False, 'reasoning': "<ValidationError: '4 is less than the minimum of 5'>"}


### 5. Regex Match

In [48]:
from langchain.evaluation import RegexMatchStringEvaluator
# Regex evaluator built with its default settings; the next cell passes the
# pattern to match as the `reference` argument.
evaluator = RegexMatchStringEvaluator()

In [49]:
# The pattern looks for a YYYY-MM-DD shaped token anywhere in the prediction.
# It is written as a raw string, so the backslashes need no doubling.
evaluator.evaluate_strings(
    reference=r".*\b\d{4}-\d{2}-\d{2}\b.*",
    prediction="Joining date is 2021-04-26",
)

{'score': 1}

### 6. Scoring Evaluator

In [50]:
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

In [51]:
# Scoring rubric for the "accuracy" criterion. The description starts with a
# newline, lists one anchor score per line and has no trailing newline.
accuracy_criteria = dict(
    accuracy=(
        "\n"
        "Score 1: The answer is completely unrelated to the reference.\n"
        "Score 3: The answer has minor relevance but does not align with the reference.\n"
        "Score 5: The answer has moderate relevance but contains inaccuracies.\n"
        "Score 7: The answer aligns with the reference but has minor errors or omissions.\n"
        "Score 10: The answer is completely accurate and aligns perfectly with the reference."
    )
)

In [52]:
# Reference-based scoring evaluator: grades with the rubric in
# `accuracy_criteria`, using a GPT-4 chat model as the grader.
evaluator = load_evaluator(
    "labeled_score_string",
    llm=ChatOpenAI(model="gpt-4"),
    criteria=accuracy_criteria,
)

In [53]:
# Case 1 - the prediction paraphrases the reference and keeps every detail.
eval_result = evaluator.evaluate_strings(
    input="Where are my socks?",
    reference="The socks are in the third drawer in the dresser",
    prediction="You can find them in the dresser's third drawer.",
)
print(eval_result)

{'reasoning': "The assistant's response is accurate and correctly tells the user where their socks are located - in the third drawer of the dresser. The phrasing is slightly different from the reference but the meaning and content are the same. Therefore, the response deserves a high score. Rating: [[10]].", 'score': 10}


In [54]:
# Case 2 - the prediction names the dresser but leaves out the drawer.
eval_result = evaluator.evaluate_strings(
    input="Where are my socks?",
    reference="The socks are in the third drawer in the dresser",
    prediction="You can find them in the dresser.",
)
print(eval_result)

{'reasoning': "The AI's response is partially correct as it does mention the dresser which is included in the reference answer. However, it lacks the specific detail of the socks being in the third drawer of the dresser. Its answer is therefore incomplete and not as accurate as it could be. Rating: [[7]]", 'score': 7}


In [55]:
# Case 3 - the prediction names a different location than the reference.
eval_result = evaluator.evaluate_strings(
    input="Where are my socks?",
    reference="The socks are in the third drawer in the dresser",
    prediction="You can find them in the dog's bed.",
)
print(eval_result)

{'reasoning': "The assistant's response does not align with the ground truth at all. According to the ground truth, the socks are in the third drawer in the dresser, not in the dog's bed as suggested by the assistant. Therefore, the response is completely inaccurate. Rating: [[1]].", 'score': 1}


### 7. String Distance

In [5]:
from langchain.evaluation import load_evaluator
# Load the evaluator named "string_distance"; the next cell calls it with a
# prediction and a reference string.
evaluator = load_evaluator("string_distance")

In [6]:
# The prediction is the reference with the extra leading word "Senior".
evaluator.evaluate_strings(
    reference="Data Scientist",
    prediction="Senior Data Scientist",
)

{'score': 0.23015873015873023}

## II. Comparison Evaluators

### 1. Pairwise string comparison

In [58]:
from langchain.evaluation import load_evaluator

# Load the evaluator named "labeled_pairwise_string"; the next cell uses it to
# compare two candidate answers to one question, with a reference supplied.
evaluator = load_evaluator("labeled_pairwise_string")

In [62]:
# Two candidate answers to the same question, judged against the reference
# "Seven".
evaluator.evaluate_string_pairs(
    input="how many days in a week?",
    reference="Seven",
    prediction="there are 5 days",
    prediction_b="7",
)

{'reasoning': "Assistant A's response is incorrect. There are not 5 days in a week. Assistant B's response, while brief, is accurate. There are indeed 7 days in a week. Therefore, Assistant B's response is more helpful, relevant, correct, and factual. The depth of thought is not applicable in this case as the question is straightforward and factual. \n\nFinal verdict: [[B]]",
 'value': 'B',
 'score': 0}

### 2. Pairwise embedding distance

In [63]:
from langchain.evaluation import load_evaluator

# Load the evaluator named "pairwise_embedding_distance"; the next cell calls
# it with two predictions and no reference.
evaluator = load_evaluator("pairwise_embedding_distance")

In [66]:
# The two predictions differ in one adjective ("hot" versus "warm") and in the
# trailing full stop.
evaluator.evaluate_string_pairs(
    prediction_b="Rajasthan is warm in June.",
    prediction="Rajasthan is hot in June",
)

{'score': 0.02634930736279606}