In [2]:
import configparser
import statistics
config = configparser.ConfigParser()
config.read('config.ini')

import os
os.environ["OPENAI_API_KEY"] = config['API']['OPENAI_API_KEY']

from deepeval.metrics import *
from deepeval.test_case import *

from deepeval.benchmarks import *
from deepeval.benchmarks.tasks import *

from langchain_community.llms import GPT4All
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.output_parsers import StrOutputParser

input = "The dog chased the cat up the tree, who ran up the tree?"
expected_output = "cat."
local_path = './models/nous-hermes-llama2-13b.Q4_0.gguf'


model = ChatOpenAI(
    verbose=True,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0.7,
)
llm_chain = (                     
    PromptTemplate(input_variables=["question"], template=input)
    | model 
    | StrOutputParser()
)
actual_output = llm_chain.invoke({"question": ""})

# model = GPT4All(
#     model=local_path,
#     # callbacks=[ChainStreamHandler(g)],
#     streaming=True,
#     verbose=True,
# )
# llm_chain = PromptTemplate(input_variables=["text"], template=input) | model
# actual_output = llm_chain.invoke({"text": ""})
# _res

  from .autonotebook import tqdm as notebook_tqdm
  warn_deprecated(


The cat ran up the tree.

In [9]:
mmlu_benchmark = MMLU().load_benchmark_dataset(MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE)
data = [{'input': mmlu.input, 'expected_output': mmlu.expected_output} for mmlu in mmlu_benchmark]
data[:3], len(data)

([{'input': 'Let x = 1. What is x << 3 in Python 3?\nA. 1\nB. 3\nC. 8\nD. 16\nAnswer:',
   'expected_output': 'C'},
  {'input': 'In Python 3, which of the following function convert a string to an int in python?\nA. int(x [,base])\nB. long(x [,base] )\nC. float(x)\nD. str(x)\nAnswer:',
   'expected_output': 'A'},
  {'input': "A user enters a Web address in a browser, and a request for a file is sent to a Web server. Which of the following best describes how the file is sent to the user?\nA. The file is broken into packets for transmission. The packets must be reassembled upon receipt.\nB. The file is broken into packets for transmission. The user's browser must request each packet in order until all packets are received.\nC. The server attempts to connect directly to the user's computer. If the connection is successful, the entire file is sent. If the connection is unsuccessful, an error message is sent to the user.\nD. The server repeatedly attempts to connect directly to the user's c

In [3]:
benchmarks = [
    'APPLYING_SUNSCREEN',
    'TRIMMING_BRANCHES_OR_HEDGES',
    'DISC_DOG',
    'WAKEBOARDING',
    'SKATEBOARDING',
    'WATERSKIING',
    'WASHING_HANDS',
    'SAILING',
    'PLAYING_CONGAS',
    'BALLET',
    'ROOF_SHINGLE_REMOVAL',
    'HAND_CAR_WASH',
    'KITE_FLYING',
    'PLAYING_POOL',
    'PLAYING_LACROSSE',
    'LAYUP_DRILL_IN_BASKETBALL',
    'HOME_AND_GARDEN',
    'PLAYING_BEACH_VOLLEYBALL',
    'CALF_ROPING',
    'SCUBA_DIVING',
    'MIXING_DRINKS',
    'PUTTING_ON_SHOES',
    'MAKING_A_LEMONADE',
    'UNCATEGORIZED',
    'ZUMBA',
    'PLAYING_BADMINTON',
    'PLAYING_BAGPIPES',
    'FOOD_AND_ENTERTAINING',
    'PERSONAL_CARE_AND_STYLE',
    'CRICKET',
    'SHOVELING_SNOW',
    'PING_PONG',
    'HOLIDAYS_AND_TRADITIONS',
    'ICE_FISHING',
    'BEACH_SOCCER',
    'TABLE_SOCCER',
    'SWIMMING',
    'BATON_TWIRLING',
    'JAVELIN_THROW',
    'SHOT_PUT',
    'DOING_CRUNCHES',
    'POLISHING_SHOES',
    'TRAVEL',
    'USING_UNEVEN_BARS',
    'PLAYING_HARMONICA',
    'RELATIONSHIPS',
    'HIGH_JUMP',
    'MAKING_A_SANDWICH',
    'POWERBOCKING',
    'REMOVING_ICE_FROM_CAR',
    'SHAVING',
    'SHARPENING_KNIVES',
    'WELDING',
    'USING_PARALLEL_BARS',
    'HOME_CATEGORIES',
    'ROCK_CLIMBING',
    'SNOW_TUBING',
    'WASHING_FACE',
    'ASSEMBLING_BICYCLE',
    'TENNIS_SERVE_WITH_BALL_BOUNCING',
    'SHUFFLEBOARD',
    'DODGEBALL',
    'CAPOEIRA',
    'PAINTBALL',
    'DOING_A_POWERBOMB',
    'DOING_MOTOCROSS',
    'PLAYING_ICE_HOCKEY',
    'PHILOSOPHY_AND_RELIGION',
    'ARCHERY',
    'CARS_AND_OTHER_VEHICLES',
    'RUNNING_A_MARATHON',
    'THROWING_DARTS',
    'PAINTING_FURNITURE',
    'HAVING_AN_ICE_CREAM',
    'SLACKLINING',
    'CAMEL_RIDE',
    'ARM_WRESTLING',
    'HULA_HOOP',
    'SURFING',
    'PLAYING_PIANO',
    'GARGLING_MOUTHWASH',
    'PLAYING_ACCORDION',
    'HORSEBACK_RIDING',
    'PUTTING_IN_CONTACT_LENSES',
    'PLAYING_SAXOPHONE',
    'FUTSAL',
    'LONG_JUMP',
    'LONGBOARDING',
    'POLE_VAULT',
    'BUILDING_SANDCASTLES',
    'PLATFORM_DIVING',
    'PAINTING',
    'SPINNING',
    'CARVING_JACK_O_LANTERNS',
    'BRAIDING_HAIR',
    'YOUTH',
    'PLAYING_VIOLIN',
    'CANOEING',
    'CHEERLEADING',
    'PETS_AND_ANIMALS',
    'KAYAKING',
    'CLEANING_SHOES',
    'KNITTING',
    'BAKING_COOKIES',
    'DOING_FENCING',
    'PLAYING_GUITARRA',
    'USING_THE_ROWING_MACHINE',
    'GETTING_A_HAIRCUT',
    'MOOPING_FLOOR',
    'RIVER_TUBING',
    'CLEANING_SINK',
    'GROOMING_DOG',
    'DISCUS_THROW',
    'CLEANING_WINDOWS',
    'FINANCE_AND_BUSINESS',
    'HANGING_WALLPAPER',
    'ROPE_SKIPPING',
    'WINDSURFING',
    'KNEELING',
    'GETTING_A_PIERCING',
    'ROCK_PAPER_SCISSORS',
    'SPORTS_AND_FITNESS',
    'BREAKDANCING',
    'WALKING_THE_DOG',
    'PLAYING_DRUMS',
    'PLAYING_WATER_POLO',
    'BMX',
    'SMOKING_A_CIGARETTE',
    'BLOWING_LEAVES',
    'BULLFIGHTING',
    'DRINKING_COFFEE',
    'BATHING_DOG',
    'TANGO',
    'WRAPPING_PRESENTS',
    'PLASTERING',
    'PLAYING_BLACKJACK',
    'FUN_SLIDING_DOWN',
    'WORK_WORLD',
    'TRIPLE_JUMP',
    'TUMBLING',
    'SKIING',
    'DOING_KICKBOXING',
    'BLOW_DRYING_HAIR',
    'DRUM_CORPS',
    'SMOKING_HOOKAH',
    'MOWING_THE_LAWN',
    'VOLLEYBALL',
    'LAYING_TILE',
    'STARTING_A_CAMPFIRE',
    'SUMO',
    'HURLING',
    'PLAYING_KICKBALL',
    'MAKING_A_CAKE',
    'FIXING_THE_ROOF',
    'PLAYING_POLO',
    'REMOVING_CURLERS',
    'ELLIPTICAL_TRAINER',
    'HEALTH',
    'SPREAD_MULCH',
    'CHOPPING_WOOD',
    'BRUSHING_TEETH',
    'USING_THE_POMMEL_HORSE',
    'SNATCH',
    'CLIPPING_CAT_CLAWS',
    'PUTTING_ON_MAKEUP',
    'HAND_WASHING_CLOTHES',
    'HITTING_A_PINATA',
    'TAI_CHI',
    'GETTING_A_TATTOO',
    'DRINKING_BEER',
    'SHAVING_LEGS',
    'DOING_KARATE',
    'PLAYING_RUBIK_CUBE',
    'FAMILY_LIFE',
    'ROLLERBLADING',
    'EDUCATION_AND_COMMUNICATIONS',
    'FIXING_BICYCLE',
    'BEER_PONG',
    'IRONING_CLOTHES',
    'CUTTING_THE_GRASS',
    'RAKING_LEAVES',
    'PLAYING_SQUASH',
    'HOPSCOTCH',
    'INSTALLING_CARPET',
    'POLISHING_FURNITURE',
    'DECORATING_THE_CHRISTMAS_TREE',
    'PREPARING_SALAD',
    'PREPARING_PASTA',
    'VACUUMING_FLOOR',
    'CLEAN_AND_JERK',
    'COMPUTERS_AND_ELECTRONICS',
    'CROQUET'
]

In [4]:
import pandas as pd

data_list = []
for idx, b in enumerate(benchmarks):
    data_list += HellaSwag().load_benchmark_dataset(HellaSwagTask[b])

data = [{'input': obj.input, 'expected_output': obj.expected_output} for obj in data_list]
pd.DataFrame(data).to_csv('HellaSwagData.csv', encoding='utf-8-sig')

Filter: 100%|██████████| 10042/10042 [00:00<00:00, 54218.35 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 50195.51 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 55186.19 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 55788.79 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 57056.78 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 55481.98 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 53989.28 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 56734.91 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 57041.33 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 54562.72 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 53976.97 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 55568.87 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 54080.30 examples/s]
Filter: 100%|██████████| 10042/10042 [00:00<00:00, 56100.44 exam

In [7]:
data = [HellaSwag().load_benchmark_dataset(HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES), HellaSwag().load_benchmark_dataset(HellaSwagTask.BATON_TWIRLING)]

In [14]:
len(data[1])

14

In [4]:
# Define benchmark with specific tasks and shots
# benchmark = MMLU()
hella_benchmark = HellaSwag().load_benchmark_dataset(HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES)
data = [{'input': hella.input, 'expected_output': hella.expected_output} for hella in hella_benchmark]
data[:3], len(data)

([{'input': 'A man is shown working in an outdoor garden. he\nA. is using tools as he works.\nB. uses a large tool to bark out a few new leaves.\nC. uses trowels and strikes a tree.\nD. uses a sponge and a trowel to scrub down the side of a tree.\nAnswer:',
   'expected_output': 'A'},
  {'input': 'A man is standing outside turning on his hedge trimmer turning it around to make the blades go. He then walks over to the hedges and begins to cut down the leaves on it. as he\nA. cuts it asia walks towards the lawn.\nB. is cutting by himself, a car drives by that is parked over in the yard.\nC. is cutting it, he begins to walk into the hedges to cut the hedge evenly and shaking the cut leaves off with his hand.\nD. is doing his mowing he trips and falls down.\nAnswer:',
   'expected_output': 'C'},
  {'input': 'A man has climbed a large ladder outside. he\nA. is using trimmers to cut and trim large trees.\nB. ropes and lures a cow into the open.\nC. is using it to pull himself up onto a platf

In [5]:
TruthfulQA().load_benchmark_dataset(TruthfulQATask.ADVERTISING, TruthfulQAMode.MC1)

TypeError: TruthfulQA.load_benchmark_dataset() missing 1 required positional argument: 'mode'

In [None]:
hella_benchmark = TruthfulQA().load_benchmark_dataset(TruthfulQATask.TRIMMING_BRANCHES_OR_HEDGES)
data = [{'input': hella.input, 'expected_output': hella.expected_output} for hella in hella_benchmark]
data[:3], len(data)

In [12]:
metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' indicates the given 'expected output'",
        "It does not matter whether the 'actual output' has full context or not.",
        "If indicating to correct 'expected output' answer must be OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
) 

benchmark = HumanEval(
    tasks=[HumanEvalTask.HAS_CLOSE_ELEMENTS, HumanEvalTask.SORT_NUMBERS],
    n=100
)

metric = HallucinationMetric(threshold=0.5)

template = """
    You are a wise and precise chatbot.
    With the below given questions, you will have to solve the question and answer the question instructed with several options to select
    {question}
    Provide only the letter corresponding to your answer (e.g., "A", "B", "C", etc.).
"""

llm_chain = (                     
    PromptTemplate(input_variables=["question"], template=template)
    | model 
    | StrOutputParser()
)

test_result = []
for evaluate in data:
    # actual_output = llm_chain.invoke({"question": "It depends, some might consider the cat, while others might argue the dog."})
    actual_output = llm_chain.invoke({"question": evaluate['input']})
    # actual_output = "A"

    test_case = LLMTestCase(
        input=evaluate['input'],
        actual_output=actual_output,
        expected_output=evaluate['expected_output']
    )

    metric.measure(test_case)
    result = {'metric_score' : metric.score, 'metric_reason': metric.reason}
    test_result.append(result)
    break

test_result

A. is using tools as he works.

[{'metric_score': 1.0,
  'metric_reason': 'The actual output directly indicates the correct expected output answer A, confirming he is using tools as he works.'}]

In [13]:
len(test_result), test_result
avg = statistics.mean([eval['metric_score'] for eval in test_result])
print(f"모델: ChatOpenAI\nGEval 평가 평균: {avg}")

모델: ChatOpenAI
GEval 평가 평균: 1.0


In [15]:
import pandas as pd
pd.DataFrame(test_result).to_csv('result.csv', encoding='utf-8-sig')


Unnamed: 0,metric_score,metric_reason
0,1.0,The actual output directly indicates the corre...


: 