In [64]:
import configparser
config = configparser.ConfigParser()
config.read('config.ini')

import os
os.environ["OPENAI_API_KEY"] = config['API']['OPENAI_API_KEY']

from deepeval.metrics import *
from deepeval.test_case import *

from deepeval.benchmarks import MMLU, HellaSwag
from deepeval.benchmarks.tasks import MMLUTask, HellaSwagTask

from langchain_community.llms import GPT4All
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.output_parsers import StrOutputParser

input = "The dog chased the cat up the tree, who ran up the tree?"
expected_output = "cat."
local_path = './models/nous-hermes-llama2-13b.Q4_0.gguf'


model = ChatOpenAI(
    verbose=True,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0.7,
)
llm_chain = (                     
    PromptTemplate(input_variables=["question"], template=input)
    | model 
    | StrOutputParser()
)
actual_output = llm_chain.invoke({"question": ""})

# model = GPT4All(
#     model=local_path,
#     # callbacks=[ChainStreamHandler(g)],
#     streaming=True,
#     verbose=True,
# )
# llm_chain = PromptTemplate(input_variables=["text"], template=input) | model
# actual_output = llm_chain.invoke({"text": ""})
# _res

The cat ran up the tree.

In [31]:
mmlu_benchmark = MMLU().load_benchmark_dataset(MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE)
data = [{'input': mmlu.input, 'expected_output': mmlu.expected_output} for mmlu in mmlu_benchmark]
data[:3], len(data)

([{'input': 'Let x = 1. What is x << 3 in Python 3?\nA. 1\nB. 3\nC. 8\nD. 16\nAnswer:',
   'expected_output': 'C'},
  {'input': 'In Python 3, which of the following function convert a string to an int in python?\nA. int(x [,base])\nB. long(x [,base] )\nC. float(x)\nD. str(x)\nAnswer:',
   'expected_output': 'A'},
  {'input': "A user enters a Web address in a browser, and a request for a file is sent to a Web server. Which of the following best describes how the file is sent to the user?\nA. The file is broken into packets for transmission. The packets must be reassembled upon receipt.\nB. The file is broken into packets for transmission. The user's browser must request each packet in order until all packets are received.\nC. The server attempts to connect directly to the user's computer. If the connection is successful, the entire file is sent. If the connection is unsuccessful, an error message is sent to the user.\nD. The server repeatedly attempts to connect directly to the user's c

In [29]:
# Define benchmark with specific tasks and shots
# benchmark = MMLU()
hella_benchmark = HellaSwag().load_benchmark_dataset(HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES)
data = [{'input': hella.input, 'expected_output': hella.expected_output} for hella in hella_benchmark]
data[:3], len(data)

([{'input': 'A man is shown working in an outdoor garden. he\nA. is using tools as he works.\nB. uses a large tool to bark out a few new leaves.\nC. uses trowels and strikes a tree.\nD. uses a sponge and a trowel to scrub down the side of a tree.\nAnswer:',
   'expected_output': 'A'},
  {'input': 'A man is standing outside turning on his hedge trimmer turning it around to make the blades go. He then walks over to the hedges and begins to cut down the leaves on it. as he\nA. cuts it asia walks towards the lawn.\nB. is cutting by himself, a car drives by that is parked over in the yard.\nC. is cutting it, he begins to walk into the hedges to cut the hedge evenly and shaking the cut leaves off with his hand.\nD. is doing his mowing he trips and falls down.\nAnswer:',
   'expected_output': 'C'},
  {'input': 'A man has climbed a large ladder outside. he\nA. is using trimmers to cut and trim large trees.\nB. ropes and lures a cow into the open.\nC. is using it to pull himself up onto a platf

In [46]:
metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' indicates the given 'expected output'",
        "It does not matter whether the 'actual output' has full context or not.",
        "If indicating to correct 'expected output' answer must be OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
) 

template = """
    You are a wise and precise chatbot.
    With the below given questions, you will have to solve the question and answer the question instructed with several options to select
    {question}
    Provide only the letter corresponding to your answer (e.g., "A", "B", "C", etc.).
"""

llm_chain = (                     
    PromptTemplate(input_variables=["question"], template=template)
    | model 
    | StrOutputParser()
)

test_result = []
for evaluate in data:
    # actual_output = llm_chain.invoke({"question": "It depends, some might consider the cat, while others might argue the dog."})
    actual_output = llm_chain.invoke({"question": evaluate['input']})
    # actual_output = "A"

    test_case = LLMTestCase(
        input=evaluate['input'],
        actual_output=actual_output,
        expected_output=evaluate['expected_output']
    )

    metric.measure(test_case)
    result = {'metric_score' : metric.score, 'metric_reason': metric.reason}
    test_result.append(result)

test_result

D. 16

A. int(x [,base])

A. The file is broken into packets for transmission. The packets must be reassembled upon receipt.

A.

B. How many programming statements the program contains

D. 256

A. Do students majoring in computer science tend to have higher grade point averages than students majoring in other subjects? 

Answer: A

B. Only elements that appear in both inputList1 and inputList2

A.

A

B.

C.

C.

B.

C. Lossless compression

A. isupper()

C. (num MOD 2 ) = 0

D

D

A. Each packet contains data to be transmitted, along with metadata containing information used for routing the data.

C

B. aab

B. //

C.

B. I and II only

D. 10 times as many items can be uniquely identified.

C

C. num1 > num2 && num1 > num3

B.

A. Yes

A. An overflow error occurred.

C. O(n^2)

D. Interchanging line 7 and line 8

B. The ability to provide credibility to the information distributed

D

B. 1

A. 1001 0100

A. **

C. E7_{16}

D.

B

C. [786, 2.23]

B. Changing line 3 to a ← b + 10

B. 8

C. Both of the above.

C. 2

C. 24

A. a[i] == max

B. Generally, the advantage of using a binary search over a linear search increases as the size of the list increases.

C. O(log log N)

A. The song was saved using fewer bits per second than the original song.

C. max(list)

A. O(1)

C.

D. heads_counter = 2

A. Authentication

B. Changing line 6 to IF(myList[item] = val)

D

C. Queue | Dictionary/map | Stack

A. 1

D. 4

B. The ability to provide data transmission even when some connections have failed

B. Decimal 11, Binary 1100, Hexadecimal D

D.

A. Customers’ personal information could be compromised if an unauthorized individual gains access to the call session database.

C. b

B. Algorithm I always works correctly, but Algorithm II only works correctly when the maximum value is not the first value in the list.

B. 8

D. 5

B

B. 3y

A. top-down development

C. 10

D

B. FindName (["Andrea", "Ben" ], "Diane" )

C. strB.compareTo(strC) < 0 && strB.compareTo(strA) > 0

C. The procedure returns true when the initial value of number is even, and it otherwise returns false.

B. nextAvailableID

C. 250

D.

A. ['Hi!', 'Hi!', 'Hi!', 'Hi!']

C. Terabyte ( 2^40 or approximately 10^12 bytes)

C.

B. Chemistry

C. The number of computers launching the attack

A. A group of cookies stored by the user's Web browser

B. An Internet Protocol (IP) address is assigned to the device.

B

C. {1,2,3,4}

B. Which colors are more popular among men than women

C. 256

C. (int) (Math.random() * (high - low + 1)) + low

C. [11,15,19]

D. During run time

D.

D. 3 2

B. Determining the likelihood that the photo was taken at a particular public event

B. Chemistry

D.

C.

[{'metric_score': 1.0000000000000002,
  'metric_reason': 'The actual output correctly identifies that x << 3 results in 16, as indicated by option D.'},
 {'metric_score': 1.0,
  'metric_reason': "The actual output correctly identifies 'int(x [,base])' which matches the expected answer."},
 {'metric_score': 1.0,
  'metric_reason': 'The actual output correctly identifies that the file is broken into packets for transmission and needs to be reassembled upon receipt, which matches the expected answer.'},
 {'metric_score': 0.0,
  'metric_reason': "The Actual Output 'A' does not indicate a lossless transformation, as compressing the image may lose information."},
 {'metric_score': 1.0,
  'metric_reason': "The actual output correctly identifies 'B. How many programming statements the program contains' as the least likely consideration affecting the program's ability to process larger data sets."},
 {'metric_score': 0.030572072589472032,
  'metric_reason': "The actual output 'D. 256' is incorr

In [51]:
import statistics

len(test_result), test_result
avg = statistics.mean([eval['metric_score'] for eval in test_result])
print(f"모델: ChatOpenAI\nGEval 평가 평균: {avg}")

모델: ChatOpenAI
GEval 평가 평균: 0.7908556096172605
