In [1]:
from pydantic import Field
from deepeval.dataset import Golden
from deepeval.test_case import LLMTestCase

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from benchmarq.utility import Evaluator
from benchmarq.experiment import Experiment

from dotenv import load_dotenv

load_dotenv()

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# noinspection PyPep8Naming
def generate(prompt: str) -> str:
    return generator(prompt)


class Test(Evaluator):
    def __init__(self):
        super().__init__()

    def evaluate_consumption(self, input: Golden):
        generate(input.input)

    def evaluate_test_case(self, input: Golden) -> LLMTestCase:
        output = generate(f"{input.context}: {input.input}")[0]["generated_text"]
        print(output)
        return LLMTestCase(input=input.input, expected_output=input.expected_output, actual_output=output, context=input.context, retrieval_context=input.retrieval_context)

experiment = Experiment(
    subquestion_id="test",
    subquestion_path="experiments/test/tests.json",
    name="name",
    description="A very long description",
    settings=Test())

a=experiment.run()


Device set to use mps:0
[codecarbon INFO @ 15:58:26] [setup] RAM Tracking...
[codecarbon INFO @ 15:58:26] [setup] GPU Tracking...
[codecarbon INFO @ 15:58:26] No GPU found.
[codecarbon INFO @ 15:58:26] [setup] CPU Tracking...
[codecarbon INFO @ 15:58:27] CPU Model on constant consumption mode: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz
[codecarbon INFO @ 15:58:27] >>> Tracker's metadata:
[codecarbon INFO @ 15:58:27]   Platform system: macOS-15.3.1-x86_64-i386-64bit
[codecarbon INFO @ 15:58:27]   Python version: 3.11.0
[codecarbon INFO @ 15:58:27]   CodeCarbon version: 2.2.2
[codecarbon INFO @ 15:58:27]   Available RAM : 16.000 GB
[codecarbon INFO @ 15:58:27]   CPU count: 16
[codecarbon INFO @ 15:58:27]   CPU model: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz
[codecarbon INFO @ 15:58:27]   GPU count: None
[codecarbon INFO @ 15:58:27]   GPU model: None
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation

[]: Something went wrong. There was something wrong with our system. It was not helping. (pause)

[19:03] Guest: How could we possibly tell that from scratch if it's a separate operating system?

[
[]: Lucy is the girl who left when the girls woke up to find the city blocked by zombies. She is also the key to the world's map that unlocks the city on a first look.


Event loop is already running. Applying nest_asyncio patch to allow async execution...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evaluating 2 test case(s) in parallel: |██████████|100% (2/2) [Time Taken: 00:10,  5.15s/test case]



Metrics Summary

  - ❌ Correctness (GEval) (score: 0.0017986207395942124, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output and expected output have completely unrelated content with no factual alignment, failing all evaluation steps., error: None)
  - ❌ Succinctness (GEval) (score: 0.002197388505878706, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The actual output contains multiple sentences instead of exactly one, failing to meet the criteria., error: None)

For test case:

  - input: Something
  - actual output: []: Something went wrong. There was something wrong with our system. It was not helping. (pause)

[19:03] Guest: How could we possibly tell that from scratch if it's a separate operating system?

[
  - expected output: in the way she moves.
  - context: []
  - retrieval context: []


Metrics Summary

  - ❌ Correctness (GEval) (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output




In [None]:
import os
import pandas as pd

a=pd.read_csv("/Users/leenlaptop/Documents/repos/greenai/green.ai/experiments/test/inputs.csv")

print(a)

In [2]:
experiment.create_subquestion_json()

{'subquestion_id': 'test',
 'subquestion_metrics_path': 'experiments/test/tests.json',
 'experiments': [{'id': '8c30631fad934d5da404ad35ac98468e',
   'name': 'name',
   'description': 'A very long description',
   'settings': {},
   'runs': [{'consumption_results': {'cloud_provider': '',
      'cloud_region': '',
      'codecarbon_version': '2.2.2',
      'country_iso_code': 'NLD',
      'country_name': 'The Netherlands',
      'cpu_count': 16,
      'cpu_energy': 0.00010849921256303787,
      'cpu_model': 'Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz',
      'cpu_power': 22.5,
      'duration': 17.361709117889404,
      'emissions': 4.551006183701586e-05,
      'emissions_rate': 2.621289271004002e-06,
      'energy_consumed': 0.00013742910931507747,
      'experiment_id': '1',
      'gpu_count': None,
      'gpu_energy': 0,
      'gpu_model': None,
      'gpu_power': 0.0,
      'latitude': 51.5542,
      'longitude': 5.0661,
      'on_cloud': 'N',
      'os': 'macOS-15.3.1-x86_64-i386-64b

In [None]:
a.toJSON()