In [1]:
import math
import openai
import os
import sklearn.metrics
import random
from langchain.llms import OpenAI

In [2]:
# Never store a real key in the notebook: reuse OPENAI_API_KEY from the
# environment when it is already set, otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
from task_utils import TASKS, load_data, load_prompt, generate_prompts

In [4]:
## your LLM stack goes here


from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# Shared chat-model client used by call_llm for every prompt.
# NOTE(review): temperature is non-zero, so generations presumably vary from
# run to run — confirm this is wanted for a benchmark evaluation.
GPT_TURBO = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.5,
    max_tokens=600,
)


def call_llm(prompt):
    """Send `prompt` to GPT_TURBO as a single human message and return the reply text.

    Args:
        prompt: Text used as the content of the one HumanMessage sent to the model.

    Returns:
        The ``content`` attribute of the object returned by GPT_TURBO.
    """
    messages = [HumanMessage(content=prompt)]
    reply = GPT_TURBO(messages)
    return reply.content
    

In [15]:
def evaluate(tasks: list, tasks_dir: str):
    """Run each task's base prompt through `call_llm` and score exact-match answers.

    For every task this loads the data and the ``base_prompt.txt`` template,
    generates one prompt per row of the train split, sends each prompt to
    `call_llm`, and compares the stripped output with the row's ``'answer'``.

    Args:
        tasks: Task names to evaluate.
        tasks_dir: Directory passed through to `load_data` and `load_prompt`.

    Returns:
        A dict mapping each task to a dict holding one record per datapoint id
        (``prompt``, ``generated_output``, ``correct_output``, ``success``)
        plus a ``'balanced_accuracy'`` entry for that task.

    Side effects:
        Prints the mean balanced accuracy over `tasks`, counting NaN scores
        as 0. Prints ``nan`` when `tasks` is empty.
    """
    report = dict()
    for task in tasks:
        # NOTE(review): only the train split is scored; the second frame
        # returned by load_data is unused — confirm this is intended.
        train_df, _test_df = load_data(task=task, tasks_dir=tasks_dir)
        prompt_template = load_prompt(prompt_name="base_prompt.txt", task=task, tasks_dir=tasks_dir)
        prompts = generate_prompts(prompt_template=prompt_template, data_df=train_df)

        # NOTE(review): per-datapoint records and the 'balanced_accuracy' entry
        # share this dict, so a datapoint id equal to 'balanced_accuracy'
        # would be overwritten — verify ids can never take that value.
        task_report = dict()
        report[task] = task_report
        targets = list()
        outputs = list()
        for prompt, (datapoint_id, row) in zip(prompts, train_df.iterrows()):
            output = call_llm(prompt).strip()
            answer = row['answer']
            targets.append(answer)
            outputs.append(output)
            task_report[datapoint_id] = {
                'prompt': prompt,
                'generated_output': output,
                'correct_output': answer,
                'success': output == answer,
            }
        task_report['balanced_accuracy'] = sklearn.metrics.balanced_accuracy_score(targets, outputs)

    # NaN per-task scores count as 0 in the mean; guard the empty-task case so
    # the function returns an empty report instead of dividing by zero.
    scores = [report[task]['balanced_accuracy'] for task in tasks]
    usable_scores = [0 if math.isnan(score) else score for score in scores]
    mean_score = sum(usable_scores) / len(usable_scores) if usable_scores else math.nan
    print('Balanced Accuracy:', mean_score)

    return report


In [16]:
tasks_dir = '../legalbench'

# Draw the task subset from a dedicated, seeded RNG so the same tasks are
# selected on every run; the sample size is clamped so a short TASKS list
# cannot make random.sample raise ValueError.
SEED = 42
N_TASKS = 10
sampled_tasks = random.Random(SEED).sample(TASKS, min(N_TASKS, len(TASKS)))

# warnings are to be expected

report = evaluate(tasks=sampled_tasks, tasks_dir=tasks_dir)



Balanced Accuracy: 0.7579761904761904
