In [110]:
import math
import openai
import os
import sklearn.metrics
import random
import time
from langchain.llms import OpenAI

In [125]:
# Credentials. "sk-xxxx" is only a placeholder: export OPENAI_API_KEY in the shell that
# launches the notebook instead of pasting a real key here. setdefault keeps a key that is
# already present in the environment rather than clobbering it with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxxx")
# Give the raw openai SDK the same key LangChain reads from the environment, so both
# client paths authenticate identically.
openai.api_key = os.environ["OPENAI_API_KEY"]
# Fixed seed for the `random` module so anything sampled later is repeatable.
random.seed(0)

In [126]:
from task_utils import TASKS, load_data, load_prompt, generate_prompts

In [1]:
## your LLM stack goes here


# example of inference function for OpenAI models
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
# Chat client used by call_llm_openai. It is built as soon as this cell runs, and ChatOpenAI
# checks for an OpenAI API key at construction time, so the key must be configured first.
GPT_TURBO = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.5,
    max_tokens=600,
)

def call_llm_openai(prompt):
    """Send `prompt` to the GPT_TURBO chat client as one human message.

    Returns the `content` attribute of whatever the client hands back.
    """
    conversation = [HumanMessage(content=prompt)]
    reply = GPT_TURBO(conversation)
    return reply.content


# example of inference function for modal hosted models
import requests

def call_llm_modal(prompt, timeout=300):
    """Query the Modal-hosted model over HTTP and return the completion text.

    The prompt is POSTed as JSON under the 'question' key and the reply is read
    from the 'output' key of the JSON response.

    Args:
        prompt: full prompt text to send.
        timeout: seconds to wait for the endpoint before giving up. Without a
            timeout, `requests` waits indefinitely on a stalled endpoint.

    Returns:
        The response text with its first len(prompt) characters removed and
        surrounding whitespace stripped.

    Raises:
        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    r = requests.post(
        'https://xxxxxxxxx.modal.run',
        json={'question': prompt},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as a model answer.
    r.raise_for_status()
    output_dict = r.json()
    output = output_dict['output']
    # NOTE(review): assumes the endpoint echoes the prompt at the start of 'output' — confirm
    # against the deployment; otherwise this slice cuts into the real completion.
    completion = output[len(prompt):].strip()
    return completion


# example of inference function for baseten hosted models
import baseten
# Baseten deployment looked up by the version id "ZBMDm4q"; call_llm_baseten sends its
# prediction requests through this object's `predict` method.
MODEL = baseten.deployed_model_version_id("ZBMDm4q")

def call_llm_baseten(prompt):
    """Run `prompt` through the Baseten deployment bound to MODEL.

    Sampling is enabled and generation is capped at 300 new tokens. The returned
    string is the 'generated_text' field with its first len(prompt) characters
    dropped and surrounding whitespace stripped.
    """
    request = {"prompt": prompt, "do_sample": True, "max_new_tokens": 300}
    response = MODEL.predict(request)
    generated_text = response['data']['generated_text']
    # presumably the generated text starts with an echo of the prompt — verify against the deployment
    return generated_text[len(prompt):].strip()


# submission for finetuned openai model
import openai

def call_llm_finetuned_openai(prompt):
    """Query the fine-tuned davinci completion model and return its raw text.

    The request is deterministic (temperature 0.0) and limited to 5 completion
    tokens.

    Args:
        prompt: full prompt text to complete.

    Returns:
        The text of the first completion choice, or "" when the request fails
        (for example because the prompt exceeds the model's context window).
        Failures are best-effort: they are reported on stdout and do not abort
        the caller's loop.
    """
    try:
        output = openai.Completion.create(
            model="davinci:ft-personal-2023-06-27-15-35-52",
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            top_p=1.0,
            frequency_penalty=0.5,
        )
        return output.choices[0].text
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so Ctrl-C / SystemExit still stop
        # the run, and print the real cause: an auth or rate-limit failure used to be
        # reported as "Token is too long." just like a context-length error.
        print(f"OpenAI completion failed ({type(err).__name__}): {err}")
        return ""
    


# submission for open source finetuned baseten model
from baseten.models import FlanT5
# baseten.login(os.environ["BASETEN_API_KEY"])  # read the key from the environment; never commit it

# FlanT5 client for the deployment with model id "ZBMDm4q"; call_llm_finetuned_baseten calls it.
model = FlanT5(model_id="ZBMDm4q")

# Generation options forwarded on every call made by call_llm_finetuned_baseten.
gen_kwargs = dict(
    temperature=0,
    repetition_penalty=1.2,
)

def call_llm_finetuned_baseten(prompt):
    """Generate with the module-level FlanT5 `model` and return its first result.

    The call asks for at most 512 new tokens with early stopping and forwards
    everything in `gen_kwargs` as additional keyword arguments.
    """
    generations = model(prompt, max_new_tokens=512, early_stopping=True, **gen_kwargs)
    first_generation = generations[0]
    return first_generation

### Configure below which LLM to use

# `evaluate` sends every prompt through `call_llm`, so rebinding this name to another of the
# call_llm_* functions defined above switches the backend being evaluated.
call_llm = call_llm_finetuned_openai


ValidationError: 1 validation error for ChatOpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)

In [None]:
def evaluate(tasks: list, tasks_dir: str):
    """Run `call_llm` on every prompt of each task and score the answers.

    For each task the data and the "base_prompt.txt" template are loaded, one
    prompt per row is generated, and the stripped model output is compared for
    exact equality with the row's 'answer'. Per-task balanced accuracy and the
    mean over all tasks are printed as they are computed.

    Args:
        tasks: task names to evaluate.
        tasks_dir: directory forwarded to `load_data` and `load_prompt`.

    Returns:
        dict mapping each task name to a dict that holds one record per
        datapoint id ('prompt', 'generated_output', 'correct_output', 'success')
        plus the task's score under the 'balanced_accuracy' key. An empty
        `tasks` list yields an empty dict.
    """
    report = dict()
    for task in tasks:
        # NOTE(review): prompts are generated from, and scored against, the train split;
        # the second frame returned by load_data is never used — confirm this is intended.
        train_df, _test_df = load_data(task=task, tasks_dir=tasks_dir)
        prompt_template = load_prompt(prompt_name="base_prompt.txt", task=task, tasks_dir=tasks_dir)
        prompts = generate_prompts(prompt_template=prompt_template, data_df=train_df)
        task_report = report[task] = dict()
        targets = list()
        outputs = list()
        print('task', task)
        for prompt, (datapoint_id, row) in zip(prompts, train_df.iterrows()):
            output = call_llm(prompt).strip()
            expected = row['answer']
            targets.append(expected)
            outputs.append(output)
            task_report[datapoint_id] = {
                'prompt': prompt,
                'generated_output': output,
                'correct_output': expected,
                'success': output == expected,
            }
        task_report['balanced_accuracy'] = sklearn.metrics.balanced_accuracy_score(targets, outputs)
        print('task balanced accuracy:', task_report['balanced_accuracy'])
        print()

    # A NaN task score counts as 0 in the overall mean.
    scores = [report[task]['balanced_accuracy'] for task in tasks]
    scores = [0 if math.isnan(score) else score for score in scores]
    # Guard the mean so an empty task list reports 0.0 instead of raising ZeroDivisionError.
    total = sum(scores) / len(scores) if scores else 0.0
    print('Total Balanced Accuracy:', total)

    return report


In [None]:
# Directory passed to `evaluate` as `tasks_dir` (relative path).
tasks_dir = '../legalbench'

# warnings are to be expected

# Reseed immediately before sampling so that, for an unchanged TASKS list, the same 5 tasks
# are drawn every time this cell runs.
random.seed(1)
report = evaluate(tasks=random.sample(TASKS, 5), tasks_dir=tasks_dir)

task maud_change_in_law:__subject_to_"disproportionate_impact"_modifier


INFO:openai:error_code=None error_message="This model's maximum context length is 2049 tokens, however you requested 2491 tokens (2486 in your prompt; 5 for the completion). Please reduce your prompt; or completion length." error_param=None error_type=invalid_request_error message='OpenAI API error received' stream_error=False


task balanced accuracy: 0.0

task maud_fiduciary_exception:__board_determination_standard
task balanced accuracy: 0.0

task contract_nli_notice_on_compelled_disclosure
task balanced accuracy: 0.0

task diversity_6
task balanced accuracy: 0.0

task opp115_third_party_sharing_collection
task balanced accuracy: 0.0

task opp115_data_retention
task balanced accuracy: 0.0

task maud_cor_standard_(superior_offer)
task balanced accuracy: 0.0

task learned_hands_crime


INFO:openai:error_code=None error_message="This model's maximum context length is 2049 tokens, however you requested 2091 tokens (2086 in your prompt; 5 for the completion). Please reduce your prompt; or completion length." error_param=None error_type=invalid_request_error message='OpenAI API error received' stream_error=False


task balanced accuracy: 0.0

task maud_tail_period_length
task balanced accuracy: 0.0

task maud_"financial_point_of_view"_is_the_sole_consideration
task balanced accuracy: 0.0

task supply_chain_disclosure_best_practice_certification
task balanced accuracy: 0.0

task cuad_uncapped_liability
task balanced accuracy: 0.0

task opp115_policy_change


task balanced accuracy: 0.0

task cuad_irrevocable_or_perpetual_license
