In [1]:
import math
import openai
import os
import sklearn.metrics
import random
import time
from langchain.llms import OpenAI

In [30]:
# Prefer an OPENAI_API_KEY that is already exported in the environment so the
# secret never has to be pasted into the notebook; the "sk-" placeholder is only
# used as a fallback when no key is set.
os.environ.setdefault("OPENAI_API_KEY", "sk-")
openai.api_key = os.environ["OPENAI_API_KEY"]
# Seed Python's RNG so any random sampling done later in the notebook is repeatable.
random.seed(0)

In [31]:
from task_utils import TASKS, load_data, load_prompt, generate_prompts

In [35]:
## your LLM stack goes here


# example of inference function for OpenAI models
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
# Module-level chat model handle used by call_llm_openai
# (gpt-3.5-turbo, temperature=0.5, max_tokens=600).
GPT_TURBO = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5, max_tokens=600)

def call_llm_openai(prompt):
    """Send `prompt` to GPT_TURBO as a single human message and return the reply text."""
    messages = [HumanMessage(content=prompt)]
    reply = GPT_TURBO(messages)
    return reply.content


# example of inference function for modal hosted models
import requests

def call_llm_modal(prompt, timeout=300):
    """Query the Modal-hosted model and return only the newly generated text.

    Args:
        prompt: Prompt text, sent as the 'question' field of the JSON body.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, `requests` would wait indefinitely on a
            stalled endpoint.

    Returns:
        The response's 'output' string with its first len(prompt) characters
        removed and surrounding whitespace stripped.

    Raises:
        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
    """
    r = requests.post(
        'https://xxxxxxxxx.modal.run',
        json={'question': prompt},
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as a confusing JSON decoding
    # error or a KeyError on 'output' further down.
    r.raise_for_status()
    output_dict = r.json()
    output = output_dict['output']
    # Assumes the endpoint echoes the prompt at the start of 'output' -- TODO confirm.
    completion = output[len(prompt):].strip()
    return completion


# example of inference function for baseten hosted models
import baseten
# Handle to the deployed Baseten model version that call_llm_baseten calls .predict() on.
MODEL = baseten.deployed_model_version_id("ZBMDm4q")

def call_llm_baseten(prompt):
    """Run `prompt` through the deployed Baseten MODEL and return the text after the prompt.

    The first len(prompt) characters of the generated text are dropped and the
    remainder is whitespace-stripped.
    """
    request = {"prompt": prompt, "do_sample": True, "max_new_tokens": 300}
    response = MODEL.predict(request)
    generated_text = response['data']['generated_text']
    return generated_text[len(prompt):].strip()


# submission for finetuned openai model
import openai

def call_llm_finetuned_openai(prompt):
    """Ask a gpt-4 chat model (temperature 0, max_tokens 200) to answer `prompt`.

    A new ChatOpenAI client is built on every call. Any exception raised while
    building the client or running the request is printed and an empty string
    is returned instead, so a failed call does not abort the caller.
    """
    try:
        chat_model = ChatOpenAI(model_name="gpt-4", temperature=0, max_tokens=200)
        messages = [HumanMessage(content=prompt)]
        return chat_model(messages).content
    except Exception as err:
        # Best-effort: report the failure and fall back to an empty answer.
        print(f"Error: {err}")
        return ""
    

# submission for open source finetuned baseten model
from baseten.models import FlanT5
# Never commit credentials with the notebook: the Baseten API key is read from
# the BASETEN_API_KEY environment variable (a KeyError here means it is not set).
baseten.login(os.environ["BASETEN_API_KEY"])

# Fine-tuned FLAN-T5 model handle called by call_llm_finetuned_baseten.
model = FlanT5(model_id="ZBMDm4q")

# Generation options forwarded as keyword arguments on every model call.
gen_kwargs = {
  "temperature": 0,
  "repetition_penalty": 1.2,
}

def call_llm_finetuned_baseten(prompt):
    """Call the fine-tuned Baseten `model` on `prompt` and return the first item of its result."""
    # Fixed per-call options first, then the shared gen_kwargs.
    call_options = dict(max_new_tokens=512, early_stopping=True, **gen_kwargs)
    generations = model(prompt, **call_options)
    return generations[0]

### Configure below which LLM to use

# `call_llm` is the inference function that evaluate() invokes for every prompt.
call_llm = call_llm_finetuned_openai


In [36]:
def evaluate(tasks: list, tasks_dir: str) -> dict:
    """Run `call_llm` on every prompt of each task and score the answers.

    For each task the data and the "base_prompt.txt" template are loaded, one
    prompt per datapoint is generated, and the stripped model output is
    compared for exact equality with the datapoint's 'answer'.

    Args:
        tasks: Task names to evaluate. May be empty.
        tasks_dir: Directory forwarded to load_data / load_prompt.

    Returns:
        A dict keyed by task name. Each value maps every datapoint id to a
        dict with 'prompt', 'generated_output', 'correct_output' and
        'success', and additionally holds the task's 'balanced_accuracy'.
    """
    report = dict()
    for task in tasks:
        # NOTE(review): prompts are generated from, and scored against, the
        # train split; the test split is loaded but never used -- confirm
        # this is intended.
        train_df, _unused_test_df = load_data(task=task, tasks_dir=tasks_dir)
        prompt_template = load_prompt(prompt_name="base_prompt.txt", task=task, tasks_dir=tasks_dir)
        prompts = generate_prompts(prompt_template=prompt_template, data_df=train_df)
        report[task] = dict()
        targets = list()
        outputs = list()
        print('task', task)
        for prompt, (datapoint_id, data) in zip(prompts, train_df.iterrows()):
            output = call_llm(prompt).strip()
            expected = data['answer']
            targets.append(expected)
            outputs.append(output)
            report[task][datapoint_id] = {
                'prompt': prompt,
                'generated_output': output,
                'correct_output': expected,
                'success': output == expected,
            }
        report[task]['balanced_accuracy'] = sklearn.metrics.balanced_accuracy_score(targets, outputs)
        print('task balanced accuracy:', report[task]['balanced_accuracy'])
        print()

    # A NaN task score counts as 0 in the overall mean.
    scores = [report[task]['balanced_accuracy'] for task in tasks]
    usable_scores = [0 if math.isnan(score) else score for score in scores]
    # With no tasks there is nothing to average; report NaN instead of
    # raising ZeroDivisionError.
    total = sum(usable_scores) / len(usable_scores) if usable_scores else float('nan')
    print('Total Balanced Accuracy:', total)

    return report


In [37]:
# Directory passed to evaluate() as `tasks_dir`, relative to the notebook's working directory.
tasks_dir = '../legalbench'

# warnings are to be expected

# Reseed right before sampling so the random.sample of 5 tasks below is repeatable.
random.seed(1)
report = evaluate(tasks=random.sample(TASKS, 5), tasks_dir=tasks_dir)

task cuad_ip_ownership_assignment
task balanced accuracy: 1.0

task sara_numeric




task balanced accuracy: 0.0

task contract_nli_sharing_with_employees
task balanced accuracy: 1.0

task diversity_5
task balanced accuracy: 0.875

task cuad_exclusivity
task balanced accuracy: 1.0

Total Balanced Accuracy: 0.775


In [38]:
# Display the full report dict returned by evaluate() (every prompt and output per task).
report

{'cuad_ip_ownership_assignment': {0: {'prompt': 'Does intellectual property created by one party become the property of the counterparty, either per the terms of the contract or upon the occurrence of certain events?\n\nClause: However, to the extent that any Work may not, by operation of any Laws, be a work made for hire, MD Anderson hereby assigns, transfers and conveys to LBIO all of MD Anderson\'s worldwide right, title and interest in and to such Work, including all Intellectual Property Rights therein and relating thereto, subject to MD Anderson\'s right to use such Work for internal research, academic, and non-commercial patient care purposes prior to publication or public disclosure.\nLabel: Yes\n\nClause: If Parent or SpinCo is unable to obtain, or to cause to be obtained, any such required consent, substitution, approval, amendment or release as set forth in Section 2.4(a) and the applicable member of the Parent Group continues to be bound by such agreement, lease, license or