In [None]:
import os

from openai import OpenAI
from datasets import load_dataset
# import json

# Read the key from the OPENAI_API_KEY environment variable so the secret does
# not have to be written into (and committed with) the notebook. The original
# placeholder is kept as the fallback, so editing the string in place still works.
api_key = os.environ.get("OPENAI_API_KEY", "YOUR API KEY")

# Shared client used by the evaluation cell for every chat completion request.
client = OpenAI(api_key=api_key)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# HotpotQA, 'fullwiki' configuration. No split is requested here; the
# evaluation cell indexes the result with ['validation'].
hotpotqa_dataset = load_dataset(
    'hotpotqa/hotpot_qa',
    'fullwiki',
    trust_remote_code=True,
)

# SQuAD: only the validation split is requested.
squad_dataset = load_dataset(
    'rajpurkar/squad',
    split='validation',
)

In [3]:
# Natural Questions (NQ) loader -- left disabled because the raw file is annoying to process
# file_path = "datasets/v1.0-simplified_nq-dev-all.jsonl"

# # Reading the file
# with open(file_path, "r", encoding="utf-8") as file:
#     for line in file:
#         # Parse the JSON line
#         data = json.loads(line)
#         # Print or process the parsed data
#         print(data)  # Replace this with your processing logic
#         question = data['question_text']
#         context = data['context']
#         answers = data['answers']['text']

#         break

# parameters

In [4]:
# Evaluation stops once this many entries of a dataset have been processed.
questions_num = 500

# Each value is the number of times the question is repeated inside one prompt.
repeats = [1, 3, 5]

# Model names passed to the chat completions API.
# Alternative list that also includes the unpinned alias:
# models = ['gpt-4o-mini', "gpt-4o-mini-2024-07-18"]
models = ["gpt-4o-mini-2024-07-18"]

# Accumulator for all runs; the evaluation cell fills it in, keyed by model name.
overall_results = dict()

# evaluate models

In [5]:
def _hotpotqa_context(entry):
    """Flatten a HotpotQA entry's nested sentence lists into one context string."""
    paragraphs = [" ".join(sentences) for sentences in entry['context']['sentences']]
    return " ".join(paragraphs)


def _squad_context(entry):
    """Return the context passage of a SQuAD entry."""
    return entry['context']


def _build_prompt(question, repeat, context=None):
    """Build the prompt text.

    Args:
        question: The question text.
        repeat: How many times the 'Question: ...' segment is repeated.
        context: Optional passage; when given it is placed before the questions.

    Returns:
        '[Context: <context> ]' + 'Question: <question> ' * repeat + 'Answer: '.
    """
    prefix = f'Context: {context} ' if context is not None else ''
    return prefix + f'Question: {question} ' * repeat + 'Answer: '


def _is_correct(response, answer):
    """Return True if any gold answer occurs in the response, ignoring case.

    `answer` is either a single string (HotpotQA) or a list of acceptable
    strings (SQuAD); a single string is treated as a one-element list.
    """
    gold_answers = [answer] if isinstance(answer, str) else answer
    response_lower = response.lower()
    return any(gold.lower() in response_lower for gold in gold_answers)


def _evaluate(model, dataset, repeat, get_answer, get_context=None):
    """Query `model` on the leading entries of `dataset` and score the replies.

    Args:
        model: Model name passed to the chat completions API.
        dataset: Iterable of entries that each have a 'question' field.
        repeat: Number of times the question is repeated in the prompt.
        get_answer: Callable mapping an entry to its gold answer(s).
        get_context: Callable mapping an entry to its context string, or None
            to run without context.

    Returns:
        Dict with "results" (list of [response, gold answer] pairs), "corrects"
        (number of matching responses) and "accuracy" (0 when nothing was run).
    """
    results = []
    for idx, entry in enumerate(dataset):
        if idx == questions_num:
            break
        context = get_context(entry) if get_context is not None else None
        prompt = _build_prompt(entry['question'], repeat, context)
        # The whole prompt is sent as a single system message.
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": prompt}],
            temperature=0.0
        )
        # The API types `content` as optional. Treat a missing reply as an empty
        # (and therefore incorrect) answer instead of crashing on `.lower()`
        # during scoring, after all requests have already been made.
        response = completion.choices[0].message.content or ""
        results.append([response, get_answer(entry)])

    corrects = sum(1 for response, answer in results if _is_correct(response, answer))
    return {
        "results": results,
        "corrects": corrects,
        "accuracy": corrects / len(results) if results else 0
    }


# (result key, entries to iterate, context extractor, gold-answer extractor)
benchmarks = [
    ("hotpotqa", hotpotqa_dataset['validation'], _hotpotqa_context, lambda entry: entry['answer']),
    ("squad", squad_dataset, _squad_context, lambda entry: entry['answers']['text']),
]

# Initialize results structure for each model with separate dataset contexts
for model in models:
    overall_results[model] = {
        name: {"with_context": {}, "without_context": {}} for name, *_ in benchmarks
    }

# Evaluate every dataset with and without context. The nesting keeps the run
# order of the previous copy-pasted loops: dataset -> model -> setting -> repeat.
for dataset_name, dataset, get_context, get_answer in benchmarks:
    for model in models:
        for setting, context_fn in (("with_context", get_context), ("without_context", None)):
            for repeat in repeats:
                overall_results[model][dataset_name][setting][f"Qx{repeat}"] = _evaluate(
                    model, dataset, repeat, get_answer, context_fn
                )


# results

In [6]:
def _accuracy_label(key):
    """Return the report label for a result key such as 'Qx1' or 'Qx3'.

    'Qx1' (question asked once) is labelled as the standard run; any other key
    'Qx<N>' is labelled 'Repeat x<N> accuracy'.
    """
    if key == "Qx1":
        return f"Standard accuracy ({key})"
    return f"Repeat {key[1:]} accuracy ({key})"


# Print one report per model. Rows are taken from whatever keys were stored for
# each setting, so the report does not break if the repeat counts are changed.
for model_name, model_results in overall_results.items():
    print(f"Model: {model_name}")
    print()
    for dataset_name, settings in model_results.items():
        for heading, setting in (("# With context:", "with_context"),
                                 ("# Without context:", "without_context")):
            print(heading)
            print(f"{dataset_name}:")
            for key, stats in settings[setting].items():
                print(f"  {_accuracy_label(key)}: {stats['accuracy']:.2f}")
            print()
    print()

Model: gpt-4o-mini-2024-07-18

# With context:
hotpotqa:
  Standard accuracy (Qx1): 0.58
  Repeat x3 accuracy (Qx3): 0.58
  Repeat x5 accuracy (Qx5): 0.59

# Without context:
hotpotqa:
  Standard accuracy (Qx1): 0.42
  Repeat x3 accuracy (Qx3): 0.42
  Repeat x5 accuracy (Qx5): 0.43

# With context:
squad:
  Standard accuracy (Qx1): 0.99
  Repeat x3 accuracy (Qx3): 0.99
  Repeat x5 accuracy (Qx5): 0.98

# Without context:
squad:
  Standard accuracy (Qx1): 0.49
  Repeat x3 accuracy (Qx3): 0.49
  Repeat x5 accuracy (Qx5): 0.49


