In [1]:
import os
import json
import pickle
import numpy as np

In [2]:
autocast_questions = json.load(open('autocast_questions.json', encoding='utf-8')) # from the Autocast dataset
test_questions = json.load(open('autocast_competition_test_set.json', encoding='utf-8'))
test_ids = [q['id'] for q in test_questions]

## Create baseline models outputting random answers
# (This is useless now)

In [3]:
def random_baseline_model(question):
    if question['qtype'] == 't/f':
        return np.random.random(size=2)
    elif question['qtype'] == 'mc':
        probs = np.random.random(size=len(question['choices']))
        return probs / probs.sum()
    elif question['qtype'] == 'num':
        return np.random.random()


def calibrated_random_baseline_model(question):
    if question['qtype'] == 't/f':
        pred_idx = np.argmax(np.random.random(size=2))
        pred = np.ones(2)
        pred[pred_idx] += 1e-5
        return pred / pred.sum()
    elif question['qtype'] == 'mc':
        pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
        pred = np.ones(len(question['choices']))
        pred[pred_idx] += 1e-5
        return pred / pred.sum()
    elif question['qtype'] == 'num':
        return 0.5

## GPT3 Model

In [3]:
import openai
openai.api_key = os.environ["OPENAI_API_KEY"]
def gpt35_model(question):
    model = "text-curie-001" # T/F: 43.02, MCQ: 70.47, NUM: 26.99
    # model = "davinci" # T/F: 27.53, MCQ: 37.99, NUM: 22.63
    # model = "text-davinci-002" # T/F: 26.92，MCQ: 58.45，NUM: 34.24
    # model = "text-davinci-003" # Error
    tf_question = "You can only answer yes or no, "
    mc_question = "You can only answer one singular character as the choice, "
    nm_question = "You can only answer with one singular confidence number between 0 and 1. "
    alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if question['qtype'] == 't/f':
        response = openai.Completion.create(model=model, prompt=tf_question+question["question"], temperature=0, max_tokens=3)
        ans = response.choices[0].text[2:]
        if "no" in ans.lower():
            return np.array([1, 0])
        elif "yes" in ans.lower():
            return np.array([0, 1])
        else:
            return np.array([0.5, 0.5])
    elif question['qtype'] == 'mc':
        try:
            choices = []
            i = 0
            while len(choices) != len(question["choices"]):
                choices.append(alphabets[i] + ": " + list(question["choices"])[i])
                i+=1
            response = openai.Completion.create(model=model, prompt=mc_question+question["question"]+" Your choices are "+str(choices), temperature=0, max_tokens=3)
            ans = response.choices[0].text[2:]
            preds = np.zeros(len(choices))
            if ans.upper() in alphabets and alphabets.index(ans.upper()) < len(choices):
                preds[alphabets.index(ans.upper())] = 1
                return preds
            else:
                pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
                pred = np.ones(len(question['choices']))
                pred[pred_idx] += 1e-5
                return pred / pred.sum()
        except:
            pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
            pred = np.ones(len(question['choices']))
            pred[pred_idx] += 1e-5
            return pred / pred.sum()
    elif question['qtype'] == 'num':
        response = openai.Completion.create(model=model, prompt=question["question"]+nm_question, temperature=0, max_tokens=30)
        try:
            ans = response.choices[0].text[2:]
            if float(ans) < 0:
                return 0
            elif 0 <= float(ans) <= 1:
                return float(ans)
            elif float(ans) > 1:
                return 1
        except:
            return 0.5

## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [26]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

print(brier_score(np.array([0.25, 0.25, 0.25, 0.25]), np.array([0, 0, 1, 0])))
print(brier_score(np.array([0, 0.1, 0.8, 0.1]), np.array([0, 0, 1, 0])))

0.375
0.029999999999999992


In [27]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(gpt35_model(question))
    if len(preds) % 100 == 0:
        print(len(preds))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700


## Evaluate the model

In [28]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 43.02, MCQ: 70.47, NUM: 26.99
Combined Metric: 140.48


## Make predictions on test set

In [33]:
nc = 0
for question in test_questions:
    try:
        if len(list(question["choices"])) > 26:
            nc +=1
    except:
        continue
print(nc)

4


In [34]:
preds = []
for question in test_questions:
    preds.append(gpt35_model(question))

In [35]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

'zip' is not recognized as an internal or external command,
operable program or batch file.


In [9]:
!ls

autocast_competition_test_set.json [36msubmission[m[m
autocast_questions.json            submission.zip
example_submission.ipynb
