In [1]:
import os
import json
import pickle
import numpy as np
import datetime

In [2]:
autocast_questions = json.load(open('autocast_questions.json', encoding='utf-8')) # from the Autocast dataset
test_questions = json.load(open('autocast_competition_test_set.json', encoding='utf-8'))
test_ids = [q['id'] for q in test_questions]

In [3]:
subset_questions = []
tf = 0
mc = 0
nm = 0
for question in autocast_questions:
    if question["qtype"] == "t/f" and question["answer"] is not None and tf < 400:
        subset_questions.append(question)
        tf += 1
    if question["qtype"] == "mc" and question["answer"] is not None and mc < 400:
        subset_questions.append(question)
        mc += 1
    if question["qtype"] == "num" and question["answer"] is not None and nm < 400:
        subset_questions.append(question)
        nm += 1

In [4]:
all_score_num = 0
for question in autocast_questions:
    if question["qtype"] == "num" and question["answer"]:
        all_score_num += float(question["answer"])
print(len(autocast_questions))
print(all_score_num / len(autocast_questions))

6532
0.05330348025635245


## Create baseline models outputting random answers
# (This is useless now)

In [5]:
def calibrated_random_baseline_model(question):
    if question['qtype'] == 't/f':
        pred_idx = np.argmax(np.random.random(size=2))
        pred = np.ones(2)
        pred[pred_idx] += 1e-2
        return pred / pred.sum()
    elif question['qtype'] == 'mc':
        pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
        pred = np.ones(len(question['choices']))
        pred[pred_idx] += 1e-4
        return pred / pred.sum()
    elif question['qtype'] == 'num':
        return 0.415

## GPT3 Model

In [139]:
import openai
openai.api_key = "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"
epsilone = 0.01
def gpt35_model(question):
    # model = "text-curie-001" # T/F: 43.02, MCQ: 70.47, NUM: 26.99 // not good at all 
    # model = "davinci" # T/F: 27.53, MCQ: 37.99, NUM: 22.63 // basically random
    # model = "text-davinci-002" # T/F: 26.92，MCQ: 58.45，NUM: 34.24 // answer 95% all questions in correct form, but not often correct
    model = "text-davinci-003" # // answer 99% questions in correct form, but not often correct
    tf_question = "You can only answer yes or no, "
    mc_question = "You can only answer one singular character as the choice, "
    nm_question = "You can only answer with one singular confidence number between 0 and 1, "
    tags = "This question is related to " + " and ".join(question["tags"]) + ". "
    background = "The background is " + question["background"]
    alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if question['qtype'] == 't/f':
        # response = openai.Completion.create(model=model, prompt=tf_question+tags+question["question"], temperature=2, max_tokens=3)
        # ans = response.choices[0].text[2:]
        # if "no" in ans.lower():
        #     return np.array([0.5+epsilone/10, 0.5-epsilone/10])
        # elif "yes" in ans.lower():
        #     return np.array([0.5-epsilone/10, 0.5+epsilone/10])
        # else:
        return np.array([0.5, 0.5])
    elif question['qtype'] == 'mc':
        pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
        preds = np.ones(len(question['choices']))
        try:
            choices = []
            i = 0
            while len(choices) != len(question["choices"]):
                choices.append(alphabets[i] + ": " + list(question["choices"])[i])
                i+=1
            response = openai.Completion.create(model=model, prompt=mc_question+tags+question["question"]+" Your choices are "+str(choices), temperature=0, max_tokens=3, best_of=1)
            ans = response.choices[0].text[2:]
            if ans.upper() in alphabets and alphabets.index(ans.upper()) < len(choices):
                preds[alphabets.index(ans.upper())] += 2.5
                return preds / preds.sum()
            else:
                preds[pred_idx] += 1e-4
                return preds / preds.sum()
        except:
            preds[pred_idx] += 0.06
            return preds / preds.sum()
    elif question['qtype'] == 'num':
        return 0.415

## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [6]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

print(brier_score(np.array([1, 0, 0, 0]), np.array([0, 0, 1, 0])))
print(brier_score(np.array([0.25, 0.25, 0.25, 0.25]), np.array([0, 0, 1, 0])))
print(brier_score(np.array([0.24, 0.28, 0.24, 0.24]), np.array([0, 0, 1, 0])))
print(brier_score(np.array([0, 0.1, 0.8, 0.1]), np.array([0, 0, 1, 0])))
print(brier_score(np.array([0, 0.0005, 0.999, 0.0005]), np.array([0, 0, 1, 0])))

1.0
0.375
0.3856
0.029999999999999992
7.500000000000009e-07


In [188]:
preds = []
answers = []
qtypes = []
cutoff_date = datetime.datetime(2021, 9, 30)
correct = 0
for question in subset_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    if datetime.datetime.fromisoformat(question["close_time"][:-6]) > cutoff_date:
        continue
    preds.append(calibrated_random_baseline_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        if np.argmax(preds[-1]) == ans_idx:
            correct += 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)
print(correct / qtypes.count("mc") * 100)

27.500000000000004


## Evaluate the model

In [189]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 25.01, MCQ: 37.59, NUM: 21.91
Combined Metric: 84.51


## Make predictions on test set

In [7]:
preds = []
for question in test_questions:
    preds.append(calibrated_random_baseline_model(question))

In [8]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

'zip' is not recognized as an internal or external command,
operable program or batch file.


In [12]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.
