In [181]:
import os
import json
import pickle
import numpy as np
import openai

openai.api_key = "key"

from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "allenai/unifiedqa-v2-t5-large-1251000" # you can specify the model size here
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = model.generate(input_ids, **generator_args)
    return tokenizer.batch_decode(res, skip_special_tokens=True)

def run_gpt(input_string):
    output = openai.Completion.create(engine="text-davinci-002",
                                                       prompt=input_string,
                                                       max_tokens=256,
                                                       temperature=0,
                                                       top_p=0)['choices'][0]['text']
    return output


In [99]:
autocast_questions = json.load(open('autocast_questions.json')) # from the Autocast dataset
test_questions = json.load(open('autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

## Create baseline models outputting random answers

In [206]:
def random_baseline_model(question):
    if question['qtype'] == 't/f':
        return np.random.random(size=2)
    elif question['qtype'] == 'mc':
        probs = np.random.random(size=len(question['choices']))
        return probs / probs.sum()
    elif question['qtype'] == 'num':
        return np.random.random()


def calibrated_random_baseline_model(question):
    if question['qtype'] == 't/f':
        question_str = question['question'] + ' (a) true (b) false. Don\' give explanation.'
        ans = run_gpt(question_str)
        pred = np.ones(2)
        if (ans == '\n\nfalse'):
            pred[1]+=1e-5
        else :
            pred[0]+=1e-5
        return pred / pred.sum()
    elif question['qtype'] == 'mc':
        if(len(question['choices'])) > 26:
            pred = np.ones(len(question['choices']))
            return pred / pred.sum()
        print(question['choices'])
        question_str = question['question']
        choice_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
        for i, choice in enumerate(question['choices']):
            question_str += "(" + f"{choice_letters[i]})  {choice} "
        question_str += '. Only give the index or your answer.'
        ans = run_gpt(question_str)
        
        pred = np.ones(len(question['choices']))

        # lowercase_choices = [string.lower() for string in question['choices']]
        for i, choice in enumerate(choice_letters):
            if (i > len(question['choices'])):
                break
            if (choice in ans):
                pred[i] +=1e-5
                break
        if (pred.sum() == 0) :
            pred = np.ones(len(question['choices']))
        return pred / pred.sum()
    elif question['qtype'] == 'num':
        return 0.4
    
    
    """
    if question['qtype'] == 't/f':
        pred_idx = np.argmax(np.random.random(size=2))
        pred = np.ones(2)
        pred[pred_idx] += 1e-5
        return pred / pred.sum()
    elif question['qtype'] == 'mc':
        pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
        pred = np.ones(len(question['choices']))
        pred[pred_idx] += 1e-5
        return pred / pred.sum()
    elif question['qtype'] == 'num':
        return 0.4
    
    """
    

## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [162]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [192]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(calibrated_random_baseline_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

['Less than 6.30', 'Between 6.30 and 6.35, inclusive', 'More than 6.35 but less than 6.40', '6.40 or more']
['A majority', 'A plurality', 'Not a plurality']


Traceback (most recent call last):
  File "_pydevd_bundle/pydevd_cython.pyx", line 1078, in _pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch
  File "_pydevd_bundle/pydevd_cython.pyx", line 297, in _pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend
  File "/Users/ricardoli/opt/anaconda3/envs/pytorch_gpu/lib/python3.10/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 1976, in do_wait_suspend
    keep_suspended = self._do_wait_suspend(thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)
  File "/Users/ricardoli/opt/anaconda3/envs/pytorch_gpu/lib/python3.10/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 2011, in _do_wait_suspend
    time.sleep(0.01)
KeyboardInterrupt


KeyboardInterrupt: 

## Evaluate the model

In [170]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 25.00, MCQ: 38.05, NUM: 21.22
Combined Metric: 84.27


## Make predictions on test set

In [207]:
import time
preds = []
for question in test_questions:
    time.sleep(2)
    preds.append(calibrated_random_baseline_model(question))

['Fewer than 2.2 million', 'Between 2.2 million and 2.8 million, inclusive', 'More than 2.8 million but fewer than 3.4 million', 'Between 3.4 million and 4.0 million, inclusive', 'More than 4.0 million']


Traceback (most recent call last):
  File "_pydevd_bundle/pydevd_cython.pyx", line 1078, in _pydevd_bundle.pydevd_cython.PyDBFrame.trace_dispatch
  File "_pydevd_bundle/pydevd_cython.pyx", line 297, in _pydevd_bundle.pydevd_cython.PyDBFrame.do_wait_suspend
  File "/Users/ricardoli/opt/anaconda3/envs/pytorch_gpu/lib/python3.10/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 1976, in do_wait_suspend
    keep_suspended = self._do_wait_suspend(thread, frame, event, arg, suspend_type, from_this_thread, frames_tracker)
  File "/Users/ricardoli/opt/anaconda3/envs/pytorch_gpu/lib/python3.10/site-packages/debugpy/_vendored/pydevd/pydevd.py", line 2011, in _do_wait_suspend
    time.sleep(0.01)
KeyboardInterrupt


KeyboardInterrupt: 

In [203]:
if not os.path.exists('submission'):
    os.makedirs('submission')

print(len(preds))
with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

1364
updating: predictions.pkl (deflated 78%)


In [201]:
print(len(preds))

929
