In [2]:
import os
import json
import pickle
import numpy as np
import openai
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

openai.api_key = "fake_key

# Use a more suitable pre-trained model for the task
"""
model_name = "allenai/unifiedqa-v2-t5-large-1251000" # you can specify the model size here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = model.generate(input_ids, **generator_args)
    return tokenizer.batch_decode(res, skip_special_tokens=True)
"""
def run_gpt(input_string):
    # davinci:ft-personal-2023-03-26-20-59-06
    output = openai.Completion.create(engine="text-davinci-003",
                                      prompt=input_string,
                                      max_tokens=256,
                                      temperature=0,
                                      top_p=0)['choices'][0]['text']
    return output


In [4]:
autocast_questions = json.load(open('data/autocast_questions.json')) # from the Autocast dataset
test_questions = json.load(open('data/autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

## Create baseline models outputting random answers

In [12]:
def random_baseline_model(question):
    if question['qtype'] == 't/f':
        return np.random.random(size=2)
    elif question['qtype'] == 'mc':
        probs = np.random.random(size=len(question['choices']))
        return probs / probs.sum()
    elif question['qtype'] == 'num':
        return np.random.random()


def calibrated_random_baseline_model(question):
    if question['qtype'] == 't/f':
        new_question = question['question'].replace("Will", "Did").replace("will",'did')
        question_str = new_question + 'please give a true/false answer. Do not explain your result! '
        #question_uni = question['question'] + '\n (A) true (B) false'
        #model1_prediction = run_model(question_uni)
        model2_prediction = run_gpt(question_str)

        #ensemble_prediction = np.argmax([model1_prediction, model2_prediction], axis=0)
        pred = np.ones(2)
        if 'false' in model2_prediction.lower():
            pred[1]+=1e-3
        else :
            pred[0]+=1e-3
        return pred / pred.sum()
    elif question['qtype'] == 'mc':
        if(len(question['choices'])) > 26:
            pred = np.ones(len(question['choices']))
            return pred / pred.sum()
        print(question['choices'])
        question_str = question['question'].replace("Will", "Did").replace("will",'did')
        choice_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
        for i, choice in enumerate(question['choices']):
            question_str += "(" + f"{choice_letters[i]})  {choice} "
        question_str += '. Only give the index or your answer.'
        #model1_prediction = run_model(question_str)
        model2_prediction = run_gpt(question_str)
        
        #ensemble_prediction = np.argmax([model1_prediction, model2_prediction], axis=0)
        
        pred = np.ones(len(question['choices']))

        # lowercase_choices = [string.lower() for string in question['choices']]
        for i, choice in enumerate(choice_letters):
            if i >= len(question['choices']):
                break
            if choice in model2_prediction:
                pred[i] +=1
                break
        if pred.sum() == 0:
            pred = np.ones(len(question['choices']))
        return pred / pred.sum()
    elif question['qtype'] == 'num':
        return 0.4
    
    """
    if question['qtype'] == 't/f':
        pred_idx = np.argmax(np.random.random(size=2))
        pred = np.ones(2)
        pred[pred_idx] += 1e-2
        return pred / pred.sum()
    elif question['qtype'] == 'mc':
        pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
        pred = np.ones(len(question['choices']))
        pred[pred_idx] += 1
        return pred / pred.sum()
    elif question['qtype'] == 'num':
        return 0.40
    """

    

## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [4]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

TRaining


In [5]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(calibrated_random_baseline_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1e-5
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

## Evaluate the model

In [177]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 25.00, MCQ: 39.41, NUM: 21.22
Combined Metric: 85.63


## Make predictions on test set

In [13]:
import time
#preds = []
for index, question in enumerate( test_questions):
    #if question['qtype'] == 't/f':
    #    txt += 'With background: ' + question['background'] + '. my question is :' + question['question'] + '\n'
    if index < 267:
        continue
    time.sleep(2)
    preds.append(calibrated_random_baseline_model(question))
    print(index)

['Lower than $220.00/MT', 'Between $220.00/MT and $260.00/MT, inclusive', 'Higher than $260.00/MT but lower than $300.00/MT', 'Between $300.00/MT and $340.00/MT, inclusive', 'Higher than $340.00/MT']
267
268
269
['The two weeks ending 3 July 2021 or earlier', 'The two weeks ending either 17 July 2021 or 31 July 2021', 'The two weeks ending either 14 August 2021 or 28 August 2021', 'The two weeks ending 11 September 2021 or 25 September 2021', 'Not before 26 September 2021']
270
['On or before 14 July 2021', 'Between 15 July and 28 July 2021', 'Between 29 July and 11 August 2021', 'Not before 12 August 2021']
271
['Less than 1.0%', 'Between 1.0% and 2.5%, inclusive', 'More than 2.5% but less than 4.0%', 'Between 4.0% and 5.5%, inclusive', 'More than 5.5%']
272
273
274
275
276
277
278
279
280
281
282
['Fewer than 60,000', 'Between 60,000 and 80,000, inclusive', 'More than 80,000 but fewer than 120,000', 'Between 120,000 and 240,000, inclusive', 'More than 240,000']
283
284
285
['Jeff Bez

In [14]:
if not os.path.exists('submission'):
    os.makedirs('submission')

print(len(preds))
with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

1364
updating: predictions.pkl (deflated 80%)


In [201]:
print(len(preds))

929
