In [1]:
import datasets
import json
import os
import math

In [2]:
# Change these variables to perform the evaluation for your task.
# Everything is resolved relative to the part of the current working directory
# that precedes 'gpt3_evaluation_scripts'.
project_root = os.path.abspath(os.getcwd()).split('gpt3_evaluation_scripts')[0]

# Gold quizzes (one JSON object per line).
original_quiz_path = project_root + 'processed_data/gpt3/completion_4/processed_test.json'
# Generated questions, answers and distractors (one JSON file each).
question_path = project_root + 'generated_data_gpt3/SWQG/generated_questions.json'
answer_path = project_root + 'generated_data_gpt3/SWQG/generated_answers.json'
distractor_path = project_root + 'generated_data_gpt3/SWQG/generated_distractors.json'

In [3]:
def _load_generated_values(path):
    """Return the values of the JSON object stored at ``path``, in file order.

    The file is expected to hold a single JSON object (dict); only its values
    are kept, the keys are discarded.
    """
    with open(path) as f:
        return list(json.load(f).values())


# Get original quizzes: one JSON object per line. Keep the text of the
# 'completion' field up to the "\n###" marker, flattened onto a single line.
# The file is opened with a context manager so the handle is closed, and blank
# lines (e.g. a trailing newline at the end of the file) are skipped instead of
# crashing json.loads.
with open(original_quiz_path) as f:
    original_quizzes = [
        json.loads(line)['completion'].split("\n###")[0].replace("\n", " ").strip()
        for line in f
        if line.strip()
    ]

# Get generated questions, answers and distractors
questions = _load_generated_values(question_path)
answers = _load_generated_values(answer_path)
distractors = _load_generated_values(distractor_path)

# Assemble one flat quiz string per generated question:
#   "<question> True answer: <answer text> <distractors on one line>"
generated_quizzes = []

for i, question in enumerate(questions):
    # Only the text following the "Answer: " label is kept from the answer.
    answer_text = answers[i].split("Answer: ")[1].strip()
    distractor_text = distractors[i].replace("\n", " ").strip()
    generated_quizzes.append(question.strip() + " True answer: " + answer_text + " " + distractor_text)

In [4]:
# Put the predictions and gold references in lists.
# The gold quizzes determine how many pairs are built; the generated quiz at the
# same position is used as the prediction.
n_quizzes = len(original_quizzes)

# Raw strings.
predictions = [generated_quizzes[idx] for idx in range(n_quizzes)]
gold_references = [original_quizzes[idx] for idx in range(n_quizzes)]

# Space-tokenised variants; every prediction gets exactly one reference,
# wrapped in its own list.
predictions_list = [prediction.split(' ') for prediction in predictions]
gold_references_list = [[reference.split(' ')] for reference in gold_references]

In [5]:
# Load the BLEU and ROUGE metric objects.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# version pinned for this project.
bleu, rouge = (datasets.load_metric(metric_name) for metric_name in ('bleu', 'rouge'))



In [6]:
# Hand the data to each metric: BLEU receives the space-tokenised lists,
# ROUGE receives the raw strings.
for metric, batch_predictions, batch_references in (
    (bleu, predictions_list, gold_references_list),
    (rouge, predictions, gold_references),
):
    metric.add_batch(predictions=batch_predictions, references=batch_references)

In [7]:
# Compute the final scores over everything added via add_batch
# (BLEU first, then ROUGE).
final_bleu, final_rouge = bleu.compute(), rouge.compute()

In [8]:
# Define path of downloaded meteor from https://www.cs.cmu.edu/~alavie/METEOR/
meteor_path = "<PATH_TO_YOUR_DOWNLOADED_METEOR>"

# Resolve the directory to an absolute path once, before changing directory:
# a relative meteor_path would otherwise point somewhere else after os.chdir.
# os.path.join also makes a trailing slash on meteor_path optional.
meteor_dir = os.path.abspath(meteor_path)
predictions_file = os.path.join(meteor_dir, "predictions.txt")
ground_truth_file = os.path.join(meteor_dir, "ground_truth.txt")

wd = os.getcwd()
try:
    # Move results to meteor directory. One segment per line: embedded newlines
    # are flattened so both files keep the same number of lines.
    with open(predictions_file, 'w') as f:
        for prediction in predictions:
            f.write(str(prediction).replace('\n', ' ') + '\n')

    with open(ground_truth_file, 'w') as f:
        for reference in gold_references:
            f.write(str(reference).replace('\n', ' ') + '\n')

    # Run the meteor command from the meteor directory (the jar is matched by
    # the shell glob relative to that directory).
    os.chdir(meteor_dir)
    with os.popen("java -Xmx2G -jar meteor-*.jar predictions.txt ground_truth.txt -l en -norm") as pipe:
        output = pipe.read()
finally:
    # Always restore the working directory and remove the temporary files,
    # even when writing the files or running METEOR failed.
    os.chdir(wd)
    for temp_file in (predictions_file, ground_truth_file):
        if os.path.exists(temp_file):
            os.remove(temp_file)

# Get the score from the output
if "Final score:" not in output:
    raise RuntimeError(
        "METEOR did not report a final score; check that java and the METEOR jar "
        "are available in " + meteor_dir + ". Captured output:\n" + output
    )
meteor_score = round(float(output.split("Final score:")[1].strip()) * 100, 2)

In [9]:
# Show every score as a percentage rounded to two decimals
# (meteor_score is already scaled and rounded).
for label, score in (
    ("BLEU: ", round(final_bleu['bleu'] * 100, 2)),
    ("ROUGE-L: ", round(final_rouge['rougeL'].mid.fmeasure * 100, 2)),
    ("METEOR: ", meteor_score),
):
    print(label, str(score))

BLEU:  9.67
ROUGE-L:  34.92
METEOR:  24.88


In [10]:
# Create result files: one "<metric>: <score>" entry per line, without a
# trailing newline after the last entry.
result_path = os.path.abspath(os.getcwd()).split('gpt3_evaluation_scripts')[0] + 'generated_data_gpt3/SWQG/' + 'automatic_evaluation_SWQG.txt'
result_lines = [
    "BLEU: " + str(round(final_bleu['bleu'] * 100, 2)),
    "ROUGE-L: " + str(round(final_rouge['rougeL'].mid.fmeasure * 100, 2)),
    "METEOR: " + str(meteor_score),
]

with open(result_path, 'w') as f:
    f.write('\n'.join(result_lines))

In [11]:
# Get all the target prompts (one JSON object per line). The file is closed
# after reading, and blank lines are skipped instead of crashing json.loads.
with open(original_quiz_path) as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# Take every step-th test instance, at most 100 in total. The step is clamped
# to at least 1 so that fewer than 100 test instances no longer produce
# range(..., step=0), which raises ValueError.
step = max(1, len(test_data) // 100)
sampled_indices = range(0, len(test_data), step)[:100]

count = 0
with open(os.path.abspath(os.getcwd()).split('gpt3_evaluation_scripts')[0] + 'human_evaluation/SWQG/' + 'SWQG_gpt3.txt', 'w') as f:
    for i in sampled_indices:
        # Excel rows start at 2 (count is 0-based); test instances are
        # reported 1-based.
        f.write('Excel row: ' + str(count+2) + ' Test instance: ' + str(i+1) + '\n\n')
        # Only the part of the prompt before the "\n\n###" marker is exported.
        f.write(test_data[i]['prompt'].split("\n\n###")[0] + '\n\n')
        f.write('Generated quiz:\n')
        f.write(questions[i].strip() + '\n')
        f.write(answers[i].strip() + '\n')
        f.write(distractors[i].strip() + '\n\n')
        f.write('----------------------------------------------------------------------------------------' + '\n\n')
        count += 1