In [1]:
import json
import os
import nltk
import numpy as np
import subprocess

unseen_domains = ['Artist', 'Politician', 'CelestialBody', 'Athlete', 'MeanOfTransportation']
# root path
path='evaluation/'

# Evaluation of Ordering and Structuring

In [2]:
print('All domains:')
for _set in ['dev', 'test']:
    for task in ['ordering', 'structing']:
        gold_path=os.path.join(path, 'data', task, _set + '.json')
        gold = json.load(open(gold_path))

        for model in ['random', 'major', 'transformer', 'rnn']:
            p = os.path.join(path, 'results/steps', task, model, _set + '.out.postprocessed')
            with open(p) as f:
                y_pred_ = f.read().split('\n')[:-1]

            y_real, y_pred = [], []
            for i, g in enumerate(gold):
                t = [' '.join(target['output']) for target in g['targets']]
                y_real.append(t)
                y_pred.append(y_pred_[i].strip())

            num, dem = 0.0, 0
            for i, y_ in enumerate(y_pred):
                y = y_real[i]
                if y_.strip() in y:
                    num += 1
                dem += 1
            print('Task: ', task)
            print('Set: ', _set)
            print('Model: ', model)
            print('Accuracy: ', round(num/dem, 2))
            print(10 * '-')

All domains:
Task:  ordering
Set:  dev
Model:  random
Accuracy:  0.29
----------
Task:  ordering
Set:  dev
Model:  major
Accuracy:  0.54
----------
Task:  ordering
Set:  dev
Model:  transformer
Accuracy:  0.59
----------
Task:  ordering
Set:  dev
Model:  rnn
Accuracy:  0.63
----------
Task:  structing
Set:  dev
Model:  random
Accuracy:  0.26
----------
Task:  structing
Set:  dev
Model:  major
Accuracy:  0.49
----------
Task:  structing
Set:  dev
Model:  transformer
Accuracy:  0.65
----------
Task:  structing
Set:  dev
Model:  rnn
Accuracy:  0.67
----------
Task:  ordering
Set:  test
Model:  random
Accuracy:  0.31
----------
Task:  ordering
Set:  test
Model:  major
Accuracy:  0.48
----------
Task:  ordering
Set:  test
Model:  transformer
Accuracy:  0.34
----------
Task:  ordering
Set:  test
Model:  rnn
Accuracy:  0.35
----------
Task:  structing
Set:  test
Model:  random
Accuracy:  0.29
----------
Task:  structing
Set:  test
Model:  major
Accuracy:  0.27
----------
Task:  structing
Set:

In [3]:
print('Seen domains:')
for _set in ['dev', 'test']:
    for task in ['ordering', 'structing']:
        gold_path=os.path.join(path, 'data', task, _set + '.json')
        gold = json.load(open(gold_path))

        for model in ['random', 'major', 'transformer', 'rnn']:
            p = os.path.join(path, 'results/steps', task, model, _set + '.out.postprocessed')
            with open(p) as f:
                y_pred_ = f.read().split('\n')[:-1]

            y_real, y_pred = [], []
            for i, g in enumerate(gold):
                if g['category'] not in unseen_domains:
                    t = [' '.join(target['output']) for target in g['targets']]
                    y_real.append(t)
                    y_pred.append(y_pred_[i].strip())

            num, dem = 0.0, 0
            for i, y_ in enumerate(y_pred):
                y = y_real[i]
                if y_.strip() in y:
                    num += 1
                dem += 1
            print('Task: ', task)
            print('Set: ', _set)
            print('Model: ', model)
            print('Accuracy: ', round(num/dem, 2))
            print(10 * '-')

Seen domains:
Task:  ordering
Set:  dev
Model:  random
Accuracy:  0.29
----------
Task:  ordering
Set:  dev
Model:  major
Accuracy:  0.54
----------
Task:  ordering
Set:  dev
Model:  transformer
Accuracy:  0.59
----------
Task:  ordering
Set:  dev
Model:  rnn
Accuracy:  0.63
----------
Task:  structing
Set:  dev
Model:  random
Accuracy:  0.26
----------
Task:  structing
Set:  dev
Model:  major
Accuracy:  0.49
----------
Task:  structing
Set:  dev
Model:  transformer
Accuracy:  0.65
----------
Task:  structing
Set:  dev
Model:  rnn
Accuracy:  0.67
----------
Task:  ordering
Set:  test
Model:  random
Accuracy:  0.29
----------
Task:  ordering
Set:  test
Model:  major
Accuracy:  0.51
----------
Task:  ordering
Set:  test
Model:  transformer
Accuracy:  0.56
----------
Task:  ordering
Set:  test
Model:  rnn
Accuracy:  0.56
----------
Task:  structing
Set:  test
Model:  random
Accuracy:  0.29
----------
Task:  structing
Set:  test
Model:  major
Accuracy:  0.45
----------
Task:  structing
Set

In [4]:
print('Unseen domains:')
for _set in ['dev', 'test']:
    for task in ['ordering', 'structing']:
        gold_path=os.path.join(path, 'data', task, _set + '.json')
        gold = json.load(open(gold_path))

        for model in ['random', 'major', 'transformer', 'rnn']:
            p = os.path.join(path, 'results/steps', task, model, _set + '.out.postprocessed')
            with open(p) as f:
                y_pred_ = f.read().split('\n')[:-1]

            y_real, y_pred = [], []
            for i, g in enumerate(gold):
                if g['category'] in unseen_domains:
                    t = [' '.join(target['output']) for target in g['targets']]
                    y_real.append(t)

                    y_pred.append(y_pred_[i].strip())

            num, dem = 0.0, 0
            for i, y_ in enumerate(y_pred):
                y = y_real[i]
                if y_.strip() in y:
                    num += 1
                dem += 1
            print('Task: ', task)
            print('Set: ', _set)
            print('Model: ', model)
            print('Accuracy: ', round(num/dem, 2) if dem > 0 else 0)
            print(10 * '-')

Unseen domains:
Task:  ordering
Set:  dev
Model:  random
Accuracy:  0
----------
Task:  ordering
Set:  dev
Model:  major
Accuracy:  0
----------
Task:  ordering
Set:  dev
Model:  transformer
Accuracy:  0
----------
Task:  ordering
Set:  dev
Model:  rnn
Accuracy:  0
----------
Task:  structing
Set:  dev
Model:  random
Accuracy:  0
----------
Task:  structing
Set:  dev
Model:  major
Accuracy:  0
----------
Task:  structing
Set:  dev
Model:  transformer
Accuracy:  0
----------
Task:  structing
Set:  dev
Model:  rnn
Accuracy:  0
----------
Task:  ordering
Set:  test
Model:  random
Accuracy:  0.35
----------
Task:  ordering
Set:  test
Model:  major
Accuracy:  0.44
----------
Task:  ordering
Set:  test
Model:  transformer
Accuracy:  0.09
----------
Task:  ordering
Set:  test
Model:  rnn
Accuracy:  0.1
----------
Task:  structing
Set:  test
Model:  random
Accuracy:  0.3
----------
Task:  structing
Set:  test
Model:  major
Accuracy:  0.06
----------
Task:  structing
Set:  test
Model:  transfor

# Evaluation of Referring Expressions

In [5]:
print('All domains:')
for _set in ['dev', 'test']:
    gold_path=os.path.join(path, 'data', 'reg', _set + '.json')
    gold = json.load(open(gold_path))

    p = os.path.join(path, 'results/steps/reg', _set + '.out.postprocessed')
    with open(p) as f:
        y_pred_ = f.read().split('\n')[:-1]

    y_real, y_pred, y_baseline = [], [], []
    for i, g in enumerate(gold):
        y_real.append(' '.join(g['refex']).strip().lower())
        y_pred.append(y_pred_[i].strip().lower())
    
    num, dem = 0.0, 0
    baseline = 0
    for i, y_ in enumerate(y_pred):
        refex = ' '.join(nltk.word_tokenize(gold[i]['entity'].replace('\'', ' ').replace('\"', ' ').replace('_', ' ')))
        y = y_real[i]
        if y_.strip() == y:
            num += 1
        if refex.strip().lower() == y:
            baseline += 1
        dem += 1
    print('REG Task:')
    print('Set: ', _set)
    print('Baseline Accuracy: ', round(baseline/dem, 2) if dem > 0 else 0)
    print('NeuralREG Accuracy: ', round(num/dem, 2) if dem > 0 else 0)
    print(10 * '-')

All domains:
REG Task:
Set:  dev
Baseline Accuracy:  0.54
NeuralREG Accuracy:  0.72
----------
REG Task:
Set:  test
Baseline Accuracy:  0.51
NeuralREG Accuracy:  0.39
----------


In [6]:
print('Seen domains:')
for _set in ['dev', 'test']:
    gold_path=os.path.join(path, 'data', 'reg', _set + '.json')
    gold = json.load(open(gold_path))

    p = os.path.join(path, 'results/steps/reg', _set + '.out.postprocessed')
    with open(p) as f:
        y_pred_ = f.read().split('\n')[:-1]

    y_real, y_pred, y_baseline = [], [], []
    for i, g in enumerate(gold):
    #     if g['category'] in unseen_domains:
        y_real.append(' '.join(g['refex']).strip().lower())
        y_pred.append(y_pred_[i].strip().lower())
    
    num, dem = 0.0, 0
    baseline = 0
    for i, y_ in enumerate(y_pred):
        if gold[i]['category'] not in unseen_domains:
            refex = ' '.join(nltk.word_tokenize(gold[i]['entity'].replace('\'', ' ').replace('\"', ' ').replace('_', ' ')))
            y = y_real[i]
            if y_.strip() == y:
                num += 1
            if refex.strip().lower() == y:
                baseline += 1
            dem += 1
    print('REG Task:')
    print('Set: ', _set)
    print('Baseline Accuracy: ', round(baseline/dem, 2) if dem > 0 else 0)
    print('NeuralREG Accuracy: ', round(num/dem, 2) if dem > 0 else 0)
    print(10 * '-')

Seen domains:
REG Task:
Set:  dev
Baseline Accuracy:  0.54
NeuralREG Accuracy:  0.72
----------
REG Task:
Set:  test
Baseline Accuracy:  0.53
NeuralREG Accuracy:  0.7
----------


In [7]:
print('Unseen domains:')
for _set in ['dev', 'test']:
    gold_path=os.path.join(path, 'data', 'reg', _set + '.json')
    gold = json.load(open(gold_path))

    p = os.path.join(path, 'results/steps/reg', _set + '.out.postprocessed')
    with open(p) as f:
        y_pred_ = f.read().split('\n')[:-1]

    y_real, y_pred, y_baseline = [], [], []
    for i, g in enumerate(gold):
        y_real.append(' '.join(g['refex']).strip().lower())
        y_pred.append(y_pred_[i].strip().lower())
    
    num, dem = 0.0, 0
    baseline = 0
    for i, y_ in enumerate(y_pred):
        if gold[i]['category'] in unseen_domains:
            refex = ' '.join(nltk.word_tokenize(gold[i]['entity'].replace('\'', ' ').replace('\"', ' ').replace('_', ' ')))
            y = y_real[i]
            if y_.strip() == y:
                num += 1
            if refex.strip().lower() == y:
                baseline += 1
            dem += 1
    print('REG Task:')
    print('Set: ', _set)
    print('Baseline Accuracy: ', round(baseline/dem, 2) if dem > 0 else 0)
    print('NeuralREG Accuracy: ', round(num/dem, 2) if dem > 0 else 0)
    print(10 * '-')

Unseen domains:
REG Task:
Set:  dev
Baseline Accuracy:  0
NeuralREG Accuracy:  0
----------
REG Task:
Set:  test
Baseline Accuracy:  0.5
NeuralREG Accuracy:  0.07
----------


# Evaluation of Lexicalization

In [8]:
print('All domains:')
for _set in ['dev', 'test']:
    for model in ['random', 'major', 'transformer', 'rnn']:
        gold_path=os.path.join(path, 'data', 'lexicalization', _set + '.json')
        gold = json.load(open(gold_path))

        p = os.path.join(path, 'results/steps/lexicalization', model, _set + '.out.postprocessed')
        with open(p) as f:
            y_pred_ = f.read().split('\n')[:-1]

        y_real, y_pred = [], []
        for i, g in enumerate(gold):
        #     if g['category'] in unseen_domains:
            t = [' '.join(target['output']).lower() for target in g['targets']]
            y_real.append(t)
            y_pred.append(y_pred_[i].strip().lower())

        with open('predictions', 'w') as f:
            f.write('\n'.join(y_pred))

        nfiles = max([len(refs) for refs in y_real])
        for i in range(nfiles):
            with open('reference' + str(i+1), 'w') as f:
                for refs in y_real:
                    if i < len(refs):
                        f.write(refs[i])
                    f.write('\n')

        nematus = '/roaming/tcastrof/workspace/nematus/data/multi-bleu.perl'
        command = 'perl ' + nematus + ' reference1 reference2 reference3 reference4 reference5 reference6 reference7 reference8 < predictions'
        result = subprocess.check_output(command, shell=True)
        print('Lexicalization: ')
        print('Set: ', _set)
        print('Model: ', model)
        print(result)
        print(10 * '-')

        os.remove('reference1')
        os.remove('reference2')
        os.remove('reference3')
        os.remove('reference4')
        os.remove('reference5')
        os.remove('reference6')
        os.remove('reference7')
        os.remove('reference8')
        os.remove('predictions')

All domains:
Lexicalization: 
Set:  dev
Model:  random
b'BLEU = 40.06, 72.4/46.1/32.0/24.1 (BP=1.000, ratio=1.085, hyp_len=22790, ref_len=21003)\n'
----------
Lexicalization: 
Set:  dev
Model:  major
b'BLEU = 43.26, 74.8/49.1/35.3/27.0 (BP=1.000, ratio=1.071, hyp_len=22286, ref_len=20813)\n'
----------
Lexicalization: 
Set:  dev
Model:  transformer
b'BLEU = 48.69, 76.0/54.2/41.3/33.0 (BP=1.000, ratio=1.018, hyp_len=34167, ref_len=33579)\n'
----------
Lexicalization: 
Set:  dev
Model:  rnn
b'BLEU = 49.71, 77.0/55.1/42.3/34.1 (BP=1.000, ratio=1.006, hyp_len=33998, ref_len=33803)\n'
----------
Lexicalization: 
Set:  test
Model:  random
b'BLEU = 39.49, 72.6/45.5/31.4/23.4 (BP=1.000, ratio=1.073, hyp_len=29090, ref_len=27122)\n'
----------
Lexicalization: 
Set:  test
Model:  major
b'BLEU = 44.82, 76.2/50.3/36.9/28.5 (BP=1.000, ratio=1.069, hyp_len=27956, ref_len=26163)\n'
----------
Lexicalization: 
Set:  test
Model:  transformer
b'BLEU = 38.12, 72.8/46.2/31.6/23.7 (BP=0.956, ratio=0.957, h

In [9]:
print('Seen domains:')
for _set in ['dev', 'test']:
    for model in ['random', 'major', 'transformer', 'rnn']:
        gold_path=os.path.join(path, 'data', 'lexicalization', _set + '.json')
        gold = json.load(open(gold_path))

        p = os.path.join(path, 'results/steps/lexicalization', model, _set + '.out.postprocessed')
        with open(p) as f:
            y_pred_ = f.read().split('\n')[:-1]

        y_real, y_pred = [], []
        for i, g in enumerate(gold):
            if g['category'] not in unseen_domains:
                t = [' '.join(target['output']).lower() for target in g['targets']]
                y_real.append(t)
                y_pred.append(y_pred_[i].strip().lower())

        with open('predictions', 'w') as f:
            f.write('\n'.join(y_pred))

        nfiles = max([len(refs) for refs in y_real])
        for i in range(nfiles):
            with open('reference' + str(i+1), 'w') as f:
                for refs in y_real:
                    if i < len(refs):
                        f.write(refs[i])
                    f.write('\n')

        nematus = '/roaming/tcastrof/workspace/nematus/data/multi-bleu.perl'
        command = 'perl ' + nematus + ' reference1 reference2 reference3 reference4 reference5 reference6 reference7 reference8 < predictions'
        result = subprocess.check_output(command, shell=True)
        print('Lexicalization: ')
        print('Set: ', _set)
        print('Model: ', model)
        print(result)
        print(10 * '-')

        os.remove('predictions')
        os.remove('reference1')
        os.remove('reference2')
        os.remove('reference3')
        os.remove('reference4')
        os.remove('reference5')
        os.remove('reference6')
        os.remove('reference7')
        os.remove('reference8')

Seen domains:
Lexicalization: 
Set:  dev
Model:  random
b'BLEU = 40.06, 72.4/46.1/32.0/24.1 (BP=1.000, ratio=1.085, hyp_len=22790, ref_len=21003)\n'
----------
Lexicalization: 
Set:  dev
Model:  major
b'BLEU = 43.26, 74.8/49.1/35.3/27.0 (BP=1.000, ratio=1.071, hyp_len=22286, ref_len=20813)\n'
----------
Lexicalization: 
Set:  dev
Model:  transformer
b'BLEU = 48.69, 76.0/54.2/41.3/33.0 (BP=1.000, ratio=1.018, hyp_len=34167, ref_len=33579)\n'
----------
Lexicalization: 
Set:  dev
Model:  rnn
b'BLEU = 49.71, 77.0/55.1/42.3/34.1 (BP=1.000, ratio=1.006, hyp_len=33998, ref_len=33803)\n'
----------
Lexicalization: 
Set:  test
Model:  random
b'BLEU = 40.46, 73.1/46.6/32.4/24.2 (BP=1.000, ratio=1.040, hyp_len=24491, ref_len=23559)\n'
----------
Lexicalization: 
Set:  test
Model:  major
b'BLEU = 45.65, 76.6/51.2/37.8/29.3 (BP=1.000, ratio=1.033, hyp_len=23805, ref_len=23048)\n'
----------
Lexicalization: 
Set:  test
Model:  transformer
b'BLEU = 48.14, 77.5/54.6/41.4/32.6 (BP=0.985, ratio=0.985, 

In [10]:
print('Unseen domains:')
for _set in ['dev', 'test']:
    for model in ['random', 'major', 'transformer', 'rnn']:
        gold_path=os.path.join(path, 'data', 'lexicalization', _set + '.json')
        gold = json.load(open(gold_path))

        p = os.path.join(path, 'results/steps/lexicalization', model, _set + '.out.postprocessed')
        with open(p) as f:
            y_pred_ = f.read().split('\n')[:-1]

        y_real, y_pred = [], []
        for i, g in enumerate(gold):
            if g['category'] in unseen_domains:
                t = [' '.join(target['output']).lower() for target in g['targets']]
                y_real.append(t)
                y_pred.append(y_pred_[i].strip().lower())

        with open('predictions', 'w') as f:
            f.write('\n'.join(y_pred))

        try:
            nfiles = max([len(refs) for refs in y_real])
            for i in range(nfiles):
                with open('reference' + str(i+1), 'w') as f:
                    for refs in y_real:
                        if i < len(refs):
                            f.write(refs[i])
                        f.write('\n')

            nematus = '/roaming/tcastrof/workspace/nematus/data/multi-bleu.perl'
            command = 'perl ' + nematus + ' reference1 reference2 reference3 reference4 reference5 reference6 reference7 reference8 < predictions'
            result = subprocess.check_output(command, shell=True)
            print('Lexicalization: ')
            print('Set: ', _set)
            print('Model: ', model)
            print(result)
            print(10 * '-')

            os.remove('predictions')
            os.remove('reference1')
            os.remove('reference2')
            os.remove('reference3')
            os.remove('reference4')
            os.remove('reference5')
            os.remove('reference6')
            os.remove('reference7')
            os.remove('reference8')
        except:
            pass

Unseen domains:
Lexicalization: 
Set:  test
Model:  random
b'BLEU = 33.79, 69.8/39.8/25.5/18.4 (BP=1.000, ratio=1.288, hyp_len=4599, ref_len=3572)\n'
----------
Lexicalization: 
Set:  test
Model:  major
b'BLEU = 39.43, 73.9/44.9/31.4/23.2 (BP=1.000, ratio=1.329, hyp_len=4151, ref_len=3124)\n'
----------
Lexicalization: 
Set:  test
Model:  transformer
b'BLEU = 24.15, 66.4/34.7/18.2/11.4 (BP=0.919, ratio=0.922, hyp_len=28892, ref_len=31323)\n'
----------
Lexicalization: 
Set:  test
Model:  rnn
b'BLEU = 23.63, 57.0/31.1/16.8/10.5 (BP=1.000, ratio=1.088, hyp_len=36678, ref_len=33701)\n'
----------


# Evaluation of Final Texts (BLEU)

In [11]:
print('All domains:')
for _set in ['dev', 'test']:
    gold_path=os.path.join(path, 'data', 'end2end', _set + '.json')
    gold = json.load(open(gold_path))
    for kind in ['pipeline', 'end2end']:
        for model in ['rand', 'major', 'transformer', 'rnn']:
            if kind == 'end2end' and model in ['rand', 'major']:
                continue
            else:
                p = os.path.join(path, 'results', kind, model, _set + '.out.postprocessed')
                with open(p) as f:
                    y_pred_ = f.read().split('\n')[:-1]

                y_real, y_pred = [], []
                for i, g in enumerate(gold):
                #     if g['category'] in unseen_domains:
                    targets = [nltk.word_tokenize(' '.join(target['output'])) for target in g['targets']]
                    t = [' '.join(target).lower() for target in targets]
                    y_real.append(t)
                    pred = ' '.join(nltk.word_tokenize(y_pred_[i])).lower()
                    y_pred.append(pred)

                with open('predictions', 'w') as f:
                    f.write('\n'.join(y_pred))

                nfiles = max([len(refs) for refs in y_real])
                for i in range(nfiles):
                    with open('reference' + str(i+1), 'w') as f:
                        for refs in y_real:
                            if i < len(refs):
                                f.write(refs[i])
                            f.write('\n')

                nematus = '/roaming/tcastrof/workspace/nematus/data/multi-bleu.perl'
                command = 'perl ' + nematus + ' reference1 reference2 reference3 reference4 reference5 reference6 reference7 reference8 < predictions'
                result = subprocess.check_output(command, shell=True)
                print('Final: ')
                print('Set: ', _set)
                print('Approach:', kind)
                print('Model: ', model)
                print(result)
                print(10 * '-')

                os.remove('reference1')
                os.remove('reference2')
                os.remove('reference3')
                os.remove('reference4')
                os.remove('reference5')
                os.remove('reference6')
                os.remove('reference7')
                os.remove('reference8')
                os.remove('predictions')

All domains:
Final: 
Set:  dev
Approach: pipeline
Model:  rand
b'BLEU = 41.01, 75.2/50.1/33.5/22.4 (BP=1.000, ratio=1.132, hyp_len=13392, ref_len=11826)\n'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  major
b'BLEU = 46.76, 79.1/55.9/39.3/27.5 (BP=1.000, ratio=1.039, hyp_len=20297, ref_len=19539)\n'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  transformer
b'BLEU = 57.85, 87.5/67.2/51.2/39.2 (BP=0.987, ratio=0.987, hyp_len=18855, ref_len=19109)\n'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  rnn
b'BLEU = 58.69, 87.1/67.1/51.6/39.8 (BP=0.997, ratio=0.997, hyp_len=19403, ref_len=19464)\n'
----------
Final: 
Set:  dev
Approach: end2end
Model:  transformer
b'BLEU = 55.02, 82.7/62.4/47.8/37.1 (BP=1.000, ratio=1.008, hyp_len=19494, ref_len=19343)\n'
----------
Final: 
Set:  dev
Approach: end2end
Model:  rnn
b'BLEU = 60.19, 85.4/67.6/53.5/42.6 (BP=1.000, ratio=1.000, hyp_len=19389, ref_len=19398)\n'
----------
Final: 
Set:  test
Approach: pipeline
Model:  rand
b

In [12]:
print('Seen domains:')
for _set in ['dev', 'test']:
    gold_path=os.path.join(path, 'data', 'end2end', _set + '.json')
    gold = json.load(open(gold_path))
    for kind in ['pipeline', 'end2end']:
        for model in ['rand', 'major', 'transformer', 'rnn']:
            if kind == 'end2end' and model in ['rand', 'major']:
                continue
            else:
                p = os.path.join(path, 'results', kind, model, _set + '.out.postprocessed')
                with open(p) as f:
                    y_pred_ = f.read().split('\n')[:-1]

                y_real, y_pred = [], []
                for i, g in enumerate(gold):
                    if g['category'] not in unseen_domains:
                        targets = [nltk.word_tokenize(' '.join(target['output'])) for target in g['targets']]
                        t = [' '.join(target).lower() for target in targets]
                        y_real.append(t)
                        pred = ' '.join(nltk.word_tokenize(y_pred_[i])).lower()
                        y_pred.append(pred)

                with open('predictions', 'w') as f:
                    f.write('\n'.join(y_pred))

                nfiles = max([len(refs) for refs in y_real])
                for i in range(nfiles):
                    with open('reference' + str(i+1), 'w') as f:
                        for refs in y_real:
                            if i < len(refs):
                                f.write(refs[i])
                            f.write('\n')

                nematus = '/roaming/tcastrof/workspace/nematus/data/multi-bleu.perl'
                command = 'perl ' + nematus + ' reference1 reference2 reference3 reference4 reference5 reference6 reference7 reference8 < predictions'
                result = subprocess.check_output(command, shell=True)
                print('Final: ')
                print('Set: ', _set)
                print('Approach:', kind)
                print('Model: ', model)
                print(result)
                print(10 * '-')

                os.remove('reference1')
                os.remove('reference2')
                os.remove('reference3')
                os.remove('reference4')
                os.remove('reference5')
                os.remove('reference6')
                os.remove('reference7')
                os.remove('reference8')
                os.remove('predictions')

Seen domains:
Final: 
Set:  dev
Approach: pipeline
Model:  rand
b'BLEU = 41.01, 75.2/50.1/33.5/22.4 (BP=1.000, ratio=1.132, hyp_len=13392, ref_len=11826)\n'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  major
b'BLEU = 46.76, 79.1/55.9/39.3/27.5 (BP=1.000, ratio=1.039, hyp_len=20297, ref_len=19539)\n'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  transformer
b'BLEU = 57.85, 87.5/67.2/51.2/39.2 (BP=0.987, ratio=0.987, hyp_len=18855, ref_len=19109)\n'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  rnn
b'BLEU = 58.69, 87.1/67.1/51.6/39.8 (BP=0.997, ratio=0.997, hyp_len=19403, ref_len=19464)\n'
----------
Final: 
Set:  dev
Approach: end2end
Model:  transformer
b'BLEU = 55.02, 82.7/62.4/47.8/37.1 (BP=1.000, ratio=1.008, hyp_len=19494, ref_len=19343)\n'
----------
Final: 
Set:  dev
Approach: end2end
Model:  rnn
b'BLEU = 60.19, 85.4/67.6/53.5/42.6 (BP=1.000, ratio=1.000, hyp_len=19389, ref_len=19398)\n'
----------
Final: 
Set:  test
Approach: pipeline
Model:  rand


In [13]:
print('Unseen domains:')
for _set in ['test']:
    gold_path=os.path.join(path, 'data', 'end2end', _set + '.json')
    gold = json.load(open(gold_path))
    for kind in ['pipeline', 'end2end']:
        for model in ['rand', 'major', 'transformer', 'rnn']:
            if kind == 'end2end' and model in ['rand', 'major']:
                continue
            else:
                p = os.path.join(path, 'results', kind, model, _set + '.out.postprocessed')
                with open(p) as f:
                    y_pred_ = f.read().split('\n')[:-1]

                y_real, y_pred = [], []
                for i, g in enumerate(gold):
                    if g['category'] in unseen_domains:
                        targets = [nltk.word_tokenize(' '.join(target['output'])) for target in g['targets']]
                        t = [' '.join(target).lower() for target in targets]
                        y_real.append(t)
                        pred = ' '.join(nltk.word_tokenize(y_pred_[i])).lower()
                        y_pred.append(pred)

                with open('predictions', 'w') as f:
                    f.write('\n'.join(y_pred))

                nfiles = max([len(refs) for refs in y_real])
                for i in range(nfiles):
                    with open('reference' + str(i+1), 'w') as f:
                        for refs in y_real:
                            if i < len(refs):
                                f.write(refs[i])
                            f.write('\n')

                nematus = '/roaming/tcastrof/workspace/nematus/data/multi-bleu.perl'
                command = 'perl ' + nematus + ' reference1 reference2 reference3 reference4 reference5 reference6 reference7 reference8 < predictions'
                result = subprocess.check_output(command, shell=True)
                print('Final: ')
                print('Set: ', _set)
                print('Approach:', kind)
                print('Model: ', model)
                print(result)
                print(10 * '-')

                try:
                    os.remove('predictions')
                    os.remove('reference1')
                    os.remove('reference2')
                    os.remove('reference3')
                    os.remove('reference4')
                    os.remove('reference5')
                    os.remove('reference6')
                    os.remove('reference7')
                    os.remove('reference8')
                except:
                    pass

Unseen domains:
Final: 
Set:  test
Approach: pipeline
Model:  rand
b'BLEU = 41.51, 78.6/51.6/33.9/21.6 (BP=1.000, ratio=1.182, hyp_len=4720, ref_len=3994)\n'
----------
Final: 
Set:  test
Approach: pipeline
Model:  major
b'BLEU = 41.13, 77.3/51.4/33.7/21.4 (BP=1.000, ratio=1.075, hyp_len=9630, ref_len=8956)\n'
----------
Final: 
Set:  test
Approach: pipeline
Model:  transformer
b'BLEU = 38.92, 75.3/46.3/31.5/20.9 (BP=1.000, ratio=1.114, hyp_len=9957, ref_len=8940)\n'
----------
Final: 
Set:  test
Approach: pipeline
Model:  rnn
b'BLEU = 38.55, 71.9/45.5/31.3/21.6 (BP=1.000, ratio=1.098, hyp_len=11034, ref_len=10051)\n'
----------
Final: 
Set:  test
Approach: end2end
Model:  transformer
b'BLEU = 5.88, 36.3/9.3/3.6/1.2 (BP=0.953, ratio=0.954, hyp_len=17139, ref_len=17957)\n'
----------
Final: 
Set:  test
Approach: end2end
Model:  rnn
b'BLEU = 6.24, 31.3/8.7/3.6/1.5 (BP=1.000, ratio=1.097, hyp_len=20907, ref_len=19066)\n'
----------


# Evaluation of Final Texts (METEOR)

In [14]:
print('All domains:')
for _set in ['dev', 'test']:
    gold_path=os.path.join(path, 'data', 'end2end', _set + '.json')
    gold = json.load(open(gold_path))
    for kind in ['pipeline', 'end2end']:
        for model in ['rand', 'major', 'transformer', 'rnn']:
            if kind == 'end2end' and model in ['rand', 'major']:
                continue
            else:
                p = os.path.join(path, 'results', kind, model, _set + '.out.postprocessed')
                with open(p) as f:
                    y_pred_ = f.read().split('\n')[:-1]
                    
                y_real, y_pred = [], []
                for i, g in enumerate(gold):
#                     if g['category'] in unseen_domains:
                    targets = [nltk.word_tokenize(' '.join(target['output'])) for target in g['targets']]
                    t = [' '.join(target).lower() for target in targets]
                    y_real.append(t)
                    pred = ' '.join(nltk.word_tokenize(y_pred_[i])).lower()
                    y_pred.append(pred)


                with open('predictions', 'w') as f:
                    f.write('\n'.join(y_pred))

                with open('reference', 'w') as f:
                    for refs in y_real:
                        for i in range(8):
                            if i < len(refs):
                                f.write(refs[i])
                            else:
                                f.write('')
                            f.write('\n')

                java = '/roaming/tcastrof/workspace/java/jre1.8.0_181/bin/java -Xmx2G -jar '
                java += '/home/tcastrof/workspace/meteor-1.5/meteor-1.5.jar predictions reference -l en -norm -r 8'
                result = subprocess.check_output(java, shell=True)
                print('Final: ')
                print('Set: ', _set)
                print('Approach:', kind)
                print('Model: ', model)
                print(result.split(b'\n')[-2])
                print(10 * '-')

                os.remove('reference')
                os.remove('predictions')

All domains:
Final: 
Set:  dev
Approach: pipeline
Model:  rand
b''
----------
Final: 
Set:  dev
Approach: pipeline
Model:  major
b'Final score:            0.41077562572144327'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  transformer
b'Final score:            0.4330416533942173'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  rnn
b'Final score:            0.44177425982219903'
----------
Final: 
Set:  dev
Approach: end2end
Model:  transformer
b'Final score:            0.4134470330967474'
----------
Final: 
Set:  dev
Approach: end2end
Model:  rnn
b'Final score:            0.4324570419023144'
----------
Final: 
Set:  test
Approach: pipeline
Model:  rand
b'Final score:            0.20375595787202236'
----------
Final: 
Set:  test
Approach: pipeline
Model:  major
b'Final score:            0.3279519530149407'
----------
Final: 
Set:  test
Approach: pipeline
Model:  transformer
b'Final score:            0.3230984032371677'
----------
Final: 
Set:  test
Approach: pipeline
Mod

In [15]:
print('Seen domains:')
for _set in ['dev', 'test']:
    gold_path=os.path.join(path, 'data', 'end2end', _set + '.json')
    gold = json.load(open(gold_path))
    for kind in ['pipeline', 'end2end']:
        for model in ['rand', 'major', 'transformer', 'rnn']:
            if kind == 'end2end' and model in ['rand', 'major']:
                continue
            else:
                p = os.path.join(path, 'results', kind, model, _set + '.out.postprocessed')
                with open(p) as f:
                    y_pred_ = f.read().split('\n')[:-1]
                    
                y_real, y_pred = [], []
                for i, g in enumerate(gold):
                    if g['category'] not in unseen_domains:
                        targets = [nltk.word_tokenize(' '.join(target['output'])) for target in g['targets']]
                        t = [' '.join(target).lower() for target in targets]
                        y_real.append(t)
                        pred = ' '.join(nltk.word_tokenize(y_pred_[i])).lower()
                        y_pred.append(pred)


                with open('predictions', 'w') as f:
                    f.write('\n'.join(y_pred))

                with open('reference', 'w') as f:
                    for refs in y_real:
                        for i in range(8):
                            if i < len(refs):
                                f.write(refs[i])
                            else:
                                f.write('')
                            f.write('\n')

                java = '/roaming/tcastrof/workspace/java/jre1.8.0_181/bin/java -Xmx2G -jar '
                java += '/home/tcastrof/workspace/meteor-1.5/meteor-1.5.jar predictions reference -l en -norm -r 8'
                result = subprocess.check_output(java, shell=True)
                print('Final: ')
                print('Set: ', _set)
                print('Approach:', kind)
                print('Model: ', model)
                print(result.split(b'\n')[-2])
                print(10 * '-')

                os.remove('reference')
                os.remove('predictions')

Seen domains:
Final: 
Set:  dev
Approach: pipeline
Model:  rand
b''
----------
Final: 
Set:  dev
Approach: pipeline
Model:  major
b'Final score:            0.41077562572144327'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  transformer
b'Final score:            0.4330416533942173'
----------
Final: 
Set:  dev
Approach: pipeline
Model:  rnn
b'Final score:            0.44177425982219903'
----------
Final: 
Set:  dev
Approach: end2end
Model:  transformer
b'Final score:            0.4134470330967474'
----------
Final: 
Set:  dev
Approach: end2end
Model:  rnn
b'Final score:            0.4324570419023144'
----------
Final: 
Set:  test
Approach: pipeline
Model:  rand
b'Final score:            0.27316371692277963'
----------
Final: 
Set:  test
Approach: pipeline
Model:  major
b'Final score:            0.4122845738480068'
----------
Final: 
Set:  test
Approach: pipeline
Model:  transformer
b'Final score:            0.4144337367285661'
----------
Final: 
Set:  test
Approach: pipeline
Mo

In [16]:
print('Unseen domains:')
for _set in ['test']:
    gold_path=os.path.join(path, 'data', 'end2end', _set + '.json')
    gold = json.load(open(gold_path))
    for kind in ['pipeline', 'end2end']:
        for model in ['rand', 'major', 'transformer', 'rnn']:
            if kind == 'end2end' and model in ['rand', 'major']:
                continue
            else:
                p = os.path.join(path, 'results', kind, model, _set + '.out.postprocessed')
                with open(p) as f:
                    y_pred_ = f.read().split('\n')[:-1]
                    
                y_real, y_pred = [], []
                for i, g in enumerate(gold):
                    if g['category'] in unseen_domains:
                        targets = [nltk.word_tokenize(' '.join(target['output'])) for target in g['targets']]
                        t = [' '.join(target).lower() for target in targets]
                        y_real.append(t)
                        pred = ' '.join(nltk.word_tokenize(y_pred_[i])).lower()
                        y_pred.append(pred)


                with open('predictions', 'w') as f:
                    f.write('\n'.join(y_pred))

                with open('reference', 'w') as f:
                    for refs in y_real:
                        for i in range(8):
                            if i < len(refs):
                                f.write(refs[i])
                            else:
                                f.write('')
                            f.write('\n')

                java = '/roaming/tcastrof/workspace/java/jre1.8.0_181/bin/java -Xmx2G -jar '
                java += '/home/tcastrof/workspace/meteor-1.5/meteor-1.5.jar predictions reference -l en -norm -r 8'
                result = subprocess.check_output(java, shell=True)
                print('Final: ')
                print('Set: ', _set)
                print('Approach:', kind)
                print('Model: ', model)
                print(result.split(b'\n')[-2])
                print(10 * '-')

                os.remove('reference')
                os.remove('predictions')

Unseen domains:
Final: 
Set:  test
Approach: pipeline
Model:  rand
b''
----------
Final: 
Set:  test
Approach: pipeline
Model:  major
b'Final score:            0.2193301666864556'
----------
Final: 
Set:  test
Approach: pipeline
Model:  transformer
b'Final score:            0.20948928354092697'
----------
Final: 
Set:  test
Approach: pipeline
Model:  rnn
b'Final score:            0.21778297038185643'
----------
Final: 
Set:  test
Approach: end2end
Model:  transformer
b'Final score:            0.08846673176674488'
----------
Final: 
Set:  test
Approach: end2end
Model:  rnn
b'Final score:            0.08872436586936241'
----------


# Evaluation of Final Texts (Fluency and Semantic)

In [17]:
import numpy as np
import scipy.stats


def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, h, m-h, m+h

path = 'evaluation/human/grades.json'
grades = json.load(open(path))

path = 'evaluation/human/participants.json'
participants = json.load(open(path))

In [18]:
from collections import Counter
grades = json.load(open('evaluation/human/ngrades.json'))
participant_ids = set([w['participant_id'] for w in grades])
print('Number of participants: ', len(participant_ids))

participants = [p for p in participants if p['id'] in participant_ids]

print('***Gender:***')
print(Counter([p['gender'] for p in participants]))
print('***English Proficiency Level:***')
print(Counter([p['english_proficiency_level'] for p in participants]))
print('***Age:***')
print(round(np.mean([int(p['age']) for p in participants]), 2))
print('\n')
    
print('All Domains')
models = set([g['model'] for g in grades])
print('Fluency: ')
for model in ['rand', 'major', 'rnn', 'transformer', 'e2ernn', 'e2etransformer', 'melbourne', 'upfforge', 'original']:
    fluency = [float(g['fluency']) for g in grades if g['model'] == model]# and g['category'] in unseen_domains]
    print('{0}: {1} +-{2}'.format(model, round(np.mean(fluency), 2), round(mean_confidence_interval(fluency)[1], 2)))
print('\n')
print('Semantics: ')
for model in ['rand', 'major', 'rnn', 'transformer', 'e2ernn', 'e2etransformer', 'melbourne', 'upfforge', 'original']:
    semantic = [float(g['semantic']) for g in grades if g['model'] == model]# and g['category'] in unseen_domains]
    print('{0}: {1} +-{2}'.format(model, round(np.mean(semantic), 2), round(mean_confidence_interval(semantic)[1], 2)))

Number of participants:  35
***Gender:***
Counter({'M': 21, 'F': 14})
***English Proficiency Level:***
Counter({'native': 18, 'fluent': 17})
***Age:***
32.29


All Domains
Fluency: 
rand: 4.55 +-0.27
major: 5.0 +-0.23
rnn: 5.31 +-0.23
transformer: 5.03 +-0.25
e2ernn: 4.73 +-0.24
e2etransformer: 5.02 +-0.25
melbourne: 5.04 +-0.22
upfforge: 5.46 +-0.19
original: 5.76 +-0.17


Semantics: 
rand: 4.44 +-0.29
major: 5.02 +-0.24
rnn: 5.21 +-0.23
transformer: 4.87 +-0.27
e2ernn: 4.47 +-0.26
e2etransformer: 4.7 +-0.27
melbourne: 4.94 +-0.24
upfforge: 5.31 +-0.21
original: 5.74 +-0.18


In [19]:
print('Seen Domains')
models = set([g['model'] for g in grades])
print('Fluency: ')
for model in ['rand', 'major', 'rnn', 'transformer', 'e2ernn', 'e2etransformer', 'melbourne', 'upfforge', 'original']:
    fluency = [float(g['fluency']) for g in grades if g['model'] == model and g['category'] not in unseen_domains]
    print('{0}: {1} +-{2}'.format(model, round(np.mean(fluency), 2), round(mean_confidence_interval(fluency)[1], 2)))
print('\n')
print('Semantics: ')
for model in ['rand', 'major', 'rnn', 'transformer', 'e2ernn', 'e2etransformer', 'melbourne', 'upfforge', 'original']:
    semantic = [float(g['semantic']) for g in grades if g['model'] == model and g['category'] not in unseen_domains]
    print('{0}: {1} +-{2}'.format(model, round(np.mean(semantic), 2), round(mean_confidence_interval(semantic)[1], 2)))

Seen Domains
Fluency: 
rand: 4.79 +-0.31
major: 5.25 +-0.24
rnn: 5.51 +-0.25
transformer: 5.53 +-0.24
e2ernn: 5.4 +-0.23
e2etransformer: 5.38 +-0.26
melbourne: 5.23 +-0.27
upfforge: 5.43 +-0.22
original: 5.82 +-0.2


Semantics: 
rand: 4.73 +-0.34
major: 5.41 +-0.24
rnn: 5.48 +-0.25
transformer: 5.49 +-0.27
e2ernn: 5.21 +-0.25
e2etransformer: 5.15 +-0.29
melbourne: 5.33 +-0.28
upfforge: 5.35 +-0.25
original: 5.8 +-0.2


In [20]:
print('Unseen Domains')
models = set([g['model'] for g in grades])
print('Fluency: ')
for model in ['rand', 'major', 'rnn', 'transformer', 'e2ernn', 'e2etransformer', 'melbourne', 'upfforge', 'original']:
    fluency = [float(g['fluency']) for g in grades if g['model'] == model and g['category'] in unseen_domains]
    print('{0}: {1} +-{2}'.format(model, round(np.mean(fluency), 2), round(mean_confidence_interval(fluency)[1], 2)))
print('\n')
print('Semantics: ')
for model in ['rand', 'major', 'rnn', 'transformer', 'e2ernn', 'e2etransformer', 'melbourne', 'upfforge', 'original']:
    semantic = [float(g['semantic']) for g in grades if g['model'] == model and g['category'] in unseen_domains]
    print('{0}: {1} +-{2}'.format(model, round(np.mean(semantic), 2), round(mean_confidence_interval(semantic)[1], 2)))

Unseen Domains
Fluency: 
rand: 4.07 +-0.53
major: 4.49 +-0.48
rnn: 4.91 +-0.48
transformer: 4.05 +-0.5
e2ernn: 3.45 +-0.45
e2etransformer: 4.32 +-0.5
melbourne: 4.65 +-0.39
upfforge: 5.51 +-0.35
original: 5.63 +-0.32


Semantics: 
rand: 3.86 +-0.54
major: 4.25 +-0.49
rnn: 4.67 +-0.47
transformer: 3.64 +-0.49
e2ernn: 3.03 +-0.44
e2etransformer: 3.81 +-0.54
melbourne: 4.15 +-0.43
upfforge: 5.24 +-0.4
original: 5.63 +-0.34


In [21]:
from scipy.stats import mannwhitneyu, wilcoxon

models = ['rand', 'major', 'rnn', 'transformer', 'e2ernn', 'e2etransformer', 'melbourne', 'upfforge', 'original']
for i, model1 in enumerate(models):
    for j, model2 in enumerate(models):
        if model1 != model2:
            fluency1 = [float(g['semantic']) for g in grades if g['model'] == model1]# and g['category'] in unseen_domains]
            fluency2 = [float(g['semantic']) for g in grades if g['model'] == model2]# and g['category'] in unseen_domains]
            print(model1, 'x', model2, ':', round(mannwhitneyu(fluency1, fluency2)[1], 2) < 0.05)

rand x major : True
rand x rnn : True
rand x transformer : True
rand x e2ernn : False
rand x e2etransformer : False
rand x melbourne : True
rand x upfforge : True
rand x original : True
major x rand : True
major x rnn : False
major x transformer : False
major x e2ernn : True
major x e2etransformer : False
major x melbourne : False
major x upfforge : False
major x original : True
rnn x rand : True
rnn x major : False
rnn x transformer : False
rnn x e2ernn : True
rnn x e2etransformer : True
rnn x melbourne : False
rnn x upfforge : False
rnn x original : True
transformer x rand : True
transformer x major : False
transformer x rnn : False
transformer x e2ernn : True
transformer x e2etransformer : False
transformer x melbourne : False
transformer x upfforge : True
transformer x original : True
e2ernn x rand : False
e2ernn x major : True
e2ernn x rnn : True
e2ernn x transformer : True
e2ernn x e2etransformer : False
e2ernn x melbourne : True
e2ernn x upfforge : True
e2ernn x original : True


# Evaluation of Annotations

In [22]:
gold_path='evaluation/questionaire/trials/gold.json'
gold = json.load(open(gold_path))

path='evaluation/questionaire/annotations.json'
annotations = json.load(open(path))

In [23]:
for model in annotations:
    values = {
        'detmistake': 0,
        'fluency': [],
        'moreinformation': 0,
        'numpreds': 0,
        'referencemistake': 0,
        'semantics': [],
        'structurefollowed': 0,
        'verbmistake': 0
    }
    dem = 0
    for trial in annotations[model]:
        g = [g for g in gold if g['eid'] == trial][0]
        category = g['category']
        size = g['size']
#         if category in unseen_domains:
        if annotations[model][trial]['structurefollowed']:
            values['structurefollowed'] += 1
        if annotations[model][trial]['moreinformation']:
            values['moreinformation'] += 1
        if annotations[model][trial]['referencemistake']:
            values['referencemistake'] += 1
        if annotations[model][trial]['verbmistake']:
            values['verbmistake'] += 1
        if annotations[model][trial]['detmistake']:
            values['detmistake'] += 1
        if int(size) == int(annotations[model][trial]['numpreds']):
            values['numpreds'] += 1
        dem += 1
        values['fluency'].append(float(annotations[model][trial]['fluency']))
        values['semantics'].append(float(annotations[model][trial]['semantics']))
    values['structurefollowed'] /= dem
    values['moreinformation'] /= dem
    values['referencemistake'] /= dem
    values['verbmistake'] /= dem
    values['detmistake'] /= dem

    print('Model: ', model)
    print('Structured followed in {0} of the cases'.format(round(values['structurefollowed'], 2)))
    print('More information in {0} of the cases'.format(round(values['moreinformation'], 2)))
    print('Exact number of predicates in {0} out of {1} of the cases ({2})'.format(values['numpreds'], dem, round(values['numpreds']/dem, 2)))
    print('Reference mistakes in {0} of the cases'.format(round(values['referencemistake'], 2)))
    print('Verb mistakes in {0} of the cases'.format(round(values['verbmistake'], 2)))
    print('Determiner mistakes in {0} of the cases'.format(round(values['detmistake'], 2)))
    print('Fluency: {0}'.format(np.mean(values['fluency'])))
    print('Semantics: {0}'.format(np.mean(values['semantics'])))
    print(10 * '-')

Model:  models\model1.xml
Structured followed in 0.35 of the cases
More information in 0.53 of the cases
Exact number of predicates in 29 out of 75 of the cases (0.39)
Reference mistakes in 0.21 of the cases
Verb mistakes in 0.05 of the cases
Determiner mistakes in 0.03 of the cases
Fluency: 5.626666666666667
Semantics: 3.52
----------
Model:  models\model2.xml
Structured followed in 0.33 of the cases
More information in 0.41 of the cases
Exact number of predicates in 35 out of 75 of the cases (0.47)
Reference mistakes in 0.09 of the cases
Verb mistakes in 0.03 of the cases
Determiner mistakes in 0.0 of the cases
Fluency: 4.546666666666667
Semantics: 3.973333333333333
----------
Model:  models\model3.xml
Structured followed in 0.89 of the cases
More information in 0.09 of the cases
Exact number of predicates in 54 out of 75 of the cases (0.72)
Reference mistakes in 0.28 of the cases
Verb mistakes in 0.12 of the cases
Determiner mistakes in 0.09 of the cases
Fluency: 5.746666666666667
S

## Inter-annotator Agreement

In [24]:
import os
from nltk.metrics.agreement import AnnotationTask

def kappa(obs):
    t = AnnotationTask(obs)
    print("\nWeighted kappa as per NLTK:\t", t.weighted_kappa(),
          "\nRegular kappa as per NLTK:\t", t.kappa(),
          "\nKrippendorff alpha as per NLTK:\t", t.alpha(),
          "\n===========================================\n")

path='evaluation/questionaire/observations/'
if not os.path.exists(path):
    os.mkdir(path)
    
ann1_model1 = annotations['models\\model1.xml']
ann2_model1 = annotations['model1.xml']
ann1_model3 = annotations['models\\model3.xml']
ann2_model3 = annotations['model3.xml']

obs_fluency = []
for trial in ann1_model1:
    obs_fluency.append(('ann1', 'model1_'+trial, int(ann1_model1[trial]['fluency'])))
for trial in ann1_model3:
    obs_fluency.append(('ann1', 'model3_'+trial, int(ann1_model3[trial]['fluency'])))
for trial in ann2_model1:
    obs_fluency.append(('ann2', 'model1_'+trial, int(ann2_model1[trial]['fluency'])))
for trial in ann2_model3:
    obs_fluency.append(('ann2', 'model3_'+trial, int(ann2_model3[trial]['fluency'])))

print('Fluency:')
kappa(obs_fluency)

with open(os.path.join(path, 'fluency.csv'), 'w') as f:
    for i, o in enumerate(obs_fluency):
        obs_fluency[i] = list(obs_fluency[i])
        obs_fluency[i][2] = str(obs_fluency[i][2])
        obs_fluency[i] = ','.join(obs_fluency[i])
    f.write('\n'.join(obs_fluency))
########################################################################################################
obs_semantic = []
for trial in ann1_model1:
    obs_semantic.append(('ann1', 'model1_'+trial, int(ann1_model1[trial]['semantics'])))
for trial in ann1_model3:
    obs_semantic.append(('ann1', 'model3_'+trial, int(ann1_model3[trial]['semantics'])))
for trial in ann2_model1:
    obs_semantic.append(('ann2', 'model1_'+trial, int(ann2_model1[trial]['semantics'])))
for trial in ann2_model3:
    obs_semantic.append(('ann2', 'model3_'+trial, int(ann2_model3[trial]['semantics'])))

print('Semantic:')
kappa(obs_semantic)

with open(os.path.join(path, 'semantic.csv'), 'w') as f:
    for i, o in enumerate(obs_semantic):
        obs_semantic[i] = list(obs_semantic[i])
        obs_semantic[i][2] = str(obs_semantic[i][2])
        obs_semantic[i] = ','.join(obs_semantic[i])
    f.write('\n'.join(obs_semantic))
########################################################################################################
obs_preds = []
for trial in ann1_model1:
    obs_preds.append(('1', 'model1_'+trial, int(ann1_model1[trial]['numpreds'])))
for trial in ann1_model3:
    obs_preds.append(('1', 'model3_'+trial, int(ann1_model3[trial]['numpreds'])))
for trial in ann2_model1:
    obs_preds.append(('2', 'model1_'+trial, int(ann2_model1[trial]['numpreds'])))
for trial in ann2_model3:
    obs_preds.append(('2', 'model3_'+trial, int(ann2_model3[trial]['numpreds'])))

print('Predicates:')
kappa(obs_preds)

with open(os.path.join(path, 'predicates.csv'), 'w') as f:
    for i, o in enumerate(obs_preds):
        obs_preds[i] = list(obs_preds[i])
        obs_preds[i][2] = str(obs_preds[i][2])
        obs_preds[i] = ','.join(obs_preds[i])
    f.write('\n'.join(obs_preds))
########################################################################################################
obs = []
for trial in ann1_model1:
    obs.append(('1', 'model1_'+trial, 1 if ann1_model1[trial]['structurefollowed'] else 0))
for trial in ann1_model3:
    obs.append(('1', 'model3_'+trial, 1 if ann1_model3[trial]['structurefollowed'] else 0))
for trial in ann2_model1:
    obs.append(('2', 'model1_'+trial, 1 if ann2_model1[trial]['structurefollowed'] else 0))
for trial in ann2_model3:
    obs.append(('2', 'model3_'+trial, 1 if ann2_model3[trial]['structurefollowed'] else 0))

print('Structure Followed:')
kappa(obs)

with open(os.path.join(path, 'structfollowed.csv'), 'w') as f:
    for i, o in enumerate(obs):
        obs[i] = list(obs[i])
        obs[i][2] = str(obs[i][2])
        obs[i] = ','.join(obs[i])
    f.write('\n'.join(obs))
########################################################################################################
obs = []
for trial in ann1_model1:
    obs.append(('1', 'model1_'+trial, 1 if ann1_model1[trial]['moreinformation'] else 0))
for trial in ann1_model3:
    obs.append(('1', 'model3_'+trial, 1 if ann1_model3[trial]['moreinformation'] else 0))
for trial in ann2_model1:
    obs.append(('2', 'model1_'+trial, 1 if ann2_model1[trial]['moreinformation'] else 0))
for trial in ann2_model3:
    obs.append(('2', 'model3_'+trial, 1 if ann2_model3[trial]['moreinformation'] else 0))

print('Overgeneration:')
kappa(obs)

with open(os.path.join(path, 'overgeneration.csv'), 'w') as f:
    for i, o in enumerate(obs):
        obs[i] = list(obs[i])
        obs[i][2] = str(obs[i][2])
        obs[i] = ','.join(obs[i])
    f.write('\n'.join(obs))
########################################################################################################
obs = []
for trial in ann1_model1:
    obs.append(('1', 'model1_'+trial, 1 if ann1_model1[trial]['verbmistake'] else 0))
for trial in ann1_model3:
    obs.append(('1', 'model3_'+trial, 1 if ann1_model3[trial]['verbmistake'] else 0))
for trial in ann2_model1:
    obs.append(('2', 'model1_'+trial, 1 if ann2_model1[trial]['verbmistake'] else 0))
for trial in ann2_model3:
    obs.append(('2', 'model3_'+trial, 1 if ann2_model3[trial]['verbmistake'] else 0))

print('Verb mistakes:')
kappa(obs)

with open(os.path.join(path, 'verbmistakes.csv'), 'w') as f:
    for i, o in enumerate(obs):
        obs[i] = list(obs[i])
        obs[i][2] = str(obs[i][2])
        obs[i] = ','.join(obs[i])
    f.write('\n'.join(obs))
########################################################################################################
obs = []
for trial in ann1_model1:
    obs.append(('1', 'model1_'+trial, 1 if ann1_model1[trial]['detmistake'] else 0))
for trial in ann1_model3:
    obs.append(('1', 'model3_'+trial, 1 if ann1_model3[trial]['detmistake'] else 0))
for trial in ann2_model1:
    obs.append(('2', 'model1_'+trial, 1 if ann2_model1[trial]['detmistake'] else 0))
for trial in ann2_model3:
    obs.append(('2', 'model3_'+trial, 1 if ann2_model3[trial]['detmistake'] else 0))

print('Determiner mistakes:')
kappa(obs)

with open(os.path.join(path, 'detmistakes.csv'), 'w') as f:
    for i, o in enumerate(obs):
        obs[i] = list(obs[i])
        obs[i][2] = str(obs[i][2])
        obs[i] = ','.join(obs[i])
    f.write('\n'.join(obs))
########################################################################################################
obs = []
for trial in ann1_model1:
    obs.append(('1', 'model1_'+trial, 1 if ann1_model1[trial]['referencemistake'] else 0))
for trial in ann1_model3:
    obs.append(('1', 'model3_'+trial, 1 if ann1_model3[trial]['referencemistake'] else 0))
for trial in ann2_model1:
    obs.append(('2', 'model1_'+trial, 1 if ann2_model1[trial]['referencemistake'] else 0))
for trial in ann2_model3:
    obs.append(('2', 'model3_'+trial, 1 if ann2_model3[trial]['referencemistake'] else 0))

print('Reference mistakes:')
kappa(obs)

with open(os.path.join(path, 'refmistakes.csv'), 'w') as f:
    for i, o in enumerate(obs):
        obs[i] = list(obs[i])
        obs[i][2] = str(obs[i][2])
        obs[i] = ','.join(obs[i])
    f.write('\n'.join(obs))

Fluency:

Weighted kappa as per NLTK:	 0.261904761904762 
Regular kappa as per NLTK:	 0.2619047619047618 
Krippendorff alpha as per NLTK:	 0.2596645367412139 

Semantic:

Weighted kappa as per NLTK:	 0.4459338695263628 
Regular kappa as per NLTK:	 0.4459338695263628 
Krippendorff alpha as per NLTK:	 0.44588252876998946 

Predicates:

Weighted kappa as per NLTK:	 0.7042878265155249 
Regular kappa as per NLTK:	 0.7042878265155249 
Krippendorff alpha as per NLTK:	 0.7052170340955771 

Structure Followed:

Weighted kappa as per NLTK:	 0.7796143250688705 
Regular kappa as per NLTK:	 0.7796143250688705 
Krippendorff alpha as per NLTK:	 0.7797016025050654 

Overgeneration:

Weighted kappa as per NLTK:	 0.5279838165879973 
Regular kappa as per NLTK:	 0.5279838165879974 
Krippendorff alpha as per NLTK:	 0.5243181818181818 

Verb mistakes:

Weighted kappa as per NLTK:	 0.6102449888641426 
Regular kappa as per NLTK:	 0.6102449888641432 
Krippendorff alpha as per NLTK:	 0.6079790222888182 

Determ

In [25]:
def split_triples(text):
    triples, triple = [], []
    for w in text:
        if w not in ['<TRIPLE>', '</TRIPLE>']:
            triple.append(w)
        elif w == '</TRIPLE>':
            triples.append(triple)
            triple = []
    return triples

def ordering_analysis(ordering, gold):
    for i, entry in enumerate(gold):
        triples = split_triples(entry['source'])

        num, visited = 0, []
        for triple in triples:
            for j, predicate in enumerate(ordering[i]):
                if predicate == triple[1] and j not in visited:
                    num += 1
                    visited.append(j)
        # How many predicates in the modified tripleset are present in the result?
        entry['ordering'] = num
    return gold


def structing_analysis(structing, gold):
    for i, entry in enumerate(self.gold):
        triples = split_triples(entry['source'])

        num, visited = 0, []
        for triple in triples:
            for j, predicate in enumerate(structing[i]):
                if predicate == triple[1] and j not in visited:
                    num += 1
                    visited.append(j)
        # How many predicates in the modified tripleset are present in the result?
        entry['structing'] = num
    return gold

In [450]:
path = 'results/questionaire/partial.txt'
with open(path) as f:
    eids = f.read().split()
    
gold_path='evaluation/questionaire/gold.json'
gold = json.load(open(gold_path))

p = 'evaluation/results/pipeline/transformer/test.structing.postprocessed'
with open(p) as f:
    ordering = f.read().split('\n')[:-1]

pos, dem = 0, 0
for i, entry in enumerate(gold):
    if entry['eid'] in eids:
        category = entry['category']
#         if category in unseen_domains:
        triples = split_triples(entry['source'])
        num, visited = 0, []
        for triple in triples:
            for j, predicate in enumerate(ordering[i].split()):
                if predicate == triple[1] and j not in visited:
                    num += 1
                    visited.append(j)
                if len(visited) == len(triples):
                    break
        if len(triples) == len(visited):
            pos += 1
        dem += 1

print('Exact number of predicates in {0} out of {1} of the cases ({2})'.format(pos, dem, round(pos/dem, 2)))

Exact number of predicates in 52 out of 75 of the cases (0.69)
