In [79]:
import json
from nltk.translate import nist_score, bleu_score

# Source-language codes evaluated for each MT system; each code is
# interpolated into the name of the translated test-set file to load.
helsinki_src = [
    "en", "de", "ru", "lt", "uk", "fr",
]
yandex_src = [
    "en", "de", "ru", "lt", "uk", "arm", "be", "fr",
]

In [59]:
# Quick sanity check of both metrics on a toy sentence.
# NOTE(review): the references and the hypothesis are passed as raw strings,
# so NLTK iterates them character by character and these are character
# n-gram scores. Word-level scores would need tokenised inputs
# (lists of tokens) - confirm which one is intended.
demo_hypothesis = 'Hi how are you'
demo_nist = nist_score.sentence_nist(['Hello how are you', 'Hey how are you'], demo_hypothesis)
demo_bleu = bleu_score.sentence_bleu(['Hello how are you'], demo_hypothesis)
print(demo_nist)  # the higher the better
print(demo_bleu)  # from 0 to 1

3.210584896448878
0.6905103676445132


In [73]:
def read_json(filename):
    """Load and return the JSON document stored at ``filename``.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read
    the same way on every platform instead of depending on the locale's
    default encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)
    
def write_json(data, filename):
    """Serialise ``data`` as pretty-printed JSON into ``filename``.

    ``ensure_ascii=False`` keeps non-ASCII characters readable in the file,
    so the file is opened explicitly as UTF-8; with the locale default
    encoding the write could fail or produce a non-UTF-8 file on some
    platforms. An existing file is overwritten.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError: if ``data`` contains objects that are not JSON serialisable.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
        
def get_tgt_langs(data):
    """Return the distinct language codes of the first question in ``data``.

    Only the first entry of ``data['questions']`` is inspected. The codes are
    returned in order of first appearance; an empty list is returned when
    there are no questions.
    """
    missing = object()
    first_question = next(iter(data['questions']), missing)
    if first_question is missing:
        return []

    languages = []
    for variant in first_question['question']:
        code = variant['language']
        if code in languages:
            continue
        languages.append(code)
    return languages

def get_representations(data, id_, lang):
    """Return the question strings in language ``lang`` for question ``id_``.

    Only the first question whose ``'id'`` equals ``id_`` is considered. The
    result is empty when no question has that id or when the question has no
    entry in the requested language.
    """
    missing = object()
    matched = next(
        (question for question in data['questions'] if question['id'] == id_),
        missing,
    )
    if matched is missing:
        return []
    return [
        variant['string']
        for variant in matched['question']
        if variant['language'] == lang
    ]

In [74]:
# Load the test set that the scoring cells below use as the reference side
# when comparing against the machine-translated files.
test = read_json("../data/qald_test_wikidata.json")

# Helsinki NLP

In [75]:
# TODO: src-tgt storage
# Collect sentence-level BLEU and NIST scores for the Helsinki translations,
# stored as <metric>_dict[source language][target language] -> list of scores.
# NOTE(review): the references and the hypothesis are passed to NLTK as raw
# strings, so they are iterated character by character and these are
# character n-gram scores. Confirm that is intended; word-level scoring would
# need tokenised inputs.
bleu_dict = {}
nist_dict = {}

for lang in helsinki_src:
    translated_test = read_json(f"../data/translated/qald_test_wikidata-helsinki-{lang}.json")
    tgt_langs = get_tgt_langs(translated_test)

    # One empty score list per target language found in the translated file.
    bleu_dict[lang] = {tgt: [] for tgt in tgt_langs}
    nist_dict[lang] = {tgt: [] for tgt in tgt_langs}

    for q in test['questions']:
        for tgt in tgt_langs:
            representations = get_representations(translated_test, q['id'], tgt)
            original = get_representations(test, q['id'], tgt)
            # Score only when both the reference and the translation exist.
            if not (original and representations):
                continue
            bleu_dict[lang][tgt].append(
                bleu_score.sentence_bleu(original, representations[0])
            )
            nist_dict[lang][tgt].append(
                nist_score.sentence_nist(original, representations[0])
            )

In [76]:
# Print the mean sentence-level BLEU and NIST for every Helsinki
# source -> target pair.
for lang in helsinki_src:
    # bleu_dict[lang] was keyed by target language, in order, when the scores
    # were collected, so the translated JSON file does not need to be read
    # and parsed again just to recover the target-language list.
    for tgt, bleu_scores in bleu_dict[lang].items():
        nist_scores = nist_dict[lang][tgt]
        print('----------', lang, tgt, '--------------')
        if bleu_scores:
            print("BLEU:", sum(bleu_scores)/len(bleu_scores))
            print("NIST:", sum(nist_scores)/len(nist_scores))
        else:
            # No question had both a reference and a translation in `tgt`.
            print("No intersections between orig and translated")

---------- en de --------------
BLEU: 0.813971880742722
NIST: 4.78632143874278
---------- en ru --------------
BLEU: 0.7787394170451227
NIST: 4.606125989365795
---------- en fr --------------
BLEU: 0.8147455169784221
NIST: 4.786865970304578
---------- de en --------------
BLEU: 0.801532976638642
NIST: 4.664033077409502
---------- de fr --------------
BLEU: 0.7146861165126847
NIST: 4.258634455650803
---------- ru en --------------
BLEU: 0.6785707153164158
NIST: 4.118782526959039
---------- ru fr --------------
BLEU: 0.649403808375033
NIST: 4.1641245068828034
---------- lt de --------------
BLEU: 0.7434033882375212
NIST: 4.500106619640833
---------- lt ru --------------
BLEU: 0.5406532060094749
NIST: 3.549840885790017
---------- lt fr --------------
No intersections between orig and translated
---------- uk en --------------
BLEU: 0.6648348759282237
NIST: 4.0733389585696855
---------- uk de --------------
BLEU: 0.6452960243937094
NIST: 4.138602630102774
---------- uk ru --------------
BL

# Yandex

In [80]:
# TODO: src-tgt storage
# Collect sentence-level BLEU and NIST scores for the Yandex translations,
# stored as <metric>_dict[source language][target language] -> list of scores.
# NOTE: bleu_dict / nist_dict are rebound here, so the Helsinki scores held
# under the same names are discarded once this cell runs.
# NOTE(review): the references and the hypothesis are passed to NLTK as raw
# strings, so they are iterated character by character and these are
# character n-gram scores. Confirm that is intended; word-level scoring would
# need tokenised inputs.
bleu_dict = {}
nist_dict = {}

for lang in yandex_src:
    translated_test = read_json(f"../data/translated/qald_test_wikidata-yandex-{lang}.json")
    tgt_langs = get_tgt_langs(translated_test)

    # One empty score list per target language found in the translated file.
    bleu_dict[lang] = {tgt: [] for tgt in tgt_langs}
    nist_dict[lang] = {tgt: [] for tgt in tgt_langs}

    for q in test['questions']:
        for tgt in tgt_langs:
            representations = get_representations(translated_test, q['id'], tgt)
            original = get_representations(test, q['id'], tgt)
            # Score only when both the reference and the translation exist.
            if not (original and representations):
                continue
            bleu_dict[lang][tgt].append(
                bleu_score.sentence_bleu(original, representations[0])
            )
            nist_dict[lang][tgt].append(
                nist_score.sentence_nist(original, representations[0])
            )

In [83]:
# Print the mean sentence-level BLEU and NIST for every Yandex
# source -> target pair.
for lang in yandex_src:
    # bleu_dict[lang] was keyed by target language, in order, when the scores
    # were collected, so the translated JSON file does not need to be read
    # and parsed again just to recover the target-language list.
    for tgt, bleu_scores in bleu_dict[lang].items():
        nist_scores = nist_dict[lang][tgt]
        print('----------', lang, tgt, '--------------')
        if bleu_scores:
            print("BLEU:", sum(bleu_scores)/len(bleu_scores))
            print("NIST:", sum(nist_scores)/len(nist_scores))
        else:
            # No question had both a reference and a translation in `tgt`.
            print("No intersections between orig and translated")

---------- en de --------------
BLEU: 0.8141518802015169
NIST: 4.7828238355334145
---------- en ru --------------
BLEU: 0.8517597530260896
NIST: 4.979094654560852
---------- en fr --------------
BLEU: 0.8136758804648995
NIST: 4.772983002996348
---------- de en --------------
BLEU: 0.8108490755275085
NIST: 4.623708143180468
---------- de ru --------------
BLEU: 0.8088124514142527
NIST: 4.755850275800683
---------- de fr --------------
BLEU: 0.7446407866061823
NIST: 4.470021438181341
---------- ru en --------------
BLEU: 0.7387059349438695
NIST: 4.353732502717963
---------- ru de --------------
BLEU: 0.701347457356762
NIST: 4.317866517765324
---------- ru fr --------------
BLEU: 0.7465184850819773
NIST: 4.618002399910466
---------- lt en --------------
BLEU: 0.7602917944970393
NIST: 4.323505553992809
---------- lt de --------------
BLEU: 0.7568220874048477
NIST: 4.522628359401878
---------- lt ru --------------
BLEU: 0.4414537379875299
NIST: 3.2700853650068935
---------- lt fr ----------