# Investigate BAD-tag approaches

In [1]:
# Random sample of specific error types
import numpy as np

In [3]:
import codecs
import io
import json
import os

def read_file(file_path):
    """
    Read a UTF-8 text file and return its lines as a list of unicode strings
    with trailing whitespace (including the newline) removed.

    file_path: path of the file to read.

    Lines are split on newline characters only. The previous
    codecs.open(...).readlines() implementation also split on every Unicode
    line boundary (e.g. U+2028, U+0085, form feed), so a single sentence
    containing such a character became two entries and shifted every
    following index with respect to the parallel files.
    """
    with io.open(file_path, 'r', encoding='utf-8') as fid:
        return [line.rstrip() for line in fid]
   
def read_error_detail(file_path):
    """
    Read a UTF-8 file holding one JSON document per line and return the list
    of decoded documents, in file order.

    file_path: path of the JSON-lines file to read.

    Raises ValueError (from json.loads) if a line is not valid JSON; blank
    lines are not skipped so that list positions keep matching line numbers.

    Lines are split on newline characters only. The previous
    codecs.open(...).readlines() implementation also split on Unicode line
    boundaries such as U+2028, which cut a JSON record in two whenever one of
    those characters appeared unescaped inside a string.
    """
    with io.open(file_path, 'r', encoding='utf-8') as fid:
        return [json.loads(line.strip()) for line in fid]

def red(string):
    """
    Return `string` wrapped in the ANSI escape sequences that switch the
    terminal foreground colour to red and then reset it.
    """
    colored = "\033[31m%s\033[0m" % string
    return colored

def display(tokens, tags=None):
    """
    Print `tokens` on a single line, separated by spaces, colouring in red
    every token whose tag is 'BAD'.

    tokens: list of token strings.
    tags: optional list with one tag per token (same number of tags as
        tokens). When None or empty, nothing is highlighted.

    Raises IndexError if `tags` is non-empty but shorter than `tokens`.
    """
    # The accumulator is not called `display` so it no longer shadows this
    # function's own name inside its body.
    rendered = []
    for word_index, token in enumerate(tokens):
        if tags and tags[word_index] == 'BAD':
            rendered.append(red(token))
        else:
            rendered.append(token)
    # A parenthesised single argument behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print(" ".join(rendered))

def display_v001(tokens, tags):
    """
    Print `tokens` on a single line with word-level and gap-level 'BAD' tags
    highlighted in red.

    tokens: list of token strings.
    tags: list of 2 * len(tokens) + 1 tags laid out as
        [gap_0, word_1, gap_1, word_2, gap_2, ...]
        i.e. tags[0] is the gap before the first token and every token is
        followed by its own tag and the tag of the gap after it.

    A 'BAD' word tag prints the token in red; a 'BAD' gap tag prints a red
    '___' placeholder at that position. An empty `tags` list is accepted for
    an empty sentence (prints an empty line).

    Raises IndexError if `tags` holds fewer word or gap tags than there are
    tokens.
    """
    rendered = []

    # Initial OK/BAD gap (guarded so an empty sentence does not crash)
    if tags and tags[0] == 'BAD':
        rendered.append(red('___'))

    # Separate word and gap tags: after the initial gap they alternate
    word_tags = tags[1::2]
    gap_tags = tags[2::2]

    for word_index, token in enumerate(tokens):
        # Word tag
        if word_tags[word_index] == 'BAD':
            rendered.append(red(token))
        else:
            rendered.append(token)
        # Gap tag
        if gap_tags[word_index] == 'BAD':
            rendered.append(red('___'))
    # A parenthesised single argument behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print(" ".join(rendered))
    
    
def read_alignments(alignments_file):
    """
    Parse a word-alignment file.

    Each line holds whitespace-separated items of the form "i-j"; every item
    is converted to a list of ints. Returns one list of such pairs per line
    of the file (an empty list for a blank line).
    """
    per_line = []
    with open(alignments_file, 'r') as fid:
        for raw_line in fid:
            pairs = []
            for item in raw_line.rstrip().split():
                pairs.append([int(position) for position in item.split('-')])
            per_line.append(pairs)
    return per_line

## WMT 2018 Data

In [4]:
# Available language pair / engine combinations:
# de-en.smt en-cs.smt en-de.nmt en-de.smt en-lv.nmt en-lv.smt
language_engine = 'en-de.smt'
# Data split to load; also used as the file-name stem inside the folder
sset = 'train'
# Root folder of the WMT2018 data. Set the WMT2018_ROOT environment variable
# to run the notebook on another machine; the default is the original
# hard-coded location, so existing setups are unaffected.
wmt2018_root = os.environ.get('WMT2018_ROOT', '/home/ramon/redefine_word_qe/DATA/WMT2018')
# Folder of the selected task (keeps its trailing slash, as before)
wmt2018 = '%s/task2_%s_%s/' % (wmt2018_root, language_engine, sset)

In [5]:
# Data
# Every list below has one entry per line of its file; the lists are indexed
# together by sentence index further down, so the files are presumably
# parallel (same number of lines) -- TODO confirm.
# Whitespace-tokenised source, machine translation and post-edited sentences.
source_tokens = [x.split() for x in read_file("%s/%s.src" % (wmt2018, sset))]
mt_tokens = [x.split() for x in read_file("%s/%s.mt" % (wmt2018, sset))]
pe_tokens = [x.split() for x in read_file("%s/%s.pe" % (wmt2018, sset))]
# Tags v0.0.1
# To generate this data see redefine_word_qe repository
# source_tags are shown with display() (one tag per source token) and
# target_tags with display_v001() (word and gap tags interleaved).
source_tags = [x.split() for x in read_file("%s/%s.source_tags" % (wmt2018, sset))]
target_tags = [x.split() for x in read_file("%s/%s.tags" % (wmt2018, sset))]
# Details not available to participants
# Per sentence: list of [int, int] pairs parsed from "i-j" items
# (presumably [source index, MT index] -- verify against the generator).
source_mt_alignments = read_alignments("%s/%s.src-mt.alignments" % (wmt2018, sset))
# Per sentence: the JSON document decoded from the matching line of the file;
# it is iterated below as a list of error dicts with a 'type' key.
error_details = read_error_detail("%s/%s.json" % (wmt2018, sset))

In [6]:
# NOTE(review): Counter is not used in the visible part of the notebook.
from collections import Counter, defaultdict
# Map each error type (the 'type' field of an error record) to the set of
# sentence indices that contain at least one error of that type.
indices_by_error = defaultdict(set)
for index, error_detail in enumerate(error_details):
    # error_detail holds all error records of sentence `index`
    for error in error_detail:
        indices_by_error[error['type']].add(index)

### Check error types

In [9]:
# Candidate sentence indices. Swap in the commented line to sample only
# sentences that contain a given error type.
#indices = list(indices_by_error[u'deletion'])
indices = range(len(mt_tokens))
# Pick one candidate uniformly at random. choice() without a size returns a
# scalar, which avoids int() on a 1-element array (deprecated in NumPy 1.25).
index = indices[int(np.random.choice(len(indices)))]
# Parenthesised single-argument prints run unchanged on Python 2 and 3.
print("Source / Post-Edited / MT (%s)" % index)
for error in error_details[index]:
    print(error)
print("----------------------------------------")
# Source with word tags, plain post-edit, MT with word and gap tags.
# The display helpers only read their arguments, so no copies are needed.
display(source_tokens[index], source_tags[index])
display(pe_tokens[index])
display_v001(mt_tokens[index], target_tags[index])

Source / Post-Edited / MT (23312)
{u'target_position': 0, u'source_positions': None, u'type': u'insertion'}
{u'target_position': 1, u'source_positions': None, u'type': u'insertion'}
{u'target_position': 2, u'source_positions': [], u'type': u'substitution'}
{u'target_position': 4, u'source_positions': [2], u'type': u'substitution'}
{u'target_position': None, u'source_positions': [0], u'type': u'deletion', u'gap_position': 9}
{u'target_position': None, u'source_positions': [5], u'type': u'deletion', u'gap_position': 9}
{u'target_position': None, u'source_positions': [], u'type': u'deletion', u'gap_position': 9}
{u'target_position': None, u'source_positions': [], u'type': u'deletion', u'gap_position': 9}
{u'target_position': 10, u'source_positions': [7], u'type': u'substitution'}
----------------------------------------
[31mthen[0m , [31mOff[0m or Low [31mshould[0m produce [31mgood[0m results .
mit " Deaktiviert " oder " Niedrig " lassen sich in der Regel gute Ergebnisse erzielen 