# Investigate BAD-tag approaches

In [None]:
# Language pair substituted into the dataset paths built further down
LANGUAGE_PAIR = "en-de"

In [None]:
import codecs
import json

def read_file(file_path):
    """
    Read a UTF-8 text file and return its lines as a list of strings,
    each with trailing whitespace (including the line ending) removed
    """
    with codecs.open(file_path, mode='r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    # Stripping happens after the file is closed; only the right side is
    # trimmed, so leading whitespace is kept
    return [raw_line.rstrip() for raw_line in raw_lines]
   
def read_error_detail(file_path):
    """
    Read a UTF-8 file holding one JSON document per line and return the
    list of parsed values, in file order
    """
    parsed_lines = []
    with codecs.open(file_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle.readlines():
            # Surrounding whitespace (including the line ending) is removed
            # before parsing
            parsed_lines.append(json.loads(raw_line.strip()))
    return parsed_lines

def red(string):
    """
    Wrap string in the ANSI escape sequences "\\033[31m" and "\\033[0m"
    (red foreground, then reset)
    """
    ansi_red = "\033[31m"
    ansi_reset = "\033[0m"
    # %-formatting is used so the argument is converted exactly as "%s" does
    return (ansi_red + "%s" + ansi_reset) % string

def display(tokens, tags=None):
    """
    Print the tokens on one line, separated by spaces, with every token
    whose tag is 'BAD' highlighted through red()

    Same number of tags as tokens. When tags is None (or empty) the tokens
    are printed without any highlighting. tags is indexed by token position,
    so an IndexError is raised if it is non-empty but shorter than tokens.
    """
    # Named `rendered` rather than `display` to avoid shadowing this function
    rendered = []
    for word_index, token in enumerate(tokens):
        if tags and tags[word_index] == 'BAD':
            rendered.append(red(token))
        else:
            rendered.append(token)
    # Call form with a single argument prints the same text on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3
    print(" ".join(rendered))

def display_v001(tokens, tags):
    """
    Print the tokens on one line, separated by spaces, highlighting BAD
    words and BAD gaps through red()

    Layout of tags as read here: tags[0] is the gap before the first token;
    after it the tags alternate word tag, gap tag (the gap following that
    word). That is 2 * len(tokens) + 1 tags in total; any extra tags are
    ignored. A 'BAD' gap is shown as a red '___', any other gap tag prints
    nothing.

    An IndexError is raised if tags holds fewer entries than are read.
    """
    # Initial OK/BAD gap
    rendered = [red('___')] if tags[0] == 'BAD' else []

    # Separate word and gap tags (odd positions are words, even positions
    # from 2 onwards are the gaps that follow them)
    word_tags = tags[1::2]
    gap_tags = tags[2::2]

    for word_index, token in enumerate(tokens):
        # Word tag
        if word_tags[word_index] == 'BAD':
            rendered.append(red(token))
        else:
            rendered.append(token)
        # Gap tag
        if gap_tags[word_index] == 'BAD':
            rendered.append(red('___'))
    # Call form with a single argument prints the same text on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3
    print(" ".join(rendered))

## WMT 2017 Data

In [None]:
# Folder with the WMT 2017 task 2 training files for the selected language pair
wmt2017 = '/mnt/data/datasets/WMT2017/WMT2017/task2_%s_training/' % LANGUAGE_PAIR
# Folder with the v0.0.1 tags and error details for the same language pair
tags_v001 = '/home/ramon/redefine_word_qe/DATA/temporal_files/task2_%s_training/' % LANGUAGE_PAIR
# Data: one whitespace-split token list per line of train.src / train.mt / train.pe
source_tokens = [x.split() for x in read_file("%s/train.src" % wmt2017)]
mt_tokens = [x.split() for x in read_file("%s/train.mt" % wmt2017)]
pe_tokens = [x.split() for x in read_file("%s/train.pe" % wmt2017)]
# Tags v0.0.1: one whitespace-split tag list per line
# To generate this data see the redefine_word_qe repository
source_tags = [x.split() for x in read_file("%s/train.source_tags" % tags_v001)]
target_tags = [x.split() for x in read_file("%s/train.tags" % tags_v001)]
# Error detail: one parsed JSON value per line of train.json
error_detail = read_error_detail("%s/train.json" % tags_v001)

### Check error types

In [None]:
# Get an example of this: the index of the third sentence (position [2] among
# the matches) that has at least one error of type 'deletion (shift)'.
# any() receives a generator so the scan of a sentence's errors stops at the
# first match instead of building a full list first.
[
    index
    for index, errors in enumerate(error_detail)
    if any(error['type'] == 'deletion (shift)' for error in errors)
][2]

### Some interesting examples (for en_de train)
* `index = 1`: Normal case (substitution). Note that, by coincidence, the source-PE alignments for that word are empty.
* `index = 2`: Deletions
* `index = 3`: BAD gap at the beginning of the sentence
* `index = 10`: Case where the word match heuristic fails (it is not a word relocation)
* `index = 16`: Case where the word match heuristic works for "beziehen" and fails for "das"

In [None]:
# Position of the sentence to inspect (see the list of interesting examples)
index = 1
# Show the error detail entry stored for that sentence
error_detail[index]

In [None]:
# Header naming the three lines printed below, in order
print("Source/PE/MT")
# Source sentence, highlighting the tokens tagged BAD in source_tags
display(source_tokens[index], source_tags[index])
# PE sentence, printed without tags (no highlighting)
display(pe_tokens[index])
# MT sentence with the v0.0.1 target tags, which also mark BAD gaps
display_v001(mt_tokens[index], target_tags[index])