# Machine Translation Text Preprocessing Pipeline
A preprocessing pipeline for the English-to-Portuguese portion of the European Parliament Proceedings Parallel Corpus (2011). The steps are written as reusable functions so the same pipeline can preprocess other machine translation datasets for model training.


In [1]:
import pickle
import re
import string
import unicodedata
from collections import Counter


In [2]:
# Utility functions.
def load_raw_text(file_name: str):
    """ Read a UTF-8 text file and return its lines as a list of sentences.

    Leading and trailing whitespace of the whole file is stripped before the
    text is split on newline characters, so the file's final newline does not
    produce an empty trailing sentence.
    """
    with open(file_name, 'r', encoding='utf-8') as handle:
        contents = handle.read()

    return contents.strip().split('\n')

def load_clean_text(file_name: str):
    """ Load preprocessed (pickled) text into memory.

    file_name: path to a pickle file, e.g. one written by `pickle.dump`
        on a file opened in binary mode.

    Returns the unpickled object.
    """
    # Pickle data is binary, so the file must be opened with 'rb';
    # text mode makes pickle.load fail.
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(file_name, 'rb') as f:
        text = pickle.load(f)

    return text

def save_clean_text(sentences: list, file_name: str):
    """ Pickle the given list of sentences to `file_name`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_name, 'wb') as out_file:
        pickle.dump(sentences, out_file)

def preprocess_lines(lines: list):
    """ Clean each line and return the cleaned strings in the same order.

    For every line: strip accents by NFD-normalizing and dropping non-ASCII
    characters, split on whitespace, lowercase, delete punctuation and
    non-printable characters, then keep only purely alphabetic tokens
    (so tokens containing digits are dropped). The surviving tokens are
    re-joined with single spaces; one output string is produced per input line.
    """
    # Built once and shared by every line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    non_printable = re.compile('[^%s]' % re.escape(string.printable))

    def _clean(raw_line):
        # Decompose accented characters, then discard whatever is not ASCII.
        ascii_bytes = unicodedata.normalize('NFD', raw_line).encode('ascii', 'ignore')
        ascii_text = ascii_bytes.decode('UTF-8')

        kept = []
        for word in ascii_text.split():
            word = word.lower().translate(strip_punctuation)
            word = non_printable.sub('', word)
            # Drops empty leftovers and anything that still contains a digit.
            if word.isalpha():
                kept.append(word)

        return ' '.join(kept)

    return [_clean(raw_line) for raw_line in lines]

def to_vocab(lines: list):
    """ Count whitespace-separated tokens across all lines.

    Returns a Counter mapping each token to its number of occurrences.
    """
    return Counter(token for sentence in lines for token in sentence.split())

def trim_vocab(vocab: Counter, min_occurence: int):
    """ Return the set of tokens that occur at least `min_occurence` times.

    The minimum occurrence threshold is inclusive: a token whose count
    equals `min_occurence` is kept.
    """
    return {token for token, freq in vocab.items() if freq >= min_occurence}

def encode_oov(lines: list, vocab: set, fill_token: str='unk'):
    """ Replace every out-of-vocabulary token with `fill_token`.

    Each line is split on whitespace; tokens found in `vocab` are kept as-is,
    all others are swapped for `fill_token`. Tokens are re-joined with single
    spaces and one output line is returned per input line.
    """
    return [
        ' '.join(word if word in vocab else fill_token for word in sentence.split())
        for sentence in lines
    ]



In [3]:
# Main data preprocessing function.
def clean_and_save_file(file_name: str, save_name: str, min_occurence: int=5):
    """ Load raw text file, preprocess, and save to desired location.

    file_name: path of the raw text file, one sentence per line.
    save_name: path the pickled, cleaned sentences are written to.
    min_occurence: inclusive minimum number of occurrences a token needs
        to stay in the vocabulary (default 5); rarer tokens are replaced
        by the out-of-vocabulary token.

    Returns the list of cleaned, OOV-encoded sentences that was saved.
    """
    sentences = load_raw_text(file_name)
    lines = preprocess_lines(sentences)

    # The vocabulary is built from this file only, then trimmed of rare tokens.
    full_vocab = to_vocab(lines)
    vocab = trim_vocab(full_vocab, min_occurence)

    encoded_lines = encode_oov(lines, vocab)
    save_clean_text(encoded_lines, save_name)

    return encoded_lines


In [4]:
# Preprocess and re-save data.
# Each side of the parallel corpus is cleaned and pickled separately.
english = clean_and_save_file('data/europarl-v7.pt-en.en', 'data/english.pkl')
portuguese = clean_and_save_file('data/europarl-v7.pt-en.pt', 'data/portuguese.pkl')

# Alternative: reload the previously saved pickles instead of re-running the preprocessing.
# english = load_clean_text('data/english.pkl')
# portuguese = load_clean_text('data/portuguese.pkl')


In [5]:
# Compare sentences.
# Print the first three cleaned sentences of each language, pairing them by index
# (presumably the two corpus files are line-aligned - verify against the dataset).
for i in range(3):
    print(f'Sentence in Portuguese: {portuguese[i]}')
    print(f'Sentence in English: {english[i]}\n')


Sentence in Portuguese: reinicio da sessao
Sentence in English: resumption of the session

Sentence in Portuguese: declaro reaberta a sessao do parlamento europeu que tinha sido interrompida na sextafeira de dezembro ultimo e renovo todos os meus votos esperando que tenham tido boas ferias
Sentence in English: i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period

Sentence in Portuguese: como puderam constatar o grande bug do ano nao aconteceu em contrapartida os cidadaos de alguns dos nossos paises foram vitimas de catastrofes naturais verdadeiramente terriveis
Sentence in English: although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful

