# Preprocess Notebook

This notebook is dedicated to preprocessing the data. The input is a .tmx file. The steps currently performed are:
1. Extract lines
2. Store surplus
3. Pair lines

## Imports
Below are all the libraries used for this notebook

In [3]:
import re
import pickle
import copy

## Cleaning the data
This function includes all cleaning steps for the data. It reads the .tmx file, extracts the lines that are of interest to the model, removes unnecessary tags, and pairs the lines in English-Dutch order. Afterwards, it writes the data to a pickle file for further use. A pickle-reading function with an optional limiter is also included.

In [1]:
def clean(input_location: str, output_location: str, chunk_size = 1048576):
    """Read a file in chunks, extract paired lines and pickle them.

    The input is read ``chunk_size`` characters at a time. Each chunk is
    prefixed with the text left over from the previous chunk, passed to
    ``getLines`` and the extracted lines are paired by ``pair``. Every
    non-empty batch of pairs is appended to the output file with its own
    ``pickle.dump`` call, so the output holds a sequence of pickled lists.

    Args:
        input_location: Path of the file to read (opened in text mode).
        output_location: File name of the pickle; it is created inside the
            ``data/`` directory.
        chunk_size: Number of characters read per chunk. The default is
            1048576 (roughly one megabyte of text).

    Raises:
        Exception: If the last complete pair is not in "en", "nl" order
            (same condition and message ``pair`` uses).
    """

    waiting_single = [[], []]              # Lines (and tags) waiting for a partner
    extra = " "                            # Text carried over between chunks

    # Context managers guarantee both streams are closed, even on errors.
    with open(input_location, "r") as input_file, \
            open(f"data/{output_location}", "wb") as output_file:
        chunk = input_file.read(chunk_size)
        while chunk:
            lines, extra = getLines(extra + chunk)
            # Without any complete line, keep accumulating text in `extra`.
            if lines:
                waiting_single, pairs = pair(waiting_single, lines)
                pickle.dump(pairs, output_file)
            chunk = input_file.read(chunk_size)

        # `pair` only releases a pair once the following line arrives, so
        # the last complete pair is still waiting here; write it out instead
        # of silently dropping it. A single unpaired line is left out.
        if len(waiting_single[0]) == 2:
            if f"{waiting_single[1][0]}{waiting_single[1][1]}" != "ennl":
                raise Exception("Line mismatch occured")
            pickle.dump([waiting_single[0]], output_file)
        
       

In [None]:
def pair(temp: list, lines: list) -> tuple:
    """Group consecutive lines into pairs, checking the tag order.

    Args:
        temp: Pending state as ``[pending_lines, pending_tags]``, holding
            the lines (and their tags) that have not been released yet.
            It is not modified; the new state is returned instead.
        lines: Iterable of ``(tag, line)`` tuples to add.

    Returns:
        A tuple ``(new_temp, results)``. ``results`` is the list of
        completed ``[first_line, second_line]`` pairs. ``new_temp`` has the
        same layout as ``temp``. A pair is only released when the next line
        arrives, so ``new_temp`` may still hold one complete pair.

    Raises:
        Exception: If a completed pair's tags are not "en" followed by "nl".
    """

    # Work on copies so the caller's lists are never mutated.
    pending_lines = list(temp[0])
    pending_tags = list(temp[1])

    results = []
    for tag, line in lines:
        if len(pending_lines) == 2:
            if f"{pending_tags[0]}{pending_tags[1]}" != "ennl":
                # The pair is wrongly ordered: show the state, then abort.
                print([pending_lines, pending_tags])
                print(results)
                raise Exception("Line mismatch occured")
            # The pair is correct: release it and start a new one.
            results.append(pending_lines)
            pending_lines = []
            pending_tags = []
        # Add the current line to the pair being built.
        pending_lines.append(line)
        pending_tags.append(tag)
    return ([pending_lines, pending_tags], results)
  

In [None]:
def getLines(temp_str: str) -> tuple:
    """Extract the tagged lines from a string.

    Every ``<tuv ... </seg>`` span on a single line is treated as one
    entry (``.`` does not match newlines in the pattern).

    Args:
        temp_str: The text to scan.

    Returns:
        A tuple ``(lines, remnants)``. ``lines`` is a list of
        ``(tag, text)`` tuples and ``remnants`` is everything in
        ``temp_str`` after the last match (an empty string when the last
        match ends the input). When nothing matches, ``lines`` is empty
        and ``remnants`` is ``temp_str`` unchanged.
    """

    matches = list(re.finditer(r"<tuv.+?(?=</seg>)</seg>", temp_str))
    # If there are no lines to find, pass the string back as a whole
    if not matches:
        return ([], temp_str)

    # The match object knows where the last line ends, so the leftover
    # text is a plain slice; this also covers a match at the very end.
    remnants = temp_str[matches[-1].end():]

    # Erases the parts we are not interested in. The fixed offsets assume
    # every entry starts with `<tuv xml:lang="xx"><seg>` (two-letter tag at
    # 15:17, text from 24) and ends with `</seg>` (6 characters).
    lines = [(found.group()[15:17], found.group()[24:-6]) for found in matches]
    return (lines, remnants)


In [5]:
def readPickle(pickle_off: str, limiter = -1):
    """Load the objects stored one after another in a pickle file.

    Args:
        pickle_off: Path of the pickle file to read.
        limiter: Maximum number of pickled objects to load. With the
            default of -1 the counter never reaches zero, so loading
            continues until the end of the file.

    Returns:
        A list with the loaded objects, in file order.
    """
    objects = []
    # NOTE: unpickling can execute arbitrary code; only read trusted files.
    # The context manager closes the file once reading is done.
    with open(pickle_off, "rb") as file_stream:
        while limiter != 0:
            try:
                objects.append(pickle.load(file_stream))
            except EOFError:
                # End of file: every stored object has been read.
                break
            limiter -= 1
    return objects