In [85]:
import os
import pandas as pd
import string
import re

{'German:\n': 'SOURCE ', 'English:\n': 'TARGET '}

In [201]:
# Punctuation characters that survive cleaning by default.
default_ip = '!?.'
# Header text -> marker inserted in its place. clean_text later splits on the
# literal markers "SOURCE" and "SPLIT", so custom dictionaries must map to them.
default_st_tagging = {"German:\n": "SOURCE ", "English:\n": "SPLIT "}
default_rm_tagging = {"Roots in English: ": "SPLIT ", "Modifiers in English: ": "SPLIT "}


def _apply_tagging(text, tagging):
    """Replace every key of `tagging` found in `text` (case-insensitively) with its mapped value."""
    if not tagging:
        return text
    # Same alternation order as sorting the escaped keys.
    keys = sorted(tagging, key=re.escape)
    # One capture group per key: the group that matched identifies the key even
    # when the matched text differs in case from the dictionary key.
    pattern = '|'.join('(' + re.escape(key) + ')' for key in keys)
    return re.sub(pattern, lambda m: tagging[keys[m.lastindex - 1]], text, flags=re.IGNORECASE)


def clean_text(file_path, format="labeled", ignored_punctuation=default_ip, source_target_tagging=default_st_tagging, root_modifier_tagging=default_rm_tagging):
    """
    Read a data file and split it into cleaned, per-record text lists.

    Assumes a strict order of German, then English, inside every record.
    Each returned source/target entry is one string that may hold several
    sentences (not a list of words), which should be better for the
    Hugging Face interface.

    Cleaning replaces the tag headers with the markers "SOURCE"/"SPLIT", turns
    newlines into spaces, removes all punctuation except `ignored_punctuation`
    (plus parentheses while parsing the "unlabeled" format), then splits on the
    literal markers. Text that itself contains "SOURCE" or "SPLIT" is therefore
    split at those words as well.

    @param file_path: path to file holding text
    @param format: "labeled" returns source and target lists, while "unlabeled"
    returns source and root lists plus a list of modifier tuples per record
    @param ignored_punctuation: string or list of punctuation characters to keep;
    it is never modified
    @param source_target_tagging: dictionary for tagging what part of text is target and source
    @param root_modifier_tagging: dictionary for tagging what part of text is root and modifiers
    @return: (source_list, target_list) for "labeled";
    (source_list, root_list, modifier_list) for "unlabeled", where each root entry
    is a list of words and each modifier entry is a list of word tuples
    @raise ValueError: if `format` is neither "labeled" nor "unlabeled"
    @raise IndexError: if a record lacks one of the expected "SPLIT" sections
    """
    # Validate before touching the file so a typo fails fast with a clear error.
    if format not in ("labeled", "unlabeled"):
        raise ValueError("Unknown format {!r}: expected 'labeled' or 'unlabeled'".format(format))

    # read file
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # initial cleaning
    tagged_text = _apply_tagging(text, source_target_tagging)
    # Work on a copy so a caller-supplied list is not extended in place.
    kept_punctuation = set(ignored_punctuation)
    if format == "unlabeled":
        tagged_text = _apply_tagging(tagged_text, root_modifier_tagging)
        # Parentheses delimit the modifier groups and are needed until parsing is done.
        kept_punctuation |= {'(', ')'}

    translation = {'\n': ' '}
    translation.update({p: '' for p in string.punctuation if p not in kept_punctuation})
    cleaned = tagged_text.translate(str.maketrans(translation))

    # Everything before the first "SOURCE" marker is preamble and is dropped.
    records = [
        [part.strip() for part in record.split("SPLIT")]
        for record in cleaned.split("SOURCE")[1:]
    ]

    # reorganization
    if format == "labeled":
        source_list = [record[0] for record in records]
        target_list = [record[1] for record in records]
        return source_list, target_list

    strip_parens = str.maketrans({"(": "", ")": ""})
    source_list = list()
    root_list = list()
    modifier_list = list()
    for record in records:
        source_list.append(record[0].translate(strip_parens))
        root_list.append(record[1].split(' '))
        # Splitting on either parenthesis leaves the group contents at the odd indices.
        groups = re.split(r'[()]', record[2])[1::2]
        modifier_list.append([tuple(group.split(' ')) for group in groups])
    return source_list, root_list, modifier_list

In [214]:
def check_file_statistics(filepath):
    """
    Print descriptive statistics of the per-record lengths found in a data file.

    The file format is taken from the text after the last '.' in the path and
    must be "labeled" or "unlabeled". The file is parsed with clean_text and the
    pandas `describe()` summary of the collected lengths is printed.

    @param filepath: path (string or os.PathLike) of a ".labeled" or ".unlabeled" file
    @return: None, the statistics are printed
    @raise ValueError: if the path does not end in ".labeled" or ".unlabeled"
    """
    file_format = os.fspath(filepath).split('.')[-1]
    # Reject unsupported files before any parsing is attempted.
    if file_format not in ("labeled", "unlabeled"):
        raise ValueError("Does not conform to either labeled or unlabeled format: {!r}".format(filepath))

    if file_format == "labeled":
        source_list, target_list = clean_text(filepath, format=file_format)
        # Lengths are counts of single-space separated tokens.
        interest_dict = {
            "source_length": [len(s.split(' ')) for s in source_list],
            "target_length": [len(t.split(' ')) for t in target_list],
        }
    else:
        source_list, root_list, modifier_list = clean_text(filepath, format=file_format)
        # Roots and modifiers already arrive as lists, so their length is the item count.
        interest_dict = {
            "source_length": [len(s.split(' ')) for s in source_list],
            "root_length": [len(r) for r in root_list],
            "modifier_length": [len(m) for m in modifier_list],
        }
    print(pd.DataFrame.from_dict(interest_dict).describe())




In [80]:
# Parse the training file; clean_text is called with its default format ("labeled"),
# so it returns a source list and a target list.
# NOTE(review): hard-coded absolute local path - this cell only runs on a machine with this layout.
train_path = 'C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\train.labeled'
train_source_list, train_target_list = clean_text(train_path)

In [202]:
# Parse the comp file in "unlabeled" mode, which yields sources, roots and modifier tuples.
# NOTE(review): hard-coded absolute local path - this cell only runs on a machine with this layout.
comp_path = 'C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\comp.unlabeled'
comp_source_list, comp_root_list, comp_modifiers_list = clean_text(comp_path, format="unlabeled")

In [210]:
# Length statistics for the training file.
check_file_statistics(train_path)

       source_length  target_length
count   10000.000000   10000.000000
mean       63.894300      62.265000
std        22.171416      21.010681
min         1.000000       1.000000
25%        49.000000      48.000000
50%        63.000000      61.000000
75%        78.000000      76.000000
max       187.000000     166.000000


In [216]:
# Length statistics for the validation file.
# NOTE(review): hard-coded absolute local path - this cell only runs on a machine with this layout.
check_file_statistics('C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\val.labeled')

       source_length  target_length
count    1000.000000    1000.000000
mean       64.684000      62.890000
std        22.068511      20.939206
min         1.000000       1.000000
25%        50.000000      50.000000
50%        63.000000      62.000000
75%        79.000000      76.000000
max       179.000000     169.000000


In [215]:
# Length statistics for the comp file (source, root and modifier counts).
check_file_statistics(comp_path)

       source_length  root_length  modifier_length
count    2000.000000  2000.000000      2000.000000
mean       64.204500     2.855000         2.855500
std        23.265518     1.141327         1.141171
min         1.000000     1.000000         1.000000
25%        49.000000     2.000000         2.000000
50%        63.000000     3.000000         3.000000
75%        78.000000     3.000000         3.000000
max       262.000000    13.000000        13.000000


We conclude from these statistics that the middle half of the sentences (25th to 75th percentile) are roughly 50-80 words long.
The maximum over the whole data set is 262 words, in the comp file. This is annoying because it is 75 words longer than the longest sentence in train/val (187 words).