In [3]:
import os
import pandas as pd
import string
import re
from datasets import Dataset
from transformers import AutoTokenizer

from preprocessing import Preprocessor


ValueError: Unable to compare versions for numpy>=1.17: need=1.17 found=None. This is unusual. Consider reinstalling numpy.

In [18]:
default_ip = '!?.'
default_st_tagging = {"German:\n": "SOURCE ", "English:\n": "SPLIT "}
default_rm_tagging = {"Roots in English: ": "SPLIT ", "Modifiers in English: ": "SPLIT "}

def clean_text(file_path, format="labeled", ignored_punctuation=default_ip, source_target_tagging=default_st_tagging, root_modifier_tagging=default_rm_tagging):
    """
    Assumes strict order of German than English
    Returns two lists, source and target each of which is made up of multiple sentences. (not a list of words). Should
    be better for hugging face interface.
    @param file_path: path to file holding text
    @param format: allows function to cover both types of file inputs. "labeled" returns source and target lists, while
    "unlabeled" returns source + root lists and modifiers list of tuples
    @param ignored_punctuation: list of what punctuation to leave in sentence
    @param source_target_tagging: dictionary for tagging what part of text is target and source
    @param root_modifier_tagging: dictionary for tagging what part of text is root and modifiers
    @return: depending on label either source + target lists or source + roots + modifier lists
    """
    # read file
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # initial cleaning

    pattern = '|'.join(sorted(re.escape(obj) for obj in source_target_tagging))
    tagged_text = re.sub(pattern, lambda m: source_target_tagging.get(m.group(0)), text, flags=re.IGNORECASE)
    if format=="unlabeled":
        pattern2 = '|'.join(sorted(re.escape(obj) for obj in root_modifier_tagging))
        tagged_text = re.sub(pattern2, lambda m: root_modifier_tagging.get(m.group(0)), tagged_text, flags=re.IGNORECASE)
        ignored_punctuation += '()'
        # ignored_punctuation = ignored_punctuation + '()'
    regex_cleaning = dict()
    regex_cleaning.update({'\n': ' '})
    regex_cleaning.update({p:'' for p in string.punctuation if p not in ignored_punctuation})
    clean_text = tagged_text.translate(str.maketrans(regex_cleaning))
    # clean_text = tagged_text
    action_items = clean_text.split("SOURCE")

    # reorganization
    if format == "labeled":
        source_list = list()
        target_list = list()

        for action_item in action_items[1:]:
            source_target_obj = action_item.split("SPLIT")
            source_target_obj = [st_text.strip() for st_text in source_target_obj]
            source_list.append("translate German to English: " + source_target_obj[0])
            target_list.append(source_target_obj[1])
        return source_list, target_list
    elif format == "unlabeled":
        source_list = list()
        root_list = list()
        modifier_list = list()
        for action_index, action_item in enumerate(action_items[1:]):
            source_root_modifier_obj = action_item.split("SPLIT")
            source_root_modifier_obj = [st_text.strip() for st_text in source_root_modifier_obj]
            source_list.append("translate German to English: " + source_root_modifier_obj[0].translate(str.maketrans({"(": "", ")":""})))
            root_list.append(source_root_modifier_obj[1].split(' '))
            modifier_tuple_list = list()
            modifier_tuples = source_root_modifier_obj[2].translate(str.maketrans({"(": "*", ")": "*"})).split("*")
            for tup_index in range(0, len(modifier_tuples)-1, 2):
                modifier_tuple_list.append(tuple(modifier_tuples[tup_index+1].split(' ')))
            modifier_list.append(modifier_tuple_list)
        return source_list, root_list, modifier_list
    else:
        raise("Error: no process completed")

In [4]:
def check_file_statistics(filepath):
    """
    Receives file pathway, loads and cleans data. Then prints statistics about the length of sentences, or root/modifier input
    @param filepath:
    @return:
    """
    format = filepath.split('.')[-1]
    if format=="labeled":
        source_list, target_list = clean_text(filepath, format=format)
        interest_dict = {"source_length": list(), "target_length": list()}
        for s, t in zip(source_list, target_list):
            interest_dict["source_length"].append(len(s.split(' ')))
            interest_dict["target_length"].append(len(t.split(' ')))

    elif format=="unlabeled":
        source_list, root_list, modifier_list = clean_text(filepath, format=format)
        interest_dict = {"source_length": list(), "root_length": list(), "modifier_length": list()}
        for s, r, m in zip(source_list, root_list, modifier_list):
            interest_dict["source_length"].append(len(s.split(' ')))
            interest_dict["root_length"].append(len(r))
            interest_dict["modifier_length"].append(len(m))
    else:
        raise("Does not conform to either either labeled or unlabeled format")
    print(pd.DataFrame.from_dict(interest_dict).describe())




In [2]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
preprocessor = Preprocessor('C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\train.labeled', 'C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\val.labeled', tokenizer)

In [4]:
datasets = preprocessor.preprocess()

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
datasets['train']

Dataset({
    features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [7]:
datasets['train']['translation']

[{'de': 'translate German to English: Was ist da so falsch gelaufen? Die Wirtschaftskrise scheint die naheliegendste Erklärung zu sein vielleicht zu naheliegend.',
  'en': 'What has gone so wrong? The economic crisis seems to be the most obvious explanation but perhaps too obvious.'},
 {'de': 'translate German to English: Abdullah wollte das Treffen weil er glaubt dass das Weltgeschehen seit dem Jahre 2001 die Bruderschaft der Konservativen gespalten hat. Bis dahin teilten er und Bush eine gemeinsame Weltsicht die die Bedeutung der Religion der traditionellen Familie so wie beider Länder sie auffassten gesellschaftliche Disziplin und die Rolle des Staates als Unterstützer dieser Institutionen betonte.',
  'en': 'Abdullah sought the meeting because he believes that the world since 2001 has divided the fraternity of conservatives. Until then he and Bush shared a common worldview emphasizing the importance of religion the traditional family as both countries understood it social disciplin

In [19]:
train_path = 'C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\train.labeled'
train_source_list, train_target_list = clean_text(train_path)

In [6]:
comp_path = 'C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\comp.unlabeled'
comp_source_list, comp_root_list, comp_modifiers_list = clean_text(comp_path, format="unlabeled")

In [210]:
check_file_statistics(train_path)

       source_length  target_length
count   10000.000000   10000.000000
mean       63.894300      62.265000
std        22.171416      21.010681
min         1.000000       1.000000
25%        49.000000      48.000000
50%        63.000000      61.000000
75%        78.000000      76.000000
max       187.000000     166.000000


In [216]:
check_file_statistics('C:\\Users\\dovid\\PycharmProjects\\NLP\\NLP-HWs\\Project\\data\\val.labeled')

       source_length  target_length
count    1000.000000    1000.000000
mean       64.684000      62.890000
std        22.068511      20.939206
min         1.000000       1.000000
25%        50.000000      50.000000
50%        63.000000      62.000000
75%        79.000000      76.000000
max       179.000000     169.000000


In [215]:
check_file_statistics(comp_path)

       source_length  root_length  modifier_length
count    2000.000000  2000.000000      2000.000000
mean       64.204500     2.855000         2.855500
std        23.265518     1.141327         1.141171
min         1.000000     1.000000         1.000000
25%        49.000000     2.000000         2.000000
50%        63.000000     3.000000         3.000000
75%        78.000000     3.000000         3.000000
max       262.000000    13.000000        13.000000


We conclude from here that most sentences are around 50-80 words long.
The maximum in the total data set is 262 in the comp file. Which is annoying because it is nearly 100 words longer than the longest in train/val.

create custom dataset

In [11]:
train_ds = Dataset.from_pandas(pd.DataFrame({'text': train_source_list, 'label': train_target_list}))

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [28]:
max_input_length = 128
max_target_length = 128


def preprocess_function(dataset):
    source = [text for text in dataset['text']]
    target = [label for label in dataset['label']]
    model_inputs = tokenizer(source, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [24]:
model_inputs = tokenizer(train_source_list, text_target=train_target_list, max_length=128, truncation=True)

In [29]:
tokenized_datasets = train_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



In [36]:
tokenized_datasets

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [35]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
import torch

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [7]:
train_target_list[0]

'What has gone so wrong? The economic crisis seems to be the most obvious explanation but perhaps too obvious.'

In [15]:
["translate English to German: " + input_paragraph for input_paragraph in train_source_list[:5]]

['translate English to German: Was ist da so falsch gelaufen? Die Wirtschaftskrise scheint die naheliegendste Erklärung zu sein vielleicht zu naheliegend.',
 'translate English to German: Abdullah wollte das Treffen weil er glaubt dass das Weltgeschehen seit dem Jahre 2001 die Bruderschaft der Konservativen gespalten hat. Bis dahin teilten er und Bush eine gemeinsame Weltsicht die die Bedeutung der Religion der traditionellen Familie so wie beider Länder sie auffassten gesellschaftliche Disziplin und die Rolle des Staates als Unterstützer dieser Institutionen betonte.',
 'translate English to German: Hinsichtlich eines absoluten Niedergangs ist zu sagen dass die USA zwar sehr reale Probleme haben aber die amerikanische Wirtschaft dennoch hoch produktiv bleibt. Amerika liegt an erster Stelle bei den Gesamtausgaben für FampE Hochschulrankings Nobelpreisen und auch bei Unternehmensindizes. Laut Angaben des Weltwirtschaftsforums das letzten Monat seinen Bereicht über wirtschaftliche Wettbe

In [38]:
my_pipeline = pipeline("translation_de_to_en", model=model, tokenizer=tokenizer)

In [41]:
my_pipeline.train(["translate English to German: " + input_paragraph for input_paragraph in train_source_list[0]])

AttributeError: 'TranslationPipeline' object has no attribute 'train'

In [25]:
input_ids = tokenizer(text=["translate English to German: " + input_paragraph for input_paragraph in train_source_list[:5]], max_length=262, truncation=True).input_ids

In [None]:
input_ids = tokenizer(text=["translate English to German: " + input_paragraph for input_paragraph in train_source_list[:5]], max_length=262, truncation=True, return_tensors="pt").input_ids

In [11]:
def show_input():
    yield next(input_ids)

In [33]:
for i in input_ids:
    print(i)

[13959, 1566, 12, 2968, 10, 2751, 229, 836, 78, 21816, 873, 8068, 58, 316, 18209, 157, 7854, 18449, 67, 14462, 15342, 26, 849, 28019, 170, 1110, 10330, 170, 14462, 15342, 26, 5, 1]
[13959, 1566, 12, 2968, 10, 28508, 521, 107, 10329, 211, 29008, 5603, 3, 49, 3, 122, 14802, 17, 602, 211, 3779, 30829, 3646, 340, 3861, 4402, 67, 3, 9465, 588, 3445, 74, 2974, 3473, 1528, 29, 3, 2897, 6459, 324, 3, 547, 5, 6483, 22688, 3, 2919, 324, 3, 49, 64, 8905, 266, 6602, 15, 3779, 7, 362, 17, 67, 67, 16144, 74, 18182, 74, 3, 31341, 7453, 78, 587, 5877, 52, 24886, 680, 219, 12837, 324, 3, 25962, 15, 2678, 702, 10574, 64, 67, 11631, 93, 18122, 15, 7, 501, 3941, 29975, 52, 3, 1878, 14932, 35, 3, 27903, 15, 5, 1]
[13959, 1566, 12, 2968, 10, 9515, 15690, 266, 7, 6097, 29, 14866, 3810, 7, 229, 170, 10167, 602, 67, 2312, 3, 8297, 1319, 490, 15, 12834, 745, 862, 67, 3, 23384, 20870, 177, 12555, 6012, 3, 21536, 23, 208, 9852, 5, 736, 49, 5561, 5282, 46, 3, 21735, 13524, 468, 177, 13347, 2064, 12525, 218, 377, 4

In [None]:
class full_paragraph_T5(Pipeline):
    def __init__(self, tokenizer, model):
        super().__init__(self)
        self.tokenizer = tokenizer
        self.model = model

    def _sanitize_parameters(self, **pipeline_parameters):
        preprocess_kwargs = dict()
        if "max_length" in pipeline_parameters:
            preprocess_kwargs["max_length"] = pipeline_parameters["max_length"]
        if "truncation" in pipeline_parameters:
            preprocess_kwargs["truncation"] = pipeline_parameters["truncation"]
        return preprocess_kwargs, dict(), dict()
    def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
        model_input = self.tokenizer(input_)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from project_evaluate import compute_metrics

In [None]:
28.3

In [2]:
file_name = 'val_337977045_316250877_bleu152.labeled'
to_file = 'val_bleu152_w_German.labeled'
with open(file_name, 'r') as f:
    new_lines = list()
    lines = f.readlines()
    for line in lines:
        new_lines.append(line)
        if line == '\n':
            new_lines.append('German:\n')

with open(to_file, 'w') as f_write:
    for new_line in new_lines:
        f_write.write(new_line)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 7239: character maps to <undefined>