In [1]:
# Mount Google Drive at /content/drive so the dataset archive stored there can be copied into the runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!mkdir data
!cp "/content/drive/MyDrive/KAGGLE DATASETS/english2german/deu.txt.zip" "data/data.txt.zip"
!unzip "data/data.txt.zip" -d "data"

mkdir: cannot create directory ‘data’: File exists
Archive:  data/data.txt.zip
replace data/deu.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: data/deu.txt            


In [3]:
!pip install -q transformers[sentencepiece] datasets evaluate sacrebleu accelerate

In [4]:
from datasets import load_dataset
# Load the extracted file with the generic "text" builder: the result has a single
# "train" split with one "text" feature (each row is split on tabs later, in preprocess).
dataset = load_dataset("text", data_files="data/deu.txt")

In [5]:
# Inspect the loaded splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 221533
    })
})

In [6]:
# Hold out 20% of the sentence pairs for evaluation. The fixed seed makes the shuffle,
# and therefore the reported metrics, reproducible across kernel restarts.
# NOTE: this rebinds `dataset` from the loaded DatasetDict to the train/test DatasetDict.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

In [7]:
# Confirm the train/test split sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 177226
    })
    test: Dataset({
        features: ['text'],
        num_rows: 44307
    })
})

In [8]:
# Peek at one raw line: tab-separated English sentence, German sentence, attribution.
dataset["train"][0]["text"].split("\t")

['Fish and visitors stink after three days.',
 'Fische und Besucher stinken nach drei Tagen.',
 'CC-BY 2.0 (France) Attribution: tatoeba.org #6537529 (CK) & #6540297 (Manfredo)']

In [9]:
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM

In [10]:
# Checkpoint id shared by the tokenizer here and the model loaded later from the same name.
model_ckpt = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_ckpt)

In [11]:
# Task prefix that preprocess prepends (followed by a space) to every English source sentence.
prefix = "translate English to German:"

In [12]:
def preprocess(examples):
  """Tokenize a batch of raw tab-separated lines for English-to-German training.

  Each line is expected to hold exactly three tab-separated fields: the
  English sentence, the German sentence and an attribution string (unused).

  Args:
    examples: batch dict whose ``"text"`` entry is a list of raw lines.

  Returns:
    The tokenizer output for the prefixed English sentences, with the German
    sentences passed as ``text_target``; sequences are truncated to 128 tokens.

  Note:
    Lines without exactly three fields are printed and skipped, so the
    result can hold fewer rows than the incoming batch.
  """
  inputs = []
  targets = []
  for example in examples["text"]:
    fields = example.split("\t")
    if len(fields) != 3:
      # Only a wrong field count is tolerated (reported, then skipped). Any other
      # failure, and Ctrl-C, propagates instead of being silently swallowed.
      print(fields)
      continue
    english, german, _ = fields
    inputs.append(prefix + " " + english)
    targets.append(german)

  model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

  return model_inputs

In [13]:
# Tokenize both splits; batched=True means preprocess receives a dict of lists per batch.
tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/177226 [00:00<?, ? examples/s]

Map:   0%|          | 0/44307 [00:00<?, ? examples/s]

In [14]:
# Check which features the tokenization step added.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 177226
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 44307
    })
})

In [15]:
# Drop the raw "text" column so only the tokenized features remain.
# NOTE(review): presumably fails if re-run once "text" is already gone — confirm.
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 177226
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 44307
    })
})

In [16]:
# Spot-check one tokenized training example (input ids, attention mask, label ids).
tokenized_dataset["train"][0]

{'input_ids': [13959,
  1566,
  12,
  2968,
  10,
  9548,
  11,
  2692,
  29704,
  227,
  386,
  477,
  5,
  1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [21252, 15, 64, 13894, 29704, 35, 559, 4052, 13657, 5, 1]}

In [17]:
# Decode the input ids back to text to verify the task prefix and the end-of-sequence token.
tokenizer.decode(tokenized_dataset["train"][0]["input_ids"])

'translate English to German: Fish and visitors stink after three days.</s>'

In [18]:
# Load the pretrained seq2seq model from the same checkpoint id as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

In [19]:
from transformers import DataCollatorForSeq2Seq

# Batch collator built from the tokenizer and model.
# Presumably pads each batch dynamically and pads labels with -100 (compute_metrics
# maps -100 back to the pad token) — confirm against the DataCollatorForSeq2Seq docs.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [20]:
import evaluate

# sacreBLEU metric object; compute_metrics reads its "score" field and reports it as "bleu".
metric = evaluate.load("sacrebleu")

In [21]:
import numpy as np

In [22]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: ``preds`` is a list of stripped strings and
        ``labels`` is a list of one-element lists, each holding one stripped reference.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        # Each reference is wrapped in its own single-item list.
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels

In [23]:
def compute_metrics(eval_preds):
  """Compute BLEU and the mean generated length for one evaluation pass.

  Args:
    eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
      ``predictions`` is a tuple, its first element is used.

  Returns:
    Dict with ``"bleu"`` (the sacreBLEU score) and ``"gen_len"`` (mean number
    of non-pad tokens per prediction), both rounded to 4 decimals.
  """
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded.
  # Labels carry it; predictions may carry it too (the Hugging Face translation
  # example guards both). Map it to the pad token, which decoding then skips.
  # This is a no-op when no -100 is present.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  result = metric.compute(predictions=decoded_preds, references=decoded_labels)
  result = {"bleu": result["score"]}

  # Generated length counts every token that is not padding.
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)
  result = {k: round(v, 4) for k, v in result.items()}
  return result

In [24]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [25]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login widget; the training arguments below set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
# Hyperparameters for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
  # Local output directory; presumably also the Hub repo name
  # (predict loads "ronit33/english2german") — confirm.
  output_dir="english2german",
  evaluation_strategy="epoch",  # evaluate once per epoch (one metrics row per epoch)
  learning_rate=2e-5,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  weight_decay=0.01,
  save_total_limit=3,
  num_train_epochs=2,
  # Presumably makes evaluation generate token ids for compute_metrics to decode — confirm.
  predict_with_generate=True,
  fp16=True,  # NOTE(review): half precision presumably needs a GPU runtime — confirm
  push_to_hub=True,
)

In [29]:
# Wire the model, arguments, tokenized splits, collator and metric function into one trainer.
# The "test" split produced by train_test_split serves as the evaluation set.
trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset["train"],
  eval_dataset=tokenized_dataset["test"],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

In [30]:
# Run fine-tuning; the table below reports loss, BLEU and generated length after each epoch.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.9638,0.80446,41.2212,11.2169




Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.9638,0.80446,41.2212,11.2169
2,0.94,0.788179,41.8343,11.2308




TrainOutput(global_step=22154, training_loss=0.9766485357667669, metrics={'train_runtime': 3788.4414, 'train_samples_per_second': 93.561, 'train_steps_per_second': 5.848, 'total_flos': 2044312534646784.0, 'train_loss': 0.9766485357667669, 'epoch': 2.0})

In [31]:
# Upload the trained model to the Hugging Face Hub; the cell output is the repository URL.
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

'https://huggingface.co/ronit33/my_awesome_opus_books_model/tree/main/'

# Inference

In [36]:
def predict(text):
  """Translate English ``text`` to German with the fine-tuned Hub model.

  Args:
    text: English sentence(s) to translate.

  Returns:
    The ``"generated_text"`` string of the first pipeline result.
  """
  from transformers import pipeline

  # Build the pipeline once and cache it on the function object; constructing
  # it loads the model, which is far too costly to repeat on every call.
  pipe = getattr(predict, "_pipe", None)
  if pipe is None:
    pipe = pipeline(model="ronit33/english2german")
    predict._pipe = pipe

  prefix = "translate English to German:"

  # Same "<prefix> <sentence>" layout that preprocess used during training.
  prompt = f"{prefix} {text}"

  output = pipe(prompt)

  return output[0]["generated_text"]

In [45]:
# Google Translate result, for comparison: Hallo, ich bin Ronit. Ich mag NLP.
predict("Hello, I am Ronit. I like NLP.")

'Hallo, ich bin Ronit. Ich mag NLP.'