In [None]:
!nvidia-smi

Thu Dec  1 05:51:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    33W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers
!pip install datasets
!pip install sacrebleu



In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric

In [None]:
!pip install evaluate
!pip install bert_score
!pip install -U nltk



In [None]:
from evaluate import load

# WMT16 German-English translation data (config "de-en").
raw_datasets = load_dataset("wmt16", "de-en")

# Load all three metrics through `evaluate.load`. The sacreBLEU metric used to
# be loaded with `datasets.load_metric`, which is deprecated in favour of the
# `evaluate` library; the resulting object exposes the same `.compute(...)`
# call used by compute_metrics.
bleu_metric = load("sacrebleu")
meteor_metric = load("meteor")
bert_metric = load("bertscore")

Found cached dataset wmt16 (/root/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

  """
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Tokenizer matching the pretrained "t5-small" checkpoint; shared by preprocessing,
# the data collator, metric decoding and the final inference check.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Task prefix that preprocess_function prepends to every source sentence.
prefix = "translate English to German: "

In [None]:
# Truncation limits passed to the tokenizer as `max_length` for each side.
max_input_length = 128
max_target_length = 128
# Keys into each example's "translation" dict.
source_lang = "en"
target_lang = "de"

def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs plus labels.

    Args:
        examples: A batch whose "translation" entry is a list of dicts keyed
            by language code (`source_lang` and `target_lang` are read).

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        "labels" entry holding the "input_ids" of the tokenized targets.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets via `text_target=` instead of the deprecated
    # `as_target_tokenizer()` context manager. A separate call keeps the
    # target-side truncation limit independent of the source-side one.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Run preprocess_function over the raw dataset in batches; the result carries the
# tokenized inputs and the "labels" column consumed by the trainer.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/4549 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Start from the pretrained "t5-small" weights and move the model to the GPU.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
model.cuda()

In [None]:
# Freeze every parameter whose dotted name has "0", "1" or "2" as its third
# component, leaving the remaining parameters trainable.
# NOTE(review): presumably the third component is the block index in names
# shaped like `<stack>.block.<n>....` - confirm with model.named_parameters().
FROZEN_BLOCK_IDS = {"0", "1", "2"}

for name, param in model.named_parameters():
    name_parts = name.split(".")
    # Guard on length instead of skipping the first parameter by position:
    # any name with fewer than three components is simply left trainable
    # rather than raising IndexError.
    if len(name_parts) > 2 and name_parts[2] in FROZEN_BLOCK_IDS:
        param.requires_grad = False

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the trainer; it is built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
!pip install wandb



In [None]:
!wandb login
%env WANDB_PROJECT=rl4nmt

[34m[1mwandb[0m: Currently logged in as: [33mtgummadi[0m ([33mrl4nmt[0m). Use [1m`wandb login --relogin`[0m to force relogin
env: WANDB_PROJECT=rl4nmt


In [None]:
def compute_metrics(pred):
  """Decode predictions and labels, then score them with BERTScore, BLEU and METEOR.

  Args:
    pred: Evaluation output exposing `label_ids` and `predictions`;
      `predictions[0]` is read as the predicted token ids (the first element
      returned by `preprocess_logits_for_metrics`).

  Returns:
    Dict with 'bert_score' (mean BERTScore F1 over all evaluated examples),
    'bleu_score' (the sacreBLEU 'score') and 'meteor_score'.
  """
  labels_ids = pred.label_ids
  pred_ids = pred.predictions[0]

  # -100 is not a decodable token id; swap it for the pad id, which
  # `skip_special_tokens=True` then drops from the decoded text.
  pad_id = tokenizer.pad_token_id
  pred_ids[pred_ids == -100] = pad_id
  labels_ids[labels_ids == -100] = pad_id
  pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

  # NOTE(review): the references are German (target_lang = "de") - confirm that
  # this BERTScore backbone is the intended one for that language.
  bert_results = bert_metric.compute(predictions=pred_str, references=label_str, model_type="distilbert-base-uncased")
  # 'f1' holds one value per example. Average over the whole evaluation set
  # instead of reporting only the first example's score.
  f1_scores = bert_results['f1']
  bert_score = sum(f1_scores) / len(f1_scores) if len(f1_scores) else 0.0

  # BLEU and METEOR take a list of references per prediction.
  label_str_list = [[reference] for reference in label_str]
  bleu_results = bleu_metric.compute(predictions=pred_str, references=label_str_list)
  meteor_results = meteor_metric.compute(predictions=pred_str, references=label_str_list)

  return {
      'bert_score': bert_score,
      'bleu_score': bleu_results['score'],
      'meteor_score': meteor_results['meteor'],
  }

In [None]:
import torch
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids before metrics are computed.

    Workaround for a suspected memory problem in the trainer's evaluation
    loop: only the argmax ids are kept, so tensors that are not needed for
    the metrics are not stored.

    Args:
        logits: Either the logits tensor itself, or a tuple/list whose first
            element is the logits tensor.
        labels: Label ids; passed through unchanged.

    Returns:
        A `(pred_ids, labels)` tuple, where `pred_ids` is the argmax of the
        logits over the last dimension.
    """
    # Unwrap only when the model returned several outputs. Indexing a bare
    # tensor with [0] would silently select the first batch element instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Training configuration: 3 epochs at batch size 16 with mixed precision,
# evaluating every 25,000 steps and logging to Weights & Biases.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=25000,
    save_strategy="epoch",  # checkpoints are written per epoch, not per eval step
    learning_rate=0.001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=10,
    num_train_epochs=3,
    fp16=True,
    report_to="wandb",
    run_name="T5-Small-CELoss-Finetuining-Continued"
)

# Trainer wiring: trains on the "train" split, evaluates on "validation".
# preprocess_logits_for_metrics reduces model outputs to token ids before
# compute_metrics decodes and scores them.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

PyTorch: setting up devices
Using cuda_amp half precision backend


In [None]:
# Launch fine-tuning with the trainer configured in the previous cell; the
# returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation. If translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4548885
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 852918
  Number of trainable parameters = 38478336
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Bert Score,Bleu Score,Meteor Score
25000,2.3994,1.95901,0.779229,8.032736,0.367109
50000,2.3559,1.903375,0.726762,8.190628,0.374488
75000,2.3177,1.871161,0.722979,8.543955,0.383941
100000,2.2863,1.853382,0.726249,8.540554,0.375428
125000,2.2887,1.833194,0.714037,8.760453,0.387695
150000,2.2128,1.821332,0.813679,8.769802,0.380694
175000,2.2593,1.81238,0.722872,8.903478,0.38845
200000,2.2281,1.794205,0.715558,8.904312,0.380543
225000,2.2436,1.80147,0.733707,8.955922,0.392675
250000,2.2468,1.793819,0.713742,8.884334,0.387051


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation. If translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2169
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation. If translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2169
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation. If translation are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2169
  Batch

TrainOutput(global_step=852918, training_loss=2.1689379707604006, metrics={'train_runtime': 82280.5974, 'train_samples_per_second': 165.855, 'train_steps_per_second': 10.366, 'total_flos': 2.842460380713124e+17, 'train_loss': 2.1689379707604006, 'epoch': 3.0})

In [None]:
from transformers import pipeline

# Sanity check 1: translate with a checkpoint loaded from disk via the pipeline API.
# NOTE(review): the training cell saves once per epoch (852918 steps / 3 epochs
# = 284306 steps per epoch), so "checkpoint-284000" presumably comes from an
# earlier run with a different save schedule - confirm the path exists.
model_checkpoint = "results/checkpoint-284000"
translator = pipeline("translation_en_to_de", model=model_checkpoint)
print(translator("How are you?"))

# Sanity check 2: translate with the in-memory fine-tuned model.
# Reuse the training-time `prefix` so the prompt cannot drift from what the
# model was trained on, and put the inputs on whatever device the model is
# on instead of hard-coding 'cuda'.
input_ids = tokenizer(prefix + "how are you", return_tensors="pt").input_ids.to(model.device)
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

loading configuration file results/checkpoint-284000/config.json
Model config T5Config {
  "_name_or_path": "results/checkpoint-284000",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_d

[{'translation_text': 'Wie sind Sie?'}]
Wie sind Sie


