In [None]:
%%capture
! pip install datasets
! pip install transformers -U
! pip install accelerate -U
! pip install evaluate
! pip install bleu
! pip install python-Levenshtein
! pip install wandb

In [None]:
from typing import Dict, List, Tuple
from dataclasses import dataclass
from tqdm import tqdm
import numpy as np
import torch
import pandas as pd
from datasets import Dataset, DatasetDict

from transformers import T5ForConditionalGeneration, T5Tokenizer

import random

SEED = 999

# Seed every random number generator this notebook can touch, not only torch:
# Python's `random` and NumPy keep their own independent state.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    # Explicitly seed the current GPU and all visible GPUs as well
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

In [None]:
# Switch for Weights & Biases tracking: flip to True to log in and pick the project.
LOG = False

if LOG:
    import os
    import wandb

    # Authenticate first, then name the project the run will be filed under
    wandb.login()
    os.environ["WANDB_PROJECT"] = "Seq2SeqZip"

# Dataset

In [None]:
import pandas as pd

# Only the first 8000 rows are used. `nrows` makes the parser stop early and
# returns a standalone frame rather than a slice of a larger one.
df = pd.read_csv('shorthex2hex.csv', nrows=8000)
print(df.head())

# No "</s>" suffix is appended to `text_hex` / `deflate_hex`: the T5 tokenizer
# already terminates every encoded sequence with its EOS token, so adding the
# literal here would put two EOS tokens at the end of each input and label.

                             text  \
0                One of the other   
1  A wonderful little production.   
2              I thought this was   
3      Basically there's a family   
4        Petter Mattei's "Love in   

                                            text_hex  \
0                   4f6e65206f6620746865206f74686572   
1  4120776f6e64657266756c206c6974746c652070726f64...   
2               492074686f75676874207468697320776173   
3  4261736963616c6c79207468657265277320612066616d...   
4   506574746572204d6174746569277320224c6f766520696e   

                                         deflate_hex  
0         789cf3cf4b55c84f5328c9005240a208002eb405bb  
1  789c735428cfcf4b492d4a2bcd51c8c92c29c949552828...  
2   789cf35428c9c82f4dcf2801d299c50ae589c5003dea06b0  
3  789c734a2cce4c4eccc9a95428c9482d4a552f56485448...  
4  789c0b482d29492d52f04d045299eac50a4a3ef965a90a...  


In [None]:
# 80/10/10 partition: hold out 20% of the rows, then cut that hold-out in half
# to obtain the validation and test sets. Both cuts reuse SEED.
ds = Dataset.from_pandas(df)
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)

ds_splits = DatasetDict(
    train=ds_train_test['train'],
    valid=ds_test_dev['train'],  # first half of the hold-out
    test=ds_test_dev['test'],    # second half of the hold-out
)

print(ds_splits)

DatasetDict({
    train: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 6400
    })
    valid: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 800
    })
    test: Dataset({
        features: ['text', 'text_hex', 'deflate_hex'],
        num_rows: 800
    })
})


In [None]:
# Display the first example of the training split as the cell result
ds_splits['train'][0]

{'text': 'I just finished watching',
 'text_hex': '49206a7573742066696e6973686564207761746368696e67</s>',
 'deflate_hex': '789cf354c82a2d2e5148cbcccb2cce484d51284f2c49cec8cc4b07006d1d090f</s>'}

# Model

In [None]:
# Load model directly
from transformers import AutoTokenizer, T5ForConditionalGeneration

# The tokenizer and the pretrained weights are pulled from the same checkpoint
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
@dataclass
class DataCollatorSeq2SeqWithPadding:
    """Turn a list of raw dataset rows into one padded batch of tensors.

    Every row must provide the source string under "text_hex" and the target
    string under "deflate_hex".
    """

    tokenizer: T5Tokenizer

    def __call__(self, dataset_elements) -> Dict[str, torch.Tensor]:
        """Tokenize, pad and assemble the batch consumed by the model's forward pass.

        Args:
            dataset_elements: iterable of rows, each indexable by "text_hex"
                and "deflate_hex".

        Returns:
            A dict with "input_ids" and "attention_mask" for the source
            sequences and "labels" for the target sequences, where padded
            label positions are set to -100.
        """
        # Gather source and target strings for the whole batch
        sources = [row["text_hex"] for row in dataset_elements]
        targets = [row["deflate_hex"] for row in dataset_elements]

        # Batch-tokenize each side in a single call; pad to the longest
        # sequence in the batch and truncate anything that is too long
        encoder_inputs = self.tokenizer(
            sources, padding=True, truncation=True, return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            targets, padding=True, truncation=True, return_tensors="pt"
        )

        # Only the token ids of the targets are needed; padded positions are
        # replaced by -100, the ignore index of the cross-entropy loss
        label_ids = target_encoding["input_ids"]
        label_ids = label_ids.masked_fill(
            label_ids == self.tokenizer.pad_token_id, -100
        )

        # These three entries are all the forward pass needs
        return {
            "input_ids": encoder_inputs["input_ids"],
            "attention_mask": encoder_inputs["attention_mask"],
            "labels": label_ids,
        }

In [None]:
# One collator instance, bound to the tokenizer loaded earlier in the notebook
data_collator = DataCollatorSeq2SeqWithPadding(tokenizer=tokenizer)

# Trainer

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="temp",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    warmup_steps=500,
    max_steps=10000,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="steps",
    fp16=torch.cuda.is_available(),  # mixed precision is only usable on a GPU
    per_device_eval_batch_size=8,
    generation_max_length=250,
    eval_steps=1000,  # evaluate on the validation set every "eval_steps"
    logging_steps=1000,  # log standard metrics every "logging_steps"
    # The custom collator reads the raw "text_hex"/"deflate_hex" columns, so the
    # Trainer must not drop columns that are absent from the model's forward().
    remove_unused_columns=False,
    label_names=["labels"],  # the collator emits the targets under "labels"
    predict_with_generate=True,
    save_strategy="no",
    # The Trainer re-seeds at construction time with `seed`; forward the
    # notebook-wide SEED so it is not replaced by the library default.
    seed=SEED,
    # Honour the LOG switch: without this, every installed integration
    # (wandb included) is reported to regardless of LOG.
    report_to="wandb" if LOG else "none",
)

In [None]:
## UNUSED FOR NOW
def compute_metrics(eval_preds):
    """Decode generated ids and labels and score them by exact string match.

    Args:
        eval_preds: pair ``(predictions, label_ids)`` of integer arrays.
            ``predictions`` may also be a tuple whose first item holds the
            generated token ids.

    Returns:
        Dict with a single entry ``"exact_match"``: the fraction of decoded
        predictions equal to their decoded label (0.0 when there are none).
        A dict is returned because the Trainer expects a name -> value mapping.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded; map it back
    # to the pad id in both arrays before decoding
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Show one pair as a sanity check instead of dumping the whole eval set
    if decoded_preds:
        print(f"Decoded pred  = {decoded_preds[0]}")
        print(f"Decoded label = {decoded_labels[0]}")

    matches = [
        pred.strip() == gold.strip()
        for pred, gold in zip(decoded_preds, decoded_labels)
    ]
    return {"exact_match": float(np.mean(matches)) if matches else 0.0}

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_splits["train"],
    eval_dataset=ds_splits["valid"],
    tokenizer=tokenizer,
    # compute_metrics is deliberately not passed for now
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell; the value
# returned by train() is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mtommasobersani[0m ([33mseq2seqzip[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
1000,3.0295,2.111135
2000,2.1477,1.733593
3000,1.8589,1.407287
4000,1.5885,1.161103
5000,1.3681,1.023412
6000,1.2311,0.947465
7000,1.1479,0.901653
8000,1.0893,0.873721
9000,1.0576,0.85764
10000,1.0351,0.852755


TrainOutput(global_step=10000, training_loss=1.5553627319335936, metrics={'train_runtime': 2199.496, 'train_samples_per_second': 36.372, 'train_steps_per_second': 4.546, 'total_flos': 3251816105656320.0, 'train_loss': 1.5553627319335936, 'epoch': 12.5})

## Save model if necessary

# TEST

In [None]:
# Batches of 8 test rows, assembled by the same collator used for training
test_dataloader = torch.utils.data.DataLoader(
    dataset=ds_splits["test"],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
gold_strings = []
predicted_strings = []

# Generate on whichever device the model currently lives on instead of
# assuming CUDA; mixed precision is only switched on for GPU runs.
device = model.device
use_amp = device.type == "cuda"

model.eval()
for batch in tqdm(test_dataloader):
    with torch.inference_mode(), torch.autocast(device_type=device.type, enabled=use_amp):
        generated_tokens = (
            model.generate(
                input_ids=batch["input_ids"].to(device),
                # Batches are padded: without the mask the encoder would
                # attend to the pad tokens of the shorter inputs.
                attention_mask=batch["attention_mask"].to(device),
                max_new_tokens=255,
            )
            .cpu()
            .numpy()
        )

    # -100 marks ignored label positions; restore the pad id so they decode
    labels = batch["labels"].cpu().numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Turn subword ids back into text
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    gold_strings.extend(decoded_labels)
    predicted_strings.extend(decoded_preds)

100%|██████████| 100/100 [01:57<00:00,  1.18s/it]


In [None]:
import nltk
from nltk.metrics.distance import edit_distance

assert len(predicted_strings) == len(gold_strings)

# Per-example edit distance and string lengths.
# The (misspelled) list names are kept so that any cell relying on them still works.
scores = []
pred_lenghts = []
gold_lenghts = []

for pred, gold in zip(predicted_strings, gold_strings):
    scores.append(edit_distance(pred, gold))
    pred_lenghts.append(len(pred))
    gold_lenghts.append(len(gold))

if scores:
    print(f"Average prediction length is {np.mean(pred_lenghts)}")
    print(f"Average gold length is {np.mean(gold_lenghts)}")
    print(f"Average distance is {np.mean(scores)}")
else:
    # np.mean of an empty list would only yield NaN plus a runtime warning
    print("No predictions to score.")

789cf32c51284cc94cc9572849cc4e5528c9cf03002eeb06ef
789cf32c5148c94cc9532f512849cc4e5528c9cf070039fe064f
789c0bc9c82c5670c9485548ce4ccd53704bcdc9492d02002ad0bb
789c0bc9c82c5630b4343556084ecc55702bcdc9492d02003edf0656
789cf35448cb2c2a2e51284e2c5728c9c82c06002eff05cd
789cf35448cb2c2a2e51284e2c5728c9c82c06002f5005d5
789c0bc9482d4e5528ce492cce482d5228c82c4e2d56482c4a05003db05b4
789c0bc9482d4e5528ce492cce482d5228c84c2e56482c4a05005ecd0833
789c73cc53c8cc2b492d4a2d2ec9cc4b57c8cd2fcb4c5528cf2cc900004ab0bb
789c73cc53c8cc2b492d4a2d2ec9cc4b57c8cd2fcb4c5528cf2cc900007a970998
789c73ce2c2ac82c2ec9cc4b55f0492cc94c51f0484ecc4ba954284fcdcb492c01003eb0bb
789c73ce28ca2c2ec9cc4b55f049cc28c954d0084ecc4ba9547029cdcb492cd00400a95e0ab0
789cf34e2c4acd5348cc4b51c8482d5248caaf4b2dca4ccd4b01003dbb06b7
789cf34e2c4acd5348cc4b51c8482d5248caaf4c2bca4ccd4b010062d30886
789cf3542f5128482c2e4e4d5128c9c82c5628c9c82c49c905003db05cd
789cf3544f5128482c2e4e4d5128c9c82c5628c92cc94905004e60078f
789c73492d2ec92c4955c8492c29492d52