In [15]:
import sys

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from nlp481.distillation import (
    getEmptyFrameDict,
    cacheFrameDict,
    loadFrameDict,
    inferDataFrameDict,
    loadDatasetFromCachedDataframe
)

In [16]:

tokenizer = AutoTokenizer.from_pretrained("kssteven/mT5-large-iwslt2017-de-en")
model = AutoModelForSeq2SeqLM.from_pretrained("kssteven/mT5-large-iwslt2017-de-en")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [17]:
DEVICE = "cuda:0"

model.eval()
model.to(DEVICE)

MT5ForConditionalGeneration(
  (shared): Embedding(250100, 1024)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250100, 1024)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=Fals

In [18]:
dataset = load_dataset("iwslt2017", "iwslt2017-de-en")

INPUT_KEY = "en"
DATASET_NAME = "iwslt2017-de-en"

In [19]:
for curr_name, curr_dataset in dataset.items():
    curr_dataset = curr_dataset.add_column("en", [col["translation"]["en"] for col in curr_dataset])
    curr_dataset = curr_dataset.add_column("de", [col["translation"]["de"] for col in curr_dataset])
    curr_dataset = curr_dataset.remove_columns(["translation"])

    dataset[curr_name] = curr_dataset

In [20]:
dataset["train"]

Dataset({
    features: ['en', 'de'],
    num_rows: 206112
})

## Run Cells if Creating Fresh Frame Dict

In [7]:
ds_frame_dict = getEmptyFrameDict(INPUT_KEY, "t5_large_output", dataset)

In [None]:
cacheFrameDict("./cache", ds_frame_dict, DATASET_NAME)

## Run Cell if Using Cached Frame Dict

In [5]:
ds_frame_dict = loadFrameDict("./cache", dataset.keys(), DATASET_NAME)

In [13]:
ds_frame_dict["train"].iloc[206111]["t5_large_output"]

"Translate English to German: Frankenstein's monster: SS: Thank you."

In [22]:
inferDataFrameDict(
    ds_frame_dict,
    model,
    tokenizer,
    16,
    cache_location = "./foobar",
    dataset_name = DATASET_NAME,
    input_key = INPUT_KEY,
    batches_per_cache_write = 64,
    prefix = "translate English to German: ",
)

# Move model out of VRAM (so NLPG admins don't get mad at us)
model.to("cpu")

# For some reason moving the model to cpu doesn't actually free VRAM
# so just exit from the process
sys.exit(0)

  return bound(*args, **kwds)
100%|██████████| 12883/12883 [00:01<00:00, 11458.36it/s]


OSError: Cannot save file into a non-existent directory: 'foobar'

In [None]:
ds_frame_dict["train"]

In [None]:
distill_dataset = loadDatasetFromCachedDataframe("./cache", dataset.keys(), DATASET_NAME)

In [None]:
distill_dataset["train"]

In [None]:
for curr_name, curr_dataset in distill_dataset.items():
    curr_base_dataset = dataset[curr_name]
    curr_base_columns = set(curr_base_dataset.column_names)
    curr_base_columns.remove(INPUT_KEY)

    for curr_col_name in curr_base_columns:
        curr_dataset = curr_dataset.add_column(
            curr_col_name,
            dataset[curr_name][curr_col_name]
        )

    distill_dataset[curr_name] = curr_dataset

In [None]:
distill_dataset["train"]

In [None]:
distill_dataset.push_to_hub(
    f"lilferrit/cnn_dailymail_t5_distillation",
    revision = "cnndm-checkpoints"
)