In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset, load_from_disk
from datasets import Dataset, DatasetDict
from typing import Dict, List
from functools import partial
from pathlib import Path
from tqdm import tqdm

import pandas as pd
import os
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-large")

In [3]:
DEVICE = "cuda:1"

model.eval()
model.to(DEVICE)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [4]:
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")

## Utility Functions

In [5]:
def getEmptyFrameDict(
    input_key: str,
    output_key: str,
    dataset_dict: DatasetDict
) -> Dict[str, pd.DataFrame]:
    dataframe_dict = dict()

    for curr_name, curr_dataset in dataset_dict.items():
        next_dataframe = pd.DataFrame.from_dict({
            input_key: curr_dataset[input_key],
            output_key: [""] * len(curr_dataset[input_key])
        })

        dataframe_dict[curr_name] = next_dataframe

    return dataframe_dict

def cacheFrame(
    data_frame: pd.DataFrame,
    cache_dir: Path,
    cache_entry_name: str
) -> None:
    file_name = f"{cache_entry_name}.cache.parquet"
    data_frame.to_parquet(
        os.path.join(
            cache_dir,
            file_name
        ),
        engine = "pyarrow",
        compression = None
    )

def cacheFrameDict(
    cache_dir: Path,
    dataframe_dict: Dict[str, pd.DataFrame],
    prefix_name: str = None,
) -> None:
    for curr_name, curr_dataframe in dataframe_dict.items():
        file_name = curr_name

        if prefix_name is not None:
            file_name = f"{prefix_name}_{file_name}"

        cacheFrame(
            curr_dataframe,
            cache_dir,
            file_name
        )

def loadFrameDict(
    cache_dir: Path,
    dataframe_names: List[str],
    prefix_name: str = None,
) -> Dict[str, pd.DataFrame]:
    dataframe_dict = dict()

    for curr_name in dataframe_names:
        file_name = f"{curr_name}.cache.parquet"

        if prefix_name is not None:
            file_name = f"{prefix_name}_{file_name}"

        dataframe_dict[curr_name] = pd.read_parquet(
            os.path.join(
                cache_dir,
                file_name
            ),
            engine = "pyarrow",
        )

    return dataframe_dict

## Run Cells if Creating Fresh Frame Dict

In [6]:
cnn_frame_dict = getEmptyFrameDict("article", "t5_large_output", cnn_dataset)

In [7]:
cacheFrameDict("./cache", cnn_frame_dict, "cnn_dm_distill")

## RUn Cell if Using Cached Frame Dict

In [6]:
cnn_frame_dict = loadFrameDict("./cache", cnn_dataset.keys(), "cnn_dm_distill")

In [7]:
def inferDataFrameDict(
    dataframe_dict: Dict[str, pd.DataFrame],
    batch_size: int,
    input_key: str = "article",
    output_key: str = "t5_large_output",
    prefix: str = "summarize: ",
    max_input_length: int = 512,
    max_output_length: int = 512,
    cache_location: str = None,
    dataset_name: str = None,
    batches_per_cache_write: int = None
) -> None:
    using_cache = all(x is not None for x in [cache_location, dataset_name, batches_per_cache_write])
    model.eval()

    for curr_name, curr_dataframe in dataframe_dict.items():
        chunks_iter = np.array_split(curr_dataframe, (len(curr_dataframe) // batch_size) + 1)
        row_counter = 0
        
        for chunk_idx, curr_chunk in enumerate(tqdm(chunks_iter)):
            curr_chunk_inputs = list(curr_chunk[input_key])
            curr_chunk_outputs = list(curr_chunk[output_key])
            is_cached = all(x != "" for x in curr_chunk_outputs)

            if is_cached:
                row_counter += len(curr_chunk_outputs)
                continue

            inputs = [prefix + doc for doc in curr_chunk_inputs]
            input_ids = tokenizer(
                inputs, 
                return_tensors = "pt",
                max_length = max_input_length,
                truncation = True,
                padding = True,
            ).input_ids.to(DEVICE)

            outputs = model.generate(input_ids, max_new_tokens = max_output_length)
            outputs.to("cpu")
            decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens = True)

            out_column_index = curr_dataframe.columns.get_loc(output_key)
            end_row_index = row_counter + len(decoded_output)
            curr_dataframe.iloc[row_counter : end_row_index, out_column_index] = decoded_output
            row_counter += len(curr_chunk_outputs)

            if (((chunk_idx + 1) % batches_per_cache_write) == 0) and using_cache:
                cacheFrame(
                    curr_dataframe,
                    cache_location,
                    f"{dataset_name}_{curr_name}"
                )
            
        cacheFrame(
            curr_dataframe,
            cache_location,
            f"{dataset_name}_{curr_name}"
        )

inferDataFrameDict(
    cnn_frame_dict,
    16,
    cache_location = "./cache",
    dataset_name = "cnn_dm_distill",
    batches_per_cache_write = 64
)

# Move model out of VRAM (so NLPG admins don't get mad at us)
model.to("cpu")

  return bound(*args, **kwds)
100%|██████████| 17945/17945 [4:43:59<00:00,  1.05it/s]  
  return bound(*args, **kwds)
100%|██████████| 836/836 [50:01<00:00,  3.59s/it]
100%|██████████| 719/719 [43:05<00:00,  3.60s/it]


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [9]:
cnn_frame_dict["validation"]

Unnamed: 0,article,t5_large_output
0,"(CNN)Share, and your gift will be multiplied. ...",Zully broussard gave one of her kidneys to a s...
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",MLS is celebrating its 20th season. the number...
2,"(CNN)French striker Bafetimbi Gomis, who has a...",french striker bafetimbi gomis says he is now ...
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy launches 3-iron into lake after d...
4,(CNN)A Pennsylvania community is pulling toget...,the parents of cayman naib have been communica...
...,...,...
13363,"It is the dream of many young children, the ch...","the Wild west town in valley center, californi..."
13364,It’s the type of encounter that can send panic...,photographer Graham hewer captured the jaw-dro...
13365,A group of tourists to the Bahamas enjoyed one...,the pigs were filmed on a boat on the island o...
13366,Pippa Middleton bundled up against the London ...,the brunette is back in London following news ...


In [10]:
# Hugging Face doesn't provide an easy way to load a DatasetDict from Pandas,
# so loadDatasetFromCachedDataframe pulls the latest cache entry instead
def loadDatasetFromCachedDataframe(
    cache_dir: Path,
    dataframe_names: List[str],
    prefix_name: str = None,
) -> DatasetDict:
    file_dict = dict()

    for curr_name in dataframe_names:
        file_name = f"{curr_name}.cache.parquet"

        if prefix_name is not None:
            file_name = f"{prefix_name}_{file_name}"

        file_dict[curr_name] = os.path.join(cache_dir, file_name)

    return DatasetDict.from_parquet(file_dict)

cnn_distill_dataset = loadDatasetFromCachedDataframe("./cache", cnn_dataset.keys(), "cnn_dm_distill")

Generating train split: 287113 examples [00:22, 12878.38 examples/s]
Generating validation split: 13368 examples [00:00, 23574.17 examples/s]
Generating test split: 11490 examples [00:00, 24219.57 examples/s]


In [11]:
cnn_dataset["train"]

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [25]:
for curr_name, curr_dataset in cnn_distill_dataset.items():
    curr_dataset = curr_dataset.add_column(
        "highlights",
        cnn_dataset[curr_name]["highlights"]
    )

    curr_dataset = curr_dataset.add_column(
        "id",
        cnn_dataset[curr_name]["id"]
    )

    cnn_distill_dataset[curr_name] = curr_dataset

In [27]:
cnn_distill_dataset.push_to_hub(
    "lilferrit/cnn_dailymail_t5_distillation",
    config_name = "3.0.0"
)

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char