In [1]:
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide settings.
model_name = "google-t5/t5-small"  # checkpoint whose tokenizer is loaded below
seed_num = 1  # seed value; not referenced by the cells visible here
dataset = "cnn_dailymail"  # dataset identifier handed to load_dataset

In [4]:
# Load all splits of the configured dataset, using its "3.0.0" configuration.
loaded_dataset = load_dataset(path=dataset, name="3.0.0")

In [5]:
# Display the DatasetDict summary: split names, feature columns and row counts.
loaded_dataset
# Unused scratch code, kept for reference only: build a pandas DataFrame from
# the raw train split and append the raw test split to it.
# df = pd.DataFrame(loaded_dataset['train'])
# df = pd.concat([df, pd.DataFrame(loaded_dataset['test'])], ignore_index=True)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [6]:
# Load the tokenizer for the configured checkpoint (model_name). Nothing is
# tokenized in this cell; the tokenizer object is only created here.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
prefix = "summarize: "  # Required so the T5 model knows that we are going to summarize


def preprocess_function(examples, max_input_length=None, max_target_length=None):
    """Tokenize a batch of articles (inputs) and highlights (labels).

    Args:
        examples: Batch with an "article" and a "highlights" column, each an
            iterable of strings (this is called through
            ``Dataset.map(..., batched=True)`` below).
        max_input_length: If given, tokenized articles are truncated to this
            many tokens. The default (None) keeps every token.
        max_target_length: If given, tokenized highlights are truncated to this
            many tokens. The default (None) keeps every token.

    Returns:
        The tokenizer output for the prefixed articles, with an extra
        "labels" entry holding the token ids of the highlights.
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=max_input_length is not None,
    )
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=max_target_length is not None,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# The collator's `model` argument expects the model instance being trained, not
# a checkpoint name; a string is silently ignored, so it is not passed here.
# Pass `model=<loaded model>` once a model object exists.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Deliberately untruncated: the token-length statistics computed from this
# output need the full sequence lengths. The tokenizer therefore warns about
# sequences longer than the model maximum (512 tokens).
tokenized_dataset = loaded_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 287113/287113 [03:43<00:00, 1285.23 examples/s]
Map: 100%|██████████| 13368/13368 [00:10<00:00, 1222.92 examples/s]
Map: 100%|██████████| 11490/11490 [00:09<00:00, 1227.36 examples/s]


In [9]:
# Materialise the tokenized training split as a pandas DataFrame and preview
# its last five rows.
df = pd.DataFrame(data=tokenized_dataset["train"])
df.tail(n=5)

Unnamed: 0,article,highlights,id,input_ids,attention_mask,labels
287108,"The nine-year-old daughter of a black, unarmed...","Rumain Brisbon, 34, was killed after Phoenix p...",279a12d3ee37b8109cc192a9e88115a5a631fb06,"[21603, 10, 37, 4169, 18, 1201, 18, 1490, 3062...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2770, 7484, 7834, 7, 5407, 6, 6154, 6, 47, 47..."
287109,Legalising assisted suicide is a slippery slop...,"Theo Boer, a European assisted suicide watchdo...",b5bc9d404a9a5d890c9fc26550b67e6d8d83241f,"[21603, 10, 11281, 4890, 11752, 12259, 19, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[37, 32, 1491, 49, 6, 3, 9, 1611, 11752, 12259..."
287110,A group calling itself 'The Women of the 99 Pe...,Ohio congressman criticised for 'condoning the...,500862586f925e406f8b662934e1a71bbee32463,"[21603, 10, 71, 563, 3874, 1402, 3, 31, 634, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6167, 27197, 348, 6800, 3375, 21, 3, 31, 1018..."
287111,Most men enjoy a good pint of lager or real al...,The Black Country Ale Tairsters have been to 1...,32a1f9e5c37a938c0c0bca1a1559247b9c4334b2,"[21603, 10, 1377, 1076, 777, 3, 9, 207, 4522, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[37, 1589, 6993, 15345, 332, 2256, 1370, 7, 43..."
287112,A Facebook page seeking to preserve the 'Black...,Facebook page supporting tradition gains one m...,8ec9ff4d633dd4cc26d53f503c33f7464b43c36e,"[21603, 10, 71, 1376, 543, 3945, 12, 8996, 8, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1376, 543, 3956, 4387, 11391, 80, 770, 3, 31,..."


In [11]:
# Pretty-print the first training article to inspect the raw text.
# (`pprint` is imported in the notebook's top import cell.)
pprint(df['article'][0])

('LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access '
 'to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, '
 "but he insists the money won't cast a spell on him. Daniel Radcliffe as "
 'Harry Potter in "Harry Potter and the Order of the Phoenix" To the '
 'disappointment of gossip columnists around the world, the young actor says '
 'he has no plans to fritter his cash away on fast cars, drink and celebrity '
 'parties. "I don\'t plan to be one of those people who, as soon as they turn '
 '18, suddenly buy themselves a massive sports car collection or something '
 'similar," he told an Australian interviewer earlier this month. "I don\'t '
 'think I\'ll be particularly extravagant. "The things I like buying are '
 'things that cost about 10 pounds -- books and CDs and DVDs." At 18, '
 'Radcliffe will be able to gamble in a casino, buy a drink in a pub or see '
 'the horror film "Hostel: Part II," currently six places below his number

In [12]:
# Summary statistics and selected percentiles of the tokenized article
# lengths (number of input_ids per example). No plot is produced here.
(
    df["input_ids"]
    .map(len)
    .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
)

count    287113.000000
mean        985.055038
std         480.603123
min          20.000000
25%         631.000000
50%         898.000000
75%        1244.000000
90%        1659.000000
95%        1947.000000
99%        2405.000000
max        5269.000000
Name: input_ids, dtype: float64

In [13]:
# Summary statistics and selected percentiles of the tokenized highlight
# lengths (number of label ids per example).
(
    df["labels"]
    .map(len)
    .describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
)

count    287113.000000
mean         74.682811
std          30.752373
min           7.000000
25%          55.000000
50%          70.000000
75%          87.000000
90%         110.000000
95%         129.000000
99%         171.000000
max        3151.000000
Name: labels, dtype: float64