In [1]:
from transformers import AutoTokenizer

from datasets import load_dataset

In [2]:
# Fetch the XSum dataset (downloaded on first use, served from the local
# Hugging Face cache afterwards) and display its splits.
raw_datasets = load_dataset(path="xsum")
raw_datasets

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default to C:/Users/Luka/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to C:/Users/Luka/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [5]:
# Inspect the first training example: a dict with the 'document', 'summary'
# and 'id' fields.
raw_datasets["train"][0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

In [3]:
# Shrink every split to its first `num_of_samples` rows so the rest of the
# notebook runs quickly on a small subset.
num_of_samples = 100
for split_name in ("train", "validation", "test"):
    split = raw_datasets[split_name]
    # Clamp to the split size: asking `select` for more rows than the split
    # holds would raise instead of returning the whole split.
    keep = min(num_of_samples, len(split))
    # `select` accepts a `range` directly; no need to materialise a list.
    raw_datasets[split_name] = split.select(range(keep))
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 100
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 100
    })
})

In [6]:
# Task prefix that is prepended to every document before tokenization.
prefix = "summarize: "

# Token budgets: summaries are truncated to 128 tokens, documents to 512.
max_target_length = 128
max_input_length = 512

# Tokenizer of the "t5-small" checkpoint, with its maximum length set to the
# input budget.
tokenizer = AutoTokenizer.from_pretrained(
    "t5-small",
    model_max_length=max_input_length,
)


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence summarization.

    Args:
        examples: A batch mapping column names to lists of values. It must
            provide the "document" (source text) and "summary" (target text)
            columns.

    Returns:
        The tokenizer output for the prefixed documents (truncated to
        ``max_input_length`` tokens) with an extra "labels" entry holding the
        token ids of the summaries (truncated to ``max_target_length`` tokens).
    """
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets through `text_target` rather than the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. A separate call is
    # needed because the targets use a different `max_length` than the inputs.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [11]:
# Apply the preprocessing to every split, one batch of examples at a time; the
# tokenizer outputs are added as new columns next to the original ones.
tokenized_datasets = raw_datasets.map(
    function=preprocess_function,
    batched=True,
)
tokenized_datasets

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-2ec103b516375177.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-16811cb70e9d49de.arrow


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [13]:
# Inspect the first tokenized training example: the original fields plus the
# 'input_ids', 'attention_mask' and 'labels' columns added by preprocessing.
tokenized_datasets["train"][0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142',
 'input_ids': [21603,
  10,
  37,
  423,
  583,
  13,
  1783,
  16,
  20126,
  16496,
  6,
  80,
  13,
  8,
  844,
  6025,
  4161,
  6,
  19,
  341,
  271,
  14841,
  5,
  7057,
  161,
  19,
  4912,
  16,
  1626,
  5981,
  11,
  186,
  7540,
  16,
  1276,
  15,
  2296,
  7,
  5718,
  2367,
  14621,
  4161,
  57,
  4125,
  387,
  5,
  15059,
  7,
  30,
  8,
  4653,
  4939,
  711,
  747,
  522,
  17879,
  788,
  12,
  1783,
  44,
  8,
  15763,
  6029,
  1813,
  9,
  7472,
  5,
  1404,
  1623,
  11,
  5699,
  277,
  130,
  4161,
  57,
  18368,
  16,
  20126,
  16496,
  227,
  8,
  2473,
  5895,
  15,
  147,
  89,
  22411,
  139,
  8,
  1511,
  5,
  1485,
  3271,
  3,
  21926,
  9,
  472,
  19623,
  5251,
  8,
  616,
  12,
  15614,
  8,
  1783,
  5,
  37,
  13818,
  10564,
  15,
  26,
  3,
  9,
  3,
  19513,
  1481,
  6,
  18368,
 