In [2]:
# Fetch the Medium articles dataset repository from the Hugging Face Hub into ./medium-articles.
!git clone https://huggingface.co/datasets/fabiochiu/medium-articles

Cloning into 'medium-articles'...
remote: Enumerating objects: 22, done.
remote: Total 22 (delta 0), reused 0 (delta 0), pack-reused 22
Unpacking objects: 100% (22/22), 3.42 KiB | 166.00 KiB/s, done.
Filtering content: 100% (3/3), 1.29 GiB | 38.25 MiB/s, done.


In [3]:
# Install dependencies with %pip so they land in the environment of the running kernel.
# The transformers version is not recorded in this notebook's output, so it is left unpinned.
# datasets is pinned to the release this notebook was executed with: the import cell uses
# `load_metric`, which newer major releases of datasets no longer provide.
%pip install -q transformers
%pip install -q datasets==2.16.0

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [4]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

import nltk
nltk.download('punkt')
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Hugging Face Hub identifier of the pretrained checkpoint; the tokenizer is loaded from it.
model_checkpoint = "t5-small"

In [6]:
# Read the cloned CSV with the generic "csv" loader; all rows land in a single "train" split.
articles_csv = "medium-articles/medium_articles.csv"
medium_datasets = load_dataset("csv", data_files=articles_csv)
medium_datasets

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 192368
    })
})

In [7]:
def _has_enough_content(example):
  """Return True for articles with >= 500 characters of text and a title of >= 20 characters."""
  if len(example['text']) < 500:
    return False
  return len(example['title']) >= 20

# Drop articles whose body or title is too short to be a useful training pair.
medium_datasets = medium_datasets.filter(_has_enough_content)

Filter:   0%|          | 0/192368 [00:00<?, ? examples/s]

In [8]:
# Carve out fixed-size test and validation sets (8000 rows each); the remainder is the training set.
# A fixed seed makes the split reproducible: without it every kernel restart would produce
# different train/validation/test membership, so results could not be compared across runs.
# NOTE: this cell overwrites medium_datasets["train"]; re-running it without re-running the
# cells above would split the already-reduced training set again.
SPLIT_SEED = 10

datasets_train_test = medium_datasets["train"].train_test_split(test_size=8000, seed=SPLIT_SEED)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=8000, seed=SPLIT_SEED)

medium_datasets["train"] = datasets_train_validation["train"]
medium_datasets["validation"] = datasets_train_validation["test"]
medium_datasets["test"] = datasets_train_test["test"]

medium_datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 148643
    })
    validation: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags'],
        num_rows: 8000
    })
})

In [9]:
# Load the tokenizer that belongs to the chosen checkpoint (its files are downloaded if needed).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [10]:
# Task prefix prepended to every cleaned article before tokenization.
prefix = "summarize: "

# Token budgets: articles (inputs) and titles (targets) are padded/truncated to these lengths.
max_input_length = 512
max_target_length = 64

def clean_text(text):
  """Return the article body with heading-like lines removed.

  The stripped text is split into sentences with NLTK and each sentence is
  split again on newlines. Only non-empty fragments whose last character is
  in ``string.punctuation`` are kept; they are joined with newlines.

  Args:
    text: raw article text.

  Returns:
    The kept fragments joined by "\\n" (an empty string if nothing is kept).
  """
  kept_lines = []
  for sentence in nltk.sent_tokenize(text.strip()):
    for line in sentence.split("\n"):
      # Fragments without terminal punctuation are treated as titles/headings and skipped.
      if line and line[-1] in string.punctuation:
        kept_lines.append(line)
  return "\n".join(kept_lines)

def preprocess_data(examples):
  """Tokenize a batch of articles as model inputs and their titles as labels.

  Args:
    examples: batch with a "text" column (article bodies) and a "title" column.

  Returns:
    The tokenizer output for the prefixed, cleaned articles, extended with
    "labels" (title token ids) and "decoder_attention_mask" (title attention mask).
    Inputs are padded/truncated to ``max_input_length`` and titles to
    ``max_target_length``.
  """
  prefixed_articles = [prefix + clean_text(article) for article in examples["text"]]
  model_inputs = tokenizer(prefixed_articles, max_length=max_input_length,
                           padding='max_length', truncation=True)

  # Titles are passed as `text_target` so they are tokenized as targets.
  tokenized_titles = tokenizer(text_target=examples["title"],
                               max_length=max_target_length,
                               padding='max_length',
                               truncation=True)

  # NOTE(review): labels keep the tokenizer's padding ids (they are not replaced by -100 here).
  # If training relies on the model's built-in loss, padded positions would be scored —
  # confirm that the training code masks them.
  model_inputs["labels"] = tokenized_titles["input_ids"]
  model_inputs["decoder_attention_mask"] = tokenized_titles["attention_mask"]
  return model_inputs

In [11]:
# Cap the training set at 100,000 articles, taken from a seeded shuffle.
train_shuffled = medium_datasets['train'].shuffle(seed=10)
medium_datasets['train'] = train_shuffled.select(range(100000))

In [12]:
# Row counts per split, then each split's share of the total as a percentage.
n_samples_train, n_samples_validation, n_samples_test = (
    len(medium_datasets[split_name]) for split_name in ("train", "validation", "test")
)
n_samples_total = sum((n_samples_train, n_samples_validation, n_samples_test))

for split_label, split_count in (
    ("Training", n_samples_train),
    ("Validation", n_samples_validation),
    ("Test", n_samples_test),
):
    print(f"- {split_label} set: {split_count*100/n_samples_total:.2f}%")

- Training set: 86.21%
- Validation set: 6.90%
- Test set: 6.90%


In [13]:
# Tokenize every split in batches using 3 worker processes. The tokenized columns are added
# alongside the original ones (see the feature list in the output below).
medium_datasets = medium_datasets.map(preprocess_data, batched=True, num_proc=3)
medium_datasets

Map (num_proc=3):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/8000 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/8000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 100000
    })
    validation: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['title', 'text', 'url', 'authors', 'timestamp', 'tags', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 8000
    })
})

In [14]:
# Persist the tokenized splits to a local directory (relative to the current working directory).
medium_datasets.save_to_disk('medium_articles_t5_small_tokenized.hf')

Saving the dataset (0/2 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8000 [00:00<?, ? examples/s]

In [15]:
# Archive the saved dataset directory into a single zip file.
# NOTE(review): these paths are absolute under /content, while the dataset was saved with a
# relative path — this assumes the working directory is /content; confirm before running elsewhere.
!zip -r /content/medium_articles_t5_small_tokenized.hf.zip /content/medium_articles_t5_small_tokenized.hf

  adding: content/medium_articles_t5_small_tokenized.hf/ (stored 0%)
  adding: content/medium_articles_t5_small_tokenized.hf/validation/ (stored 0%)
  adding: content/medium_articles_t5_small_tokenized.hf/validation/data-00000-of-00001.arrow (deflated 68%)
  adding: content/medium_articles_t5_small_tokenized.hf/validation/state.json (deflated 38%)
  adding: content/medium_articles_t5_small_tokenized.hf/validation/dataset_info.json (deflated 80%)
  adding: content/medium_articles_t5_small_tokenized.hf/test/ (stored 0%)
  adding: content/medium_articles_t5_small_tokenized.hf/test/data-00000-of-00001.arrow (deflated 68%)
  adding: content/medium_articles_t5_small_tokenized.hf/test/state.json (deflated 38%)
  adding: content/medium_articles_t5_small_tokenized.hf/test/dataset_info.json (deflated 80%)
  adding: content/medium_articles_t5_small_tokenized.hf/train/ (stored 0%)
  adding: content/medium_articles_t5_small_tokenized.hf/train/data-00001-of-00002.arrow (deflated 68%)
  adding: conte