In [2]:
import os
import zipfile 
import gdown

In [4]:
# Folder (relative to the notebook's working directory) that receives the
# downloaded archive, the extracted CSV files and the tokenized datasets.
data_dir = "../artifacts/data_ingestion"
# Google Drive link handed to gdown to fetch the dataset archive.
url = "https://drive.google.com/uc?id=183Q4dQHRHc6YuIiYrquREdrAgxQGGqAU"

# exist_ok=True keeps this cell safe to re-run once the folder exists.
os.makedirs(data_dir, exist_ok=True)

In [8]:
# Download the dataset archive only when it is not already inside data_dir.
# The path is built once with os.path.join so the existence check and the
# download target always name the same file. Concatenating data_dir and
# 'data.zip' without a separator would test a path that never exists and
# trigger a fresh download on every run.
zip_path = os.path.join(data_dir, 'data.zip')
if not os.path.exists(zip_path):
    gdown.download(url, output=zip_path)

Downloading...
From (uriginal): https://drive.google.com/uc?id=183Q4dQHRHc6YuIiYrquREdrAgxQGGqAU
From (redirected): https://drive.google.com/uc?id=183Q4dQHRHc6YuIiYrquREdrAgxQGGqAU&confirm=t&uuid=11870b17-22ce-491e-9bfd-44db247dffe8
To: d:\WorkSpace\NLP\Practice\Text-Summarization\artifacts\data_ingestion\data.zip
100%|██████████| 528M/528M [00:41<00:00, 12.8MB/s] 


In [9]:
from zipfile import ZipFile 

# Unpack the archive only once: finding data/train.csv under data_dir is
# treated as the sign that an earlier run already extracted everything.
train_csv_path = os.path.join(data_dir, 'data', 'train.csv')
already_extracted = os.path.exists(train_csv_path)

if not already_extracted:
    with ZipFile(data_dir + '/data.zip', mode='r') as archive:
        archive.extractall(path=data_dir)

In [10]:
import pandas as pd

# Load the training split that was extracted into <data_dir>/data/.
df = pd.read_csv(data_dir+'/data/train.csv')

In [11]:
# Discard the identifier column; tokenization only reads 'article' and
# 'highlights'. errors='ignore' keeps this cell idempotent: running it a
# second time (when 'id' is already gone) no longer raises KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [17]:
from datasets import Dataset 

# Build the training Dataset from a fixed-size random subset of the frame.
TRAIN_SAMPLE_SIZE = 15000  # number of training rows to keep
SAMPLE_SEED = 42           # pins the draw so Restart & Run All is reproducible

# min() guards against a frame smaller than the requested sample, which would
# otherwise make DataFrame.sample raise ValueError.
df_sample = df.sample(n=min(TRAIN_SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# After sampling the index is no longer a RangeIndex; preserve_index=False
# stops it from being stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df_sample, preserve_index=False)

In [13]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer here and the model loaded in
# the training setup cell.
model_name = 't5-base'

# Tokenizer matching the checkpoint; convert_to_features reads this global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 1.39MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [14]:
def convert_to_features(batch):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Uses the module-level ``tokenizer``.

    Args:
        batch: Mapping with an ``'article'`` entry (source texts) and a
            ``'highlights'`` entry (reference summaries).

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        encoded articles and ``'labels'`` taken from the ``'input_ids'`` of
        the encoded highlights.
    """
    # Source side: articles are truncated to at most 1024 tokens.
    article_enc = tokenizer(batch['article'], max_length=1024, truncation=True)

    # Target side: summaries are encoded inside the tokenizer's target
    # context and truncated to at most 256 tokens.
    with tokenizer.as_target_tokenizer():
        summary_enc = tokenizer(batch['highlights'], max_length=256, truncation=True)

    features = {key: article_enc[key] for key in ('input_ids', 'attention_mask')}
    features['labels'] = summary_enc['input_ids']
    return features

In [19]:
# Tokenize the sampled training set in batches (convert_to_features receives
# a batch of rows per call because batched=True).
dataset_pt = dataset.map(convert_to_features, batched=True)
# Persist the tokenized training set under data_dir.
dataset_pt.save_to_disk(data_dir+'/train_dataset')

Map: 100%|██████████| 15000/15000 [00:16<00:00, 895.57 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15000/15000 [00:00<00:00, 125040.61 examples/s]


In [16]:
# Load the validation split and prepare it the same way as the training data.
df_val = pd.read_csv(data_dir+'/data/validation.csv')
# Mirror the training frame, which drops 'id' before tokenization, so both
# saved datasets share one schema. errors='ignore' tolerates a file without
# that column and keeps the cell re-runnable.
df_val = df_val.drop(columns=['id'], errors='ignore')
dataset_val = Dataset.from_pandas(df_val)

# Tokenize in batches with the shared feature converter, then persist the
# result under data_dir.
dataset_val_pt = dataset_val.map(convert_to_features, batched=True)
dataset_val_pt.save_to_disk(data_dir+'/validation_dataset')

Map: 100%|██████████| 13368/13368 [00:12<00:00, 1043.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 13368/13368 [00:00<00:00, 100249.16 examples/s]


In [20]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM
import torch

In [23]:
# Output folder for training artifacts; exist_ok=True makes re-runs harmless.
os.makedirs('../artifacts/model_trainer/', exist_ok=True)

In [24]:
# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Pretrained seq2seq model for the checkpoint named by model_name, and the
# collator that assembles tokenized examples into batches for it.
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(model_name)
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model_t5)

# Training configuration, one option per line.
trainer_args = TrainingArguments(
    output_dir='../artifacts/model_trainer',
    num_train_epochs=1,
    warmup_steps=500,
    # One example per device per step, accumulated over 16 steps before each
    # optimizer update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # NOTE(review): presumably chosen far above the total step count so that
    # no intermediate checkpoints are written - confirm.
    save_steps=1e6,
)

# Wire model, arguments, tokenizer, collator and both tokenized datasets
# into the Trainer; the model is moved to the selected device first.
trainer = Trainer(
    model=model_t5.to(device),
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_pt,
    eval_dataset=dataset_val_pt,
)

In [22]:
# Run training with the Trainer configured in the previous cell.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt,
# so in that session the cells below presumably saved a model whose training
# did not finish - confirm before relying on the saved artifacts.
trainer.train()

  0%|          | 0/937 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [None]:
# Persist the model and the tokenizer via their save_pretrained methods, each
# into its own folder under ../artifacts/model_trainer.
model_t5.save_pretrained('../artifacts/model_trainer/model_t5')
tokenizer.save_pretrained('../artifacts/model_trainer/tokenizer_t5')

In [None]:
import pickle

# Also serialize the in-memory model object with pickle, next to the
# save_pretrained output. Pickle files must only ever be loaded from trusted
# sources, because unpickling can execute arbitrary code.
model_pickle_path = "../artifacts/model_trainer/model"
with open(model_pickle_path, mode='wb') as model_fh:
    pickle.dump(model_t5, model_fh)

In [None]:
# Serialize the tokenizer object with pickle as well (relies on the pickle
# import from the previous cell). Only load such files from trusted sources.
tokenizer_pickle_path = '../artifacts/model_trainer/tokenizer'
with open(tokenizer_pickle_path, mode='wb') as tokenizer_fh:
    pickle.dump(tokenizer, tokenizer_fh)