In [1]:
# Show which GPU, driver and CUDA version this session has before doing any work.
!nvidia-smi

Tue Apr  9 22:06:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-12GB           Off | 00000000:03:00.0 Off |                    0 |
| N/A   34C    P0              27W / 250W |      0MiB / 12288MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
import os

# Set up-front, ahead of the tokenizer that is loaded further down.
# The value is the literal string "false", not a Python bool.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [2]:
! pip install datasets transformers rouge-score nltk py7zr

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import wandb
# Authenticate this session with Weights & Biases; returns True on success.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpranaysaggar[0m ([33mpranay-saggar[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# W&B project name, exported through the environment for the training run.
# NOTE(review): the name mentions BART/XSum/SAMSum, but this notebook fine-tunes
# "google/flan-t5-base" on TinyStories. Presumably a leftover; confirm before renaming.
%env WANDB_PROJECT=bart_large_xsum_samsum

env: WANDB_PROJECT=bart_large_xsum_samsum


In [4]:
# Hub id of the pretrained checkpoint; used below for both the tokenizer and the model.
model_checkpoint = "google/flan-t5-base"

## Loading the dataset

In [5]:
from datasets import *
from sklearn.model_selection import train_test_split
# Fixed seed so train/validation/test membership is identical on every run
# (without it, each re-run shuffles differently and metrics are not comparable).
SPLIT_SEED = 42

ds = load_dataset("skeskinen/TinyStories-GPT4")
# ROUGE scorer consumed by compute_metrics, which reads `.mid.fmeasure` from its results.
metric = load_metric("rouge")

# Work on the first 5000 stories only: 80% train, 20% held out.
train_testvalid = ds['train'].select(range(5000)).train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Split the held-out 20% in half: 10% validation, 10% test.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits into a single DatasetDict.
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


## Preprocessing the data

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer that matches `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
# Token budgets: stories are truncated to 512 tokens, summaries to 128.
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of stories and their reference summaries.

    Args:
        examples: A batch mapping with a "story" column (model inputs) and a
            "summary" column (targets), each a list of strings.

    Returns:
        The tokenizer output for the stories, with an added "labels" entry
        holding the token ids of the tokenized summaries.
    """
    model_inputs = tokenizer(examples["story"], max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager. Targets are tokenized in a separate call because they
    # use a different max_length than the inputs.
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Apply preprocess_function to every split, one batch of examples at a time.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

## Fine-tuning the model

In [9]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained seq2seq weights, then move them onto the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

In [10]:
# Per-device batch size, shared by training and evaluation.
batch_size = 4

args = Seq2SeqTrainingArguments(
    "test-story-summarization",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- batching: gradients are accumulated over 2 steps before each update ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    # --- evaluation and checkpointing ---
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=2,
)

In [11]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [12]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (in percent) and mean generated length.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids. Either array may contain ``-100`` as padding.

    Returns:
        A dict mapping each ROUGE key to its mid F-measure times 100, plus
        ``"gen_len"`` (mean count of non-pad tokens per prediction), with all
        values rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Keep only the generated ids if extra outputs came along in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id and cannot be decoded. It can appear in the
    # predictions as well as in the labels, so map it to the pad token in both.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length, counted after the -100 replacement so padding is excluded.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [13]:
# List the split names held by the DatasetDict, one per line.
for split_name in dataset.keys():
    print(split_name)

train
test
validation


In [14]:

# Wire the model, hyper-parameters, tokenized splits and metric callback together.
# Training uses the "train" split; per-epoch evaluation uses "validation".
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [15]:
import nltk
# Fetch the 'punkt' data package; compute_metrics calls nltk.sent_tokenize,
# so this has to run before the first evaluation.
# NOTE(review): newer nltk releases may need 'punkt_tab' instead. Confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/saggar.p/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# Run fine-tuning; validation metrics are computed at the end of each epoch.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.2719,1.024203,49.0331,26.9531,41.8713,42.5925,18.938
2,1.1294,0.987495,49.5681,27.6753,42.4194,43.1537,18.948
3,1.0882,0.978757,49.9123,28.0159,42.6969,43.3514,18.946




TrainOutput(global_step=1500, training_loss=1.16317822265625, metrics={'train_runtime': 3430.0994, 'train_samples_per_second': 3.498, 'train_steps_per_second': 0.437, 'total_flos': 4718095851036672.0, 'train_loss': 1.16317822265625, 'epoch': 3.0})

In [17]:
# Re-score the final model on the eval_dataset given to the trainer (the validation split).
trainer.evaluate()





{'eval_loss': 0.9787572026252747,
 'eval_rouge1': 49.9123,
 'eval_rouge2': 28.0159,
 'eval_rougeL': 42.6969,
 'eval_rougeLsum': 43.3514,
 'eval_gen_len': 18.946,
 'eval_runtime': 239.7256,
 'eval_samples_per_second': 2.086,
 'eval_steps_per_second': 0.521,
 'epoch': 3.0}

In [18]:

# Close the active W&B run so its logs are uploaded and a summary is printed.
wandb.finish()

VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/gen_len,▁█▇▇
eval/loss,█▂▁▁
eval/rouge1,▁▅██
eval/rouge2,▁▆██
eval/rougeL,▁▆██
eval/rougeLsum,▁▆██
eval/runtime,█▁▂▁
eval/samples_per_second,▁█▇█
eval/steps_per_second,▁█▇▇
train/epoch,▁▁▅▅████

0,1
eval/gen_len,18.946
eval/loss,0.97876
eval/rouge1,49.9123
eval/rouge2,28.0159
eval/rougeL,42.6969
eval/rougeLsum,43.3514
eval/runtime,239.7256
eval/samples_per_second,2.086
eval/steps_per_second,0.521
train/epoch,3.0


The cells below [upload the fine-tuned model](https://huggingface.co/transformers/model_sharing.html) and its tokenizer to the [🤗 Model Hub](https://huggingface.co/models). Once uploaded, the model can be loaded by its Hub name to generate summaries, as shown in the last cell.

In [37]:
from transformers import AutoModelForSeq2SeqLM
from huggingface_hub import notebook_login
# Show the Hugging Face login widget; pushing to the Hub below presumably needs this. Confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:

model.save_pretrained("./model")  # Save model files to a local directory

# Load the saved model
# NOTE(review): this rebinds `model` to a fresh copy read from ./model, so later
# cells no longer refer to the object that was passed to the trainer.
model = AutoModelForSeq2SeqLM.from_pretrained("./model")
# Push the model to the Hugging Face Model Hub
model.push_to_hub("pranaysaggar/flan_t-5_story_summarizer")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pranaysaggar/flan_t-5_story_summarizer/commit/f8eca38610d78995aa0ae0f6ddb16740a2513a45', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='f8eca38610d78995aa0ae0f6ddb16740a2513a45', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# Upload the tokenizer to the same Hub repository as the model so the repo is self-contained.
tokenizer.push_to_hub("pranaysaggar/flan_t-5_story_summarizer")

CommitInfo(commit_url='https://huggingface.co/pranaysaggar/flan_t-5_story_summarizer/commit/f35eee331bd54b698a9276211c555d623f139387', commit_message='Upload tokenizer', commit_description='', oid='f35eee331bd54b698a9276211c555d623f139387', pr_url=None, pr_revision=None, pr_num=None)

In [42]:
from transformers import pipeline

# Build a summarization pipeline from the repository pushed to the Hub.
summarizer = pipeline("summarization", model="pranaysaggar/flan_t-5_story_summarizer")
# Sample input: despite the variable name, this is a short story, not a dialogue.
conversation = '''In the heart of a bustling city, nestled between towering skyscrapers and bustling streets, there stood a small, forgotten bookstore. Its weathered sign creaked in the wind, bearing the name "Whispering Pages." Inside, the shelves were lined with books of every genre, their spines bearing the weight of countless tales waiting to be discovered.

Amidst the chaos of modern life, a young woman named Emily found solace within the walls of Whispering Pages. Every evening, she would slip away from the noise and lose herself in the magic of storytelling. From epic adventures to tender romances, each book offered her an escape, a chance to wander through worlds far beyond her own.

One fateful day, Emily stumbled upon an old, leather-bound tome hidden in the dusty corner of the store. Its pages were worn and yellowed with age, yet its words beckoned her with an irresistible allure. As she flipped through the brittle pages, she felt a whisper of something ancient and mysterious, as if the book held secrets waiting to be unveiled. And so, with trembling hands, Emily embarked on a journey unlike any other, guided by the power of imagination and the promise of adventure.
'''
# The pipeline returns a list with one dict per input, keyed by 'summary_text'.
print(summarizer(conversation))

[{'summary_text': 'A young woman named Emily finds solace within the walls of Whispering Pages, a forgotten bookstore in a bustling city, with the magic of storytelling and the promise of adventure.'}]
