In [None]:
from google.colab import drive
# Mount Google Drive so the dataset CSV and checkpoints under /content/drive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# !pip install huggingface_hub
# !pip install ipywidgets
from huggingface_hub import notebook_login
from huggingface_hub import login

Reference: https://huggingface.co/docs/transformers/tasks/summarization#load-billsum-dataset

In [None]:
# Read the first 80,000 rows of the CNN/DailyMail training CSV, drop the "id"
# column and rename "highlights" to "summary" in a single chained expression.
# For quick experiments the frame can be sliced further, e.g. news_dataset_cnndailymail[0:100].
news_dataset_cnndailymail = (
    pd.read_csv("/content/drive/MyDrive/cnn_dailymail/train.csv", nrows=80000)
    .drop(columns=["id"])
    .rename(columns={"highlights": "summary"})
)

# Number of rows actually loaded; used later when the training step count is derived.
rows_count = len(news_dataset_cnndailymail)
news_dataset_cnndailymail

Unnamed: 0,article,summary
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...
79995,Tesco is at the centre of cruelty allegations ...,Customers can order the animals over the count...
79996,By . Daily Mail Reporter . PUBLISHED: . 10:42 ...,U.S-led peace talks falter after the deaths of...
79997,The chaotic scenes inside a New York subway st...,A passer-by captured cell phone footage of the...
79998,(GameTap.com) -- Everyone wants to be more phy...,Entire 'Wii Fit' setup doesn't take up much sp...


Converting the pandas DataFrame into a Hugging Face `Dataset`


In [None]:
# !pip install pyarrow
!pip install datasets
# !pip install transformers

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset

### convert to Huggingface dataset
# Dataset.from_pandas is the public constructor for DataFrame input;
# preserve_index=False keeps the pandas index out of the dataset features.
# NOTE: this cell rebinds `news_dataset_cnndailymail` from a DataFrame to a
# DatasetDict, so re-run the CSV loading cell before re-running this one.
news_dataset_cnndailymail = Dataset.from_pandas(news_dataset_cnndailymail, preserve_index=False)

# Hold out 10% of the rows as the test split; the fixed seed makes the shuffle
# (and therefore the split) reproducible across kernel restarts.
news_dataset_cnndailymail = news_dataset_cnndailymail.train_test_split(test_size=0.1, seed=42)
news_dataset_cnndailymail

DatasetDict({
    train: Dataset({
        features: ['article', 'summary'],
        num_rows: 72000
    })
    test: Dataset({
        features: ['article', 'summary'],
        num_rows: 8000
    })
})

Tokenization

In [None]:
from transformers import AutoTokenizer
# Text prefix that preprocess_function prepends to every article.
prefix = "summarize: "

# Tokenizer loaded from the "t5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function( data ):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        data: A batch mapping with an "article" sequence and a "summary"
            sequence (the layout `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens) with an added "labels" entry holding the input ids of the
        tokenized summaries (truncated to 128 tokens).
    """
    # Prepend the module-level task prefix to every article.
    prefixed_articles = []
    for doc in data["article"]:
        prefixed_articles.append(prefix + doc)

    model_inputs = tokenizer(prefixed_articles, max_length=1024, truncation=True)

    # Summaries are tokenized as targets; only their input ids are kept.
    target_encoding = tokenizer(text_target=data["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]

    return model_inputs

# Apply preprocess_function to every split, one batch of examples at a time.
tokenized_news_dataset_cnndailymail = news_dataset_cnndailymail.map(
    preprocess_function,
    batched=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/72000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dynamic Padding

In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator that pads each batch dynamically and returns TensorFlow tensors.
# NOTE(review): `model` is given the checkpoint name string rather than a model
# instance — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model="t5-small",
    return_tensors="tf",
)

Metric

In [None]:
# !pip install -U git+https://github.com/huggingface/accelerate.git
from transformers import pipeline

# !pip install evaluate
# !pip install rouge_score
import evaluate
# ROUGE metric object; compute_metrics calls its .compute() method.
rouge = evaluate.load(path="rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import numpy as np
def compute_metrics(eval_pred):
    """Score predicted token ids against reference labels with ROUGE.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays.

    Returns:
        A dict of the values returned by `rouge.compute` plus "gen_len"
        (mean count of non-pad tokens per prediction), each rounded to
        four decimal places.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 entries in the labels are swapped for the pad id so they can be decoded.
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Prediction length = number of tokens that are not padding.
    result["gen_len"] = np.mean(
        [np.count_nonzero(pred != pad_id) for pred in predictions]
    )

    rounded = {}
    for key, value in result.items():
        rounded[key] = round(value, 4)
    return rounded

Model Setup

In [None]:
# import tensorflow
# AUTO = tensorflow.data.experimental.AUTOTUNE

# # # Create strategy from tpu
# tpu = tensorflow.distribute.cluster_resolver.TPUClusterResolver()
# tensorflow.config.experimental_connect_to_cluster(tpu)
# tensorflow.tpu.experimental.initialize_tpu_system(tpu)
# strategy = tensorflow.distribute.experimental.TPUStrategy(tpu)

# BATCH_SIZE = 16 * strategy.num_replicas_in_sync

In [None]:
from transformers import create_optimizer, AdamWeightDecay
from transformers import TFAutoModelForSeq2SeqLM
import tensorflow as tf
import tensorflow
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback

In [None]:
# with strategy.scope():
#     optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
#     model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")
#     # Dataset setup for training
#     tf_train_set = model.prepare_tf_dataset( tokenized_news_dataset_cnndailymail["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator )
#     tf_test_set = model.prepare_tf_dataset( tokenized_news_dataset_cnndailymail["test"], shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator )

#     model.compile(optimizer=optimizer)  # No loss argument!
# model.summary()

In [None]:
# Batch size shared by the training and evaluation tf.data pipelines.
BATCH_SIZE = 8

# Optimizer and the pretrained "t5-small" sequence-to-sequence model.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Dataset setup for training: only the training split is shuffled, and both
# splits are batched through the shared data collator.
tf_train_set = model.prepare_tf_dataset(
    tokenized_news_dataset_cnndailymail["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)
tf_test_set = model.prepare_tf_dataset(
    tokenized_news_dataset_cnndailymail["test"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

# Compiled with the optimizer only — deliberately no loss argument.
model.compile(optimizer=optimizer)
model.summary()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  35330816  
                                                                 
 decoder (TFT5MainLayer)     multiple                  41625344  
                                                                 
Total params: 60506624 (230.81 MB)
Trainable params: 60506624 (230.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Destination on the mounted Drive for the best checkpoint.
BEST_MODEL_PATH = "/content/drive/MyDrive/cnn_dailymail/finetune/best_model"

# Save the full model (not just the weights) whenever val_loss reaches a new minimum.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=BEST_MODEL_PATH,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
    save_format="tf",
)

In [None]:
# Steps per epoch: 10% of the loaded rows divided by the batch size.
# NOTE(review): this value is passed as steps_per_epoch for the *training* set
# in model.fit, yet it is derived from the 10% test fraction — confirm that
# short epochs (a fraction of the training data each) are intended.
N_STEPS = int( (rows_count*0.1))//BATCH_SIZE
EPOCHS = 10

# predict_with_generate=True makes the callback produce predictions with
# model.generate(), so compute_metrics receives token ids it can decode.
# With the default (False) the metric function is handed the raw model outputs
# instead, which tokenizer.batch_decode cannot process.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
    predict_with_generate=True,
)

# Halve the learning rate when val_loss has not improved by at least 1e-4 for 2 epochs.
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=0,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)

# Callback that pushes the model and tokenizer to the Hub repository named by output_dir.
push_to_hub_callback = PushToHubCallback(
    output_dir="Fine_Tune_T5_Model_News_Summarization",
    tokenizer=tokenizer,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/Fine_Tune_T5_Model_News_Summarization is already a clone of https://huggingface.co/AvianSpread/Fine_Tune_T5_Model_News_Summarization. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
from tensorflow.keras.utils import custom_object_scope
def register_custom_objects():
    """Return the mapping of custom object names to their classes.

    Returns:
        A dict with the single entry "AdamWeightDecay" -> AdamWeightDecay.
        Presumably meant for `custom_object_scope` when a saved model is
        reloaded; the function is not called in the visible cells.
    """
    custom_objects = {"AdamWeightDecay": AdamWeightDecay}
    return custom_objects


# Re-create the optimizer and recompile the model. These statements belong at
# module level: the previous two-space indent matched neither the function body
# nor the top level and raised an IndentationError.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

In [None]:
# Train for EPOCHS epochs of N_STEPS batches each, validating on the test split.
# Only the LR scheduler and the best-checkpoint callback are attached here.
model.fit(
    x=tf_train_set,
    validation_data=tf_test_set,
    steps_per_epoch=N_STEPS,
    epochs=EPOCHS,
    callbacks=[lr_reduce, checkpoint_callback],
)

Epoch 1/10
Epoch 1: val_loss improved from inf to 1.69848, saving model to /content/drive/MyDrive/cnn_dailymail/finetune/best_model
Epoch 2/10
Epoch 2: val_loss improved from 1.69848 to 1.69405, saving model to /content/drive/MyDrive/cnn_dailymail/finetune/best_model
Epoch 3/10
Epoch 3: val_loss improved from 1.69405 to 1.69368, saving model to /content/drive/MyDrive/cnn_dailymail/finetune/best_model
Epoch 4/10
Epoch 4: val_loss improved from 1.69368 to 1.68747, saving model to /content/drive/MyDrive/cnn_dailymail/finetune/best_model
Epoch 5/10
Epoch 5: val_loss did not improve from 1.68747
Epoch 6/10
  96/1000 [=>............................] - ETA: 11:40 - loss: 1.8379

KeyboardInterrupt: 

###