# Fine-tune FLAN-T5 for News Title Generation

_Note: This notebook is adapted from the Hugging Face example notebooks._

## 1. Setup Development Environment


In [None]:
# Alternative install that upgrades every package to its latest release (disabled):
# !pip install pytesseract transformers datasets evaluate rouge-score nltk tensorboard py7zr --upgrade
# Active install: same packages, with transformers pinned to 4.28.1 and the rest unpinned.
!pip install pytesseract transformers==4.28.1 datasets evaluate rouge-score nltk tensorboard py7zr

In [2]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook point into drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login. The training cell is configured with
# push_to_hub=True and reads the saved token, so this has to run first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Import dependencies

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
import datasets

## 2. Load and prepare news dataset




In [5]:
import pandas as pd
from datasets import Dataset, DatasetDict
import random

In [6]:
# Dataset location on the mounted Drive. The path is relative, so it resolves
# against the notebook's current working directory.
path_prefix = "drive/MyDrive/646:IR/"
dataset_name = 'Lamp4U/'

# CSV file name; the same name is read from both the train/ and vali/ sub-folders.
llm_input = 'bm25_hnsw_llm.csv'


In [7]:
# Root folder of the dataset; the train/ and vali/ sub-folders each hold one CSV.
path = f"{path_prefix}{dataset_name}"

# Only the model input ('text') and the target title ('label') are needed.
_csv_columns = ['text', 'label']
train_df = pd.read_csv(f"{path}train/{llm_input}", usecols=_csv_columns)
validate_df = pd.read_csv(f"{path}vali/{llm_input}", usecols=_csv_columns)


In [8]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,text,label
0,"""Investing In a Larger Down Payment: High Yiel...",The Pre-Crisis HELOC Chickens Are Coming Home ...
1,"""Some Recent Letters on Downsizing and Mortgag...",Selling a House to Buy a House
2,"""Why and How to Eliminate Mortgage Charges by ...",The Tontine: A 17th Century Solution to a 21st...
3,"""National Park Views Not To Be Missed (PHOTOS)...",The 5 Best National Parks In The World
4,"""Can You Recognize Your City From Above?"" is t...",7 Incredible Infinity Pools (PHOTOS)


In [9]:

# Convert both pandas frames to Hugging Face Datasets, then bundle them under
# the split names ("train" / "vali") that the rest of the notebook indexes by.
train_dataset, validate_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, validate_df)
)

dataset = DatasetDict({"train": train_dataset, "vali": validate_dataset})

In [10]:
# Show one training example: the 'text' column is the model input and the
# 'label' column is the title the model should generate.
sample = dataset['train'][0]
print(f"text: \n{sample['text']}\n---------------")
print(f"title: \n{sample['label']}\n---------------")

text: 
"Investing In a Larger Down Payment: High Yields and No Risk" is the title for "Consumers looking to purchase a home within the near future face many decisions, including how large a down payment to make. The down payment is the sale price (confirmed by a appraisal) less the loan amount. In most cases, home purchasers must have financial assets at least as large as the down payment they make.", and "Some Recent Letters on Downsizing and Mortgage Lender Mistakes" is the title for "In pricing loans used to purchase a home, lenders distinguish three possible uses of the property. Mortgages used to purchase a house that the purchasers intend to occupy as their primary residence get the best price.", and "Do You Have a Simple Interest Mortgage?" is the title for "This is a good time, therefore, for borrowers to make sure that their mortgage has not been converted into a SIM, and if it has, to develop a plan for protecting themselves. It isn't all that difficult once you know the dril

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the base checkpoint whose tokenizer is loaded here.
model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [12]:
from datasets import concatenate_datasets


def _tokenize_all_splits(column):
    """Tokenize `column` of train + vali (truncation on, no padding), dropping the raw columns."""
    return concatenate_datasets([dataset["train"], dataset["vali"]]).map(
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=['text', 'label'],
    )


# Longest tokenized input across both splits. This value is later passed as
# `max_length` when the inputs are tokenized for training.
tokenized_inputs = _tokenize_all_splits("text")
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Longest tokenized target (title) across both splits, used the same way for
# the labels.
tokenized_targets = _tokenize_all_splits("label")
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Max target length: 33


In [13]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch into encoder inputs and decoder labels.

    Args:
        sample: batch with a "text" column (model input) and a "label" column
            (target title).
        padding: padding strategy forwarded to the tokenizer for both inputs
            and targets.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry
        holding the target token ids.
    """
    # The text is used as-is; no task prefix is prepended.
    source_texts = list(sample["text"])

    # Encoder side.
    model_inputs = tokenizer(source_texts, max_length=max_source_length, padding=padding, truncation=True)

    # Decoder side: `text_target` makes the tokenizer treat the strings as targets.
    target_encoding = tokenizer(text_target=sample["label"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = target_encoding["input_ids"]

    # With fixed-length padding, swap pad ids for -100 so that padded
    # positions are ignored in the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in row]
            for row in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize every split in batches and drop the raw string columns, leaving
# only the fields produced by preprocess_function.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text', 'label'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## 3. Fine-tune and evaluate FLAN-T5



In [14]:
# NOTE: AutoModelForSeq2SeqLM and model_id were already imported / defined in
# the tokenizer cell; they are repeated here with the same values.
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
model_id="google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Download the "punkt" data for NLTK; sent_tokenize is used by the helpers below.
nltk.download("punkt")

# Metric
# ROUGE is the metric loaded here; the trailing note records "f1" as an alternative that was considered.
metric = evaluate.load("rouge") #f1 or rouge

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Normalise decoded predictions and references before scoring.

    Each string is stripped of surrounding whitespace and re-joined with one
    sentence per line.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of the normalised string lists.
    """
    def _one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Decode generated ids and reference labels and score them with ROUGE.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        dict with every score returned by the metric, scaled to percentages
        and rounded to 4 decimals, plus ``gen_len``: the mean number of
        non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index and cannot be decoded. Labels always contain
    # it; predictions may too when batches are padded to a common length.
    # NOTE(review): confirm the prediction padding value for the pinned
    # transformers version. The replacement is a no-op if -100 never occurs.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # The ROUGE metric has no `average` argument (that belongs to f1-style
    # metrics); passing one makes `compute` raise a TypeError.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
# (same -100 value that preprocess_function writes into padded label positions)
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [19]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments



# Hugging Face repository id (also used as the local output directory)
repository_id = "flan-t5-base-bm25-hnsw-bert-news-title-generation"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-4,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="epoch",
    # logging_steps=1000,
    # Evaluation is disabled, so compute_metrics is not invoked during
    # training and only the training loss is logged.
    evaluation_strategy="no",
    save_strategy="epoch",
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=False,
    # Must name a key that compute_metrics returns (ROUGE scores). The
    # previous "overall_f1" is not produced by the ROUGE metric and would
    # fail with a missing-key error once evaluation is turned on.
    metric_for_best_model="rouge1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
# The "vali" split and compute_metrics are registered here, but they only
# take effect when evaluation is run.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["vali"],
    compute_metrics=compute_metrics,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/flan-t5-base-bm25-hnsw-bert-news-title-generation is already a clone of https://huggingface.co/Shrutiya/flan-t5-base-bm25-hnsw-bert-news-title-generation. Make sure you pull the latest changes with `repo.git_pull()`.


In [20]:
# Start training
trainer.train()

# Save our tokenizer and create model card
# (the tokenizer is written into the same local folder as the model output)
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
13,3.0079
26,1.8552
39,1.2389
52,0.9512
65,0.7787


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/945M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1702422894.727b03c6ddab.394.0:   0%|          | 1.00/6.09k [00:00<?, ?B/s…

To https://huggingface.co/Shrutiya/flan-t5-base-bm25-hnsw-bert-news-title-generation
   2852a08..309e625  main -> main

   2852a08..309e625  main -> main



'https://huggingface.co/Shrutiya/flan-t5-base-bm25-hnsw-bert-news-title-generation/commit/309e625dec411511efb7b41efef015f450f1ea25'

In [None]:
# NOTE(review): this cell repeats the save / model-card / push steps that the
# training cell already performs after trainer.train(), and it has no
# execution count. It looks redundant; confirm and remove.
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

## 4. Run Inference and Evaluate on the Validation Set

In [None]:
# Reload the fine-tuned checkpoint (tokenizer and weights) from the Hub and
# move the model to the GPU for generation.
finetuned_checkpoint = 'Shrutiya/flan-t5-base-bm25-hnsw-bert-news-title-generation'
tokenizer = AutoTokenizer.from_pretrained(finetuned_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_checkpoint)
model.to('cuda')

In [22]:
from tqdm.auto import tqdm

# Read the text column once. Indexing dataset['vali']['text'][i] inside the
# loop re-materialises the whole column on every iteration.
vali_texts = dataset['vali']['text']
samples_number = len(vali_texts)
progress_bar = tqdm(vali_texts)
predictions_list = []
for text in progress_bar:
  # truncation=True keeps inputs at 512 tokens, matching how the training
  # inputs were tokenized; without it long texts are passed through uncut.
  # Tensors follow the model's device instead of assuming 'cuda'.
  inputs = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt').to(model.device)
  outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512, num_beams=4, early_stopping=True)
  decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()
  # Keep only the first sentence as the title. An empty generation yields no
  # sentences, so fall back to the (empty) decoded string rather than raising
  # IndexError.
  sentences = nltk.sent_tokenize(decoded_output)
  predicted_title = sentences[0] if sentences else decoded_output
  predictions_list.append(predicted_title)

  0%|          | 0/101 [00:00<?, ?it/s]

In [23]:
#upload predicted outputs to drive
import json

# The gold file is used as a template: each entry under 'golds' gets its
# 'output' field overwritten with the model prediction, and the result is
# written to a separate predictions file.
output = 'dev_outputs.json'
with open(path+'vali/'+output, 'r') as f:
  outputs = json.load(f)

golds = outputs['golds']
# Predictions are matched to gold entries by position, so the counts must
# agree; otherwise some entries would silently keep their gold output.
if len(predictions_list) != len(golds):
  raise ValueError(
      f"Got {len(predictions_list)} predictions for {len(golds)} gold entries"
  )
for gold, prediction in zip(golds, predictions_list):
  gold['output'] = prediction

json_object = json.dumps(outputs, indent=4)

with open(path+'vali/dev_preds.json', "w") as outfile:
    outfile.write(json_object)

In [24]:
# Fetch the LaMP benchmark repository; its eval/eval_task.py script is run in the next cell.
!git clone https://github.com/LaMP-Benchmark/LaMP.git

Cloning into 'LaMP'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 42 (delta 9), reused 8 (delta 1), pack-reused 0[K
Receiving objects: 100% (42/42), 19.03 KiB | 2.72 MiB/s, done.
Resolving deltas: 100% (9/9), done.


In [25]:
# Alternative arguments pointing at the ensemble/bm25_hnsw_bert folder (disabled):
# --golds_json 'drive/MyDrive/646:IR/Lamp4U/ensemble/bm25_hnsw_bert/dev-new101.json' \
# --preds_json 'drive/MyDrive/646:IR/Lamp4U/ensemble/bm25_hnsw_bert/dev_preds.json'\
# --task_name "LaMP_4" \
# --output_file 'drive/MyDrive/646:IR/Lamp4U/ensemble/bm25_hnsw_bert/dev_eval.json'

# Run the LaMP evaluation script for task LaMP_4 on the validation predictions
# written earlier (dev_preds.json) against the gold file (dev_outputs.json);
# the scores are saved to dev_eval.json.
!python LaMP/eval/eval_task.py \
      --golds_json 'drive/MyDrive/646:IR/Lamp4U/vali/dev_outputs.json' \
      --preds_json 'drive/MyDrive/646:IR/Lamp4U/vali/dev_preds.json' \
      --task_name "LaMP_4"\
      --output_file 'drive/MyDrive/646:IR/Lamp4U/vali/dev_eval.json'


2023-12-12 23:44:05.902682: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-12 23:44:05.902744: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-12 23:44:05.902786: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
