# Fine-tune FLAN-T5 for News Data Categorization

_Note: This notebook is adapted from the Hugging Face example notebooks._

## 1. Setup Development Environment


In [None]:
# Install the notebook's dependencies. Only transformers is pinned (4.28.1); the
# other packages resolve to whatever version pip selects at install time.
# The commented-out line below is the unpinned `--upgrade` variant of the same install.
# !pip install pytesseract transformers datasets evaluate rouge-score nltk tensorboard py7zr --upgrade
!pip install pytesseract transformers==4.28.1 datasets evaluate rouge-score nltk tensorboard py7zr



In [None]:
# Mount Google Drive at /content/drive. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Log in to the Hugging Face Hub through the interactive widget. The training
# arguments further down read the token with HfFolder.get_token() and push to the Hub.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Import dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
import datasets

## 2. Load and prepare news dataset




In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
import random

In [None]:
# Location of the input data. The parts are joined by plain string concatenation
# in the next cell, so both directory fragments must keep their trailing slash.
# NOTE(review): path_prefix is a relative path; it presumably relies on the
# working directory being /content (where Drive is mounted) — confirm.
path_prefix = "drive/MyDrive/646:IR/"
dataset_name = 'Lamp2U/'

# CSV file name; the same file name is read from both the train/ and vali/ folders.
llm_input = 'bm25_hnsw_llm.csv'


In [None]:
# Dataset root folder. Kept as a plain string (with trailing slash) because
# file names are appended to it by string concatenation.
path = f"{path_prefix}{dataset_name}"

# Only the prompt text and its gold category are read from each split's CSV.
csv_columns = ['text', 'label']
train_df = pd.read_csv(f"{path}train/{llm_input}", usecols=csv_columns)
validate_df = pd.read_csv(f"{path}vali/{llm_input}", usecols=csv_columns)

In [None]:
# Preview the first rows of the training split (columns: text, label).
train_df.head()

Unnamed: 0,text,label
0,"the category for the article:""Though I might n...",business
1,"the category for the article:""(Apparently, Sha...",style & beauty
2,"the category for the article:""More from Vanity...",style & beauty
3,"the category for the article:""Look at an exclu...",entertainment
4,"the category for the article:""On Monday, The N...",style & beauty


In [None]:
# News categories of the task.
# NOTE(review): this list is not referenced by any other cell visible in this notebook.
categories = [
    'women', 'religion', 'politics', 'style & beauty', 'entertainment',
    'culture & arts', 'sports', 'science & technology', 'travel', 'business',
    'crime', 'education', 'healthy living', 'parents', 'food & drink',
]

# Wrap each pandas frame in a Hugging Face Dataset.
train_dataset, validate_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, validate_df)
)

# The validation split is registered under the key "vali".
dataset = DatasetDict({
    "train": train_dataset,
    "vali": validate_dataset,
})

In [None]:
# Inspect the first training example as a raw record.
dataset['train'][0]

{'text': 'the category for the article:"Though I might not subscribe to every prayer in the Siddur, I always use the synagogue time for my own prayer of thanks for being alive and the multitudes of blessings I enjoy.  I want to let the Lord know I haven\'t forgotten them.  Then, leaving the hall, the yarmulke still in place on my head, I head home feeling a little purer." is "religion", and the category for the article:"I expected him to do well and show me some of his work. The money could be renewed for the second semester and the next year of college. Though I imagined that Mel would want to maintain periodic contact -- at least it was what I hoped -- I was wrong." is "religion", and the category for the article:"Although my mother swept away any feeling for her native land, I saw my trips as partly for her, maybe an effort to reconnect her to a land that only I wanted her to reconnect with." is "travel", and the category for the article:"The three make a trip of atypical opera them

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the base checkpoint; only its tokenizer is loaded in this cell.
model_id="google/flan-t5-base"

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from datasets import concatenate_datasets

# Both length statistics are measured over the train and validation splits together.
combined_splits = concatenate_datasets([dataset["train"], dataset["vali"]])
raw_columns = ['text', 'label']

# Longest tokenized input text. This value is later used as `max_length` for the
# inputs: longer sequences are truncated to it, shorter ones are padded up to it.
# Tokenization here already uses truncation=True without an explicit max_length.
tokenized_inputs = combined_splits.map(
    lambda batch: tokenizer(batch["text"], truncation=True),
    batched=True,
    remove_columns=raw_columns,
)
max_source_length = max(len(token_ids) for token_ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Longest tokenized target label, used the same way as `max_length` for the targets.
tokenized_targets = combined_splits.map(
    lambda batch: tokenizer(batch["label"], truncation=True),
    batched=True,
    remove_columns=raw_columns,
)
max_target_length = max(len(token_ids) for token_ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Max source length: 489


Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Max target length: 5


In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        sample: Batch mapping with a "text" entry (input prompts) and a
            "label" entry (target category strings).
        padding: Padding strategy forwarded to the tokenizer for both the
            inputs and the targets.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids.
    """
    # The prompts are tokenized as they are; no task prefix is prepended.
    model_inputs = tokenizer(
        list(sample["text"]),
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )

    # Targets are tokenized through the `text_target` keyword argument.
    target_encoding = tokenizer(
        text_target=sample["label"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )
    target_ids = target_encoding["input_ids"]

    # When padding to a fixed length, pad positions in the labels are replaced
    # by -100 so that they are ignored in the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        target_ids = [
            [-100 if token_id == pad_id else token_id for token_id in row]
            for row in target_ids
        ]

    model_inputs["labels"] = target_ids
    return model_inputs

# Apply preprocess_function to every split in batches and drop the raw string
# columns, then print the feature names of the tokenized training split.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text', 'label'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## 3. Fine-tune and evaluate FLAN-T5



In [None]:
from transformers import AutoModelForSeq2SeqLM

# huggingface hub model id
# (re-assigns the same value that the tokenizer cell above already set)
model_id="google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Download the NLTK "punkt" data; sent_tokenize is called in postprocess_text below.
nltk.download("punkt")

# Metric
# Loaded once here and used by compute_metrics below.
metric = evaluate.load("f1") #f1 or rouge

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Normalize decoded predictions and labels before scoring.

    Every string is stripped of surrounding whitespace and then rewritten with
    one sentence per line, using NLTK's sent_tokenize to find the sentences.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A (preds, labels) tuple with the processed lists.
    """
    def _one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Score generated category names against the gold labels with macro F1.

    Args:
        eval_preds: (predictions, labels) pair of token-id arrays. The
            predictions may arrive as a tuple, in which case its first
            element is used.

    Returns:
        dict with the metric values scaled to percentages (rounded to four
        decimals) plus "gen_len", the mean number of non-pad tokens per
        prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded, so it is replaced by the pad token id.
    # NOTE(review): predictions are sanitized as well, in case they were padded
    # with -100 when batches were gathered; whether that happens presumably
    # depends on the transformers version.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # The "f1" metric scores integer class ids, not strings, so every distinct
    # decoded string (from the references or the predictions) gets its own id.
    # A generated string that matches no reference becomes a class of its own.
    class_names = sorted(set(decoded_labels) | set(decoded_preds))
    class_ids = {name: idx for idx, name in enumerate(class_names)}

    result = metric.compute(
        predictions=[class_ids[text] for text in decoded_preds],
        references=[class_ids[text] for text in decoded_labels],
        average='macro',
    )
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
# Built from the tokenizer and the model loaded above; label padding uses
# label_pad_token_id, and pad_to_multiple_of=8 is passed through to the collator.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments



# Hugging Face repository id (also used as the local output directory)
repository_id = "flan-t5-base-bm25-bert-news-data-categorization"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,
    learning_rate=3e-4,
    num_train_epochs=2,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="epoch",
    # logging_steps=1000,
    # Evaluation is switched off during training, so compute_metrics is not
    # invoked by trainer.train() with these settings.
    evaluation_strategy="no",
    save_strategy="epoch",
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=False,
    # Must name a key returned by compute_metrics; the loaded "f1" metric
    # reports its score under "f1".
    metric_for_best_model="f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    # Token stored by the earlier notebook_login() call.
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
# Trains on the tokenized "train" split; the tokenized "vali" split is registered
# as the evaluation set and compute_metrics as the metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["vali"],
    compute_metrics=compute_metrics,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Shrutiya/flan-t5-base-bm25-bert-news-data-categorization into local empty directory.


In [None]:
# Start training
# training_args configures 2 epochs, with a checkpoint saved every epoch and
# pushed to the Hub on every save.
trainer.train()

In [None]:
# Save our tokenizer and create model card
# The tokenizer is written into the local folder named by repository_id, which
# is also the trainer's output directory.
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/945M [00:00<?, ?B/s]

Upload file logs/events.out.tfevents.1702265228.53886e6e6f49.2416.4:   0%|          | 1.00/5.63k [00:00<?, ?B/…

To https://huggingface.co/Shrutiya/flan-t5-base-bm25-bert-news-data-categorization
   e0c9f75..b47dcb5  main -> main

   e0c9f75..b47dcb5  main -> main



'https://huggingface.co/Shrutiya/flan-t5-base-bm25-bert-news-data-categorization/commit/b47dcb5d696c2c7ef58c1800b49cfdf310350b20'

## 4. Run Inference and Classification Report

In [None]:
# Reload the fine-tuned tokenizer and model from the Hub repository and move the
# model to the GPU. These assignments replace the `tokenizer` and `model`
# objects used during training.
# NOTE(review): 'cuda' is hard-coded, so this cell needs a GPU runtime.
tokenizer = AutoTokenizer.from_pretrained('Shrutiya/flan-t5-base-bm25-bert-news-data-categorization')
model = AutoModelForSeq2SeqLM.from_pretrained('Shrutiya/flan-t5-base-bm25-bert-news-data-categorization')
model.to('cuda')

tokenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
from tqdm.auto import tqdm

# Read both columns once up front. Indexing dataset['vali']['text'][i] inside the
# loop would fetch the whole column again on every iteration.
vali_texts = dataset['vali']['text']
vali_labels = dataset['vali']['label']

samples_number = len(vali_texts)
# Iterating over the tqdm object advances the bar; no manual update() is needed.
progress_bar = tqdm(range(samples_number))
predictions_list = []
labels_list = []
for i in progress_bar:
  text = vali_texts[i]
  # truncation=True caps the prompt at max_length, in line with the truncation
  # applied when the training data was tokenized. Without it, a prompt longer
  # than max_length is handed to the model at its full length.
  inputs = tokenizer.encode_plus(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt').to('cuda')
  outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512, num_beams=4, early_stopping=True)
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  # Strip both sides so that stray whitespace cannot make a correct prediction
  # count as a different class in the report.
  predictions_list.append(prediction.strip())
  labels_list.append(vali_labels[i].strip())

  0%|          | 0/101 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of the generated labels against the gold labels.
# zero_division=0 is passed so that undefined scores are reported as 0.
report = classification_report(labels_list, predictions_list, zero_division=0)
print(report)

                precision    recall  f1-score   support

      business       0.00      0.00      0.00         0
         crime       0.00      0.00      0.00         1
     education       0.00      0.00      0.00         2
 entertainment       1.00      0.19      0.32        47
  food & drink       0.00      0.00      0.00         0
healthy living       0.00      0.00      0.00         1
         music       0.00      0.00      0.00         0
      politics       0.80      0.97      0.88        33
        sports       0.33      0.33      0.33         3
style & beauty       0.07      1.00      0.13         3
        travel       1.00      0.50      0.67         2
         women       0.00      0.00      0.00         9

      accuracy                           0.46       101
     macro avg       0.27      0.25      0.19       101
  weighted avg       0.76      0.46      0.46       101



In [None]:
#upload predicted outputs to drive
import json

output = 'dev_outputs.json'

# Load the gold-output file and reuse its structure for the predictions. The
# context manager closes the file even if parsing fails.
# Note: `outputs` is rebound here from the generation result to the parsed JSON.
with open(path+'vali/'+output, 'r') as f:
  outputs = json.load(f)

# Overwrite each entry's 'output' field with the prediction at the same position.
for i, predicted_label in enumerate(predictions_list):
  outputs['golds'][i]['output'] = predicted_label

json_object = json.dumps(outputs, indent=4)

with open(path+'vali/dev_preds.json', "w") as outfile:
    outfile.write(json_object)

In [None]:
!git clone https://github.com/LaMP-Benchmark/LaMP.git

fatal: destination path 'LaMP' already exists and is not an empty directory.


In [None]:
# Run the LaMP evaluation script for task "LaMP_2" on a gold file and a
# predictions file, writing the scores to --output_file.
# NOTE(review): the cell that writes predictions saves them to
# <path>vali/dev_preds.json, while this command reads
# ensemble/bm25_bert/dev_preds.json — confirm which files are intended.
# The three commented lines below are the argument values for the vali/ folder.
#--golds_json 'drive/MyDrive/646:IR/Lamp2U/vali/dev_outputs.json' \
 #--preds_json 'drive/MyDrive/646:IR/Lamp2U/vali/dev_preds.json' \
 #--output_file 'drive/MyDrive/646:IR/Lamp2U/vali/dev_eval.json'
!python LaMP/eval/eval_task.py \
    --golds_json 'drive/MyDrive/646:IR/Lamp2U/ensemble/bm25_bert/dev101.json' \
    --preds_json 'drive/MyDrive/646:IR/Lamp2U/ensemble/bm25_bert/dev_preds.json'\
    --task_name "LaMP_2" \
    --output_file 'drive/MyDrive/646:IR/Lamp2U/ensemble/bm25_bert/dev_eval.json'

2023-12-11 03:36:16.907675: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 03:36:16.907729: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 03:36:16.912530: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
