<a href="https://colab.research.google.com/github/S-Delowar/LLM-Email-Subjector/blob/main/fine_tune_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets transformers evaluate huggingface_hub -q

In [3]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

import evaluate
from huggingface_hub import notebook_login

## Load the processed Data

In [4]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Load email_subect processed data
train_path = "/content/drive/MyDrive/email_subjectline/processed_data/email_subjectline_train.csv"
val_path = "/content/drive/MyDrive/email_subjectline/processed_data/email_subjectline_val.csv"
test_path = "/content/drive/MyDrive/email_subjectline/processed_data/email_subjectline_test.csv"

In [8]:
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

## Convert to HuggingFce datasets


In [9]:
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(train_df),
        "validation": Dataset.from_pandas(val_df),
        "test": Dataset.from_pandas(test_df),
    }
)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 12794
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 1734
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 1718
    })
})

In [12]:
# Sample
dataset["train"][10]

{'input': "Write a professional subject for this mail:\n\nWill,   Here is a list of the top items we need to work on to improve the position  and p&l reporting for the west desk.\nMy underlying goal is to create position managers and p&l reports that  represent all the risk held by the desk and estimate p&l with great accuracy.\nLet's try and schedule a meeting for this Wednesday to go over the items  above.\nPhillip\n",
 'output': 'Priority List'}

## Load LLM Model and Tokenizer

In [14]:
# Load model & tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

## Tokenization

In [15]:
# Function for Tokenization
def tokenize_function(example):

  model_inputs = tokenizer(example["input"], max_length=512, truncation=True)

  with tokenizer.as_target_tokenizer():
      labels = tokenizer(example["output"], max_length=32, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [None]:
# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["input", "output"])


In [17]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12794
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1734
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1718
    })
})

In [20]:
# Sample
tokenized_dataset["train"]["labels"][300]

[180, 2610, 8, 3195, 18, 22098, 3387, 19035, 24995, 1]

## Model Trainer

In [21]:
# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [22]:
!pip install rouge_score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import numpy as np
from evaluate import load

# Load ROUGE metric
rouge = load("rouge")

# Define compute metrics function
def compute_metrics(eval_pred):
  preds, labels = eval_pred

  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Compute ROUGE
  return rouge.compute(predictions=decoded_preds, references=decoded_labels)

### Initialize Trainer

In [None]:
# Huggingface logging
notebook_login()

In [29]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-email-subjectline",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=6,
    weight_decay=0.01,
    fp16=False,
    predict_with_generate=True,
    push_to_hub=True,
    hub_model_id="sdelowar2/flan-t5-email-subjectline",
    hub_strategy="checkpoint",
    report_to="none"
)

In [30]:
# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [28]:
# Train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.9923,3.692347,0.297746,0.150693,0.292687,0.29236
2,2.9033,3.677068,0.305713,0.15586,0.300848,0.300543
3,2.8519,3.682816,0.309912,0.159563,0.304376,0.304318


TrainOutput(global_step=4800, training_loss=2.931242955525716, metrics={'train_runtime': 1347.0868, 'train_samples_per_second': 28.493, 'train_steps_per_second': 3.563, 'total_flos': 5161217913176064.0, 'train_loss': 2.931242955525716, 'epoch': 3.0})

In [31]:
# Run the training for more 3 epochs
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
4,2.8339,3.663966,0.314346,0.161551,0.3093,0.309254
5,2.8067,3.657794,0.315205,0.162916,0.310012,0.30996
6,2.7319,3.666781,0.316504,0.163504,0.311467,0.311507


TrainOutput(global_step=9600, training_loss=1.3981473795572916, metrics={'train_runtime': 1401.4293, 'train_samples_per_second': 54.776, 'train_steps_per_second': 6.85, 'total_flos': 1.033028026042368e+16, 'train_loss': 1.3981473795572916, 'epoch': 6.0})

## Evaluation on Test Data

In [32]:
# Prediction on  test data
output = trainer.predict(tokenized_dataset["test"])

preds = output.predictions
labels = output.label_ids

preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Compute metrics on test data
test_metrics = rouge.compute(predictions=decoded_preds, references=decoded_labels)

In [33]:
print(test_metrics)

{'rouge1': np.float64(0.2967526156853725), 'rouge2': np.float64(0.15909880969804685), 'rougeL': np.float64(0.29210522986748555), 'rougeLsum': np.float64(0.2922569133859759)}


## Save test output

In [34]:
# Create dataframe for test data with actual and predicted subject
output_df = pd.DataFrame({
    "input": test_df["input"],
    "actual_subjectline": test_df["output"],
    "predicted_subjectline": decoded_preds
})

In [35]:
output_df.tail(5)

Unnamed: 0,input,actual_subjectline,predicted_subjectline
1713,Suggest a subject line for the email below:\n\...,insurance\n,Insurance
1714,What would be a good subject for this message?...,Reminder: Risk Management Simulation Questions\n,Swaps Question
1715,Generate a concise subject line for this email...,Non-Exempt Scorecard\n,Non-Exempt Employee Evaluation
1716,What would be a good subject for this message?...,Weekly Reports are Due\n,Weekly Update for Philippe and Greg
1717,What would be a good subject for this message?...,Power Pool\n,Power Pool


In [38]:
print(f"""Sample Input:\n=========\n{output_df["input"][30]}""")
print(f"""Actual Subject-Line:\n=========\n{output_df["actual_subjectline"][30]}""")
print(f"""Predicted Subject-Line:\n=========\n{output_df["predicted_subjectline"][30]}""")

Sample Input:
Suggest a subject line for the email below:

As required by the Houston Fire Department, a fire drill has been scheduled for the Enron Center Campus.
Enron Center North, 1400 Smith St., approximately  3:15 PM on Thursday, December 20th, 2001  	Enron Center South, 1500 Louisiana St., approximately 3:45 PM on Thursday, December 20th, 2001  Please advise all clients, contractors, and visitors that this will be a fire drill only.
The fire alarm will sound at 3:15 PM in Enron Center North and 3:45 PM in Enron Center South.
You will be asked to go to the stairwell and standby.
Do not go into the stairwell.
Further instructions will be given over the public address system.
If you experience any difficulties in hearing either the fire alarm or any announcements over the public address system, please notify the Facilities Help Desk by e-mail at facilitieshelpdesk1@enron.com.
Anyone that is mobility impaired or medically disabled may be excused from participating in this drill by s

In [39]:
# Define test output path
test_output_path = "/content/drive/MyDrive/email_subjectline/test_output"
os.makedirs(test_output_path, exist_ok=True)

# Save test output to csv
output_df.to_csv(os.path.join (test_output_path, "test_output.csv"), index=False)

## Push the Model to Hugging Face Hub

In [40]:
trainer.push_to_hub(commit_message="Pushed model after 6 epochs")

CommitInfo(commit_url='https://huggingface.co/sdelowar2/flan-t5-email-subjectline/commit/56e329f9ae55c90e1aa96674ce1d7b0a3c31c6e4', commit_message='Pushed model after 6 epochs', commit_description='', oid='56e329f9ae55c90e1aa96674ce1d7b0a3c31c6e4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sdelowar2/flan-t5-email-subjectline', endpoint='https://huggingface.co', repo_type='model', repo_id='sdelowar2/flan-t5-email-subjectline'), pr_revision=None, pr_num=None)