# **Fine- Tuning LLMs for Email Classification**





Installing the required dependencies

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:0

In [2]:

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments,TFAutoModelForSequenceClassification
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')


Importing the created dataset and extracting the required Columns



In [3]:
dataset = pd.read_csv('/content/drive/MyDrive/SmartSense_TA/smartSense_TA_UniEmailDataset - Sheet1.csv')

In [4]:
dataset = pd.DataFrame(dataset)
print(dataset.head())

   Email_ID   Sender_Name Sender_Type                    Subject  \
0         1      John Doe     Student       Request for Syllabus   
1         2    Jane Smith   Corporate   Internship Opportunities   
2         3       Dr. Liu  Researcher  Request for Research Data   
3         4  Mike Johnson   Corporate  Partnership Legal Matters   
4         5     Sarah Lee     Student     Course Material Access   

                                          Email_Body  \
0  "Can you please send me the syllabus for cours...   
1  "We are interested in offering internships to ...   
2  "I need access to shared research data on topi...   
3  "We need to discuss legal agreements regarding...   
4  "Where can I find the course material for clas...   

                           Category         Response_Action Sensitivity_Level  \
0                 Student Inquiries          Fetch syllabus               Low   
1               Corporate Inquiries         Escalate to HOD              High   
2  Academic

In [5]:
training_df = dataset[['Email_Body', 'Category']]
# print(training_df)

In [6]:
label_encoder = LabelEncoder() # Helps Convert categories into numerical categories,
training_df['label'] = label_encoder.fit_transform(training_df['Category'])

# Display the DataFrame with labels
# print(training_df)

In [7]:
# Step 3: Train-Test Split (80% train, 20% test)
train_df, test_df = train_test_split(training_df, test_size=0.2, stratify=training_df['label'], random_state=42)


Using OPT model since it is considered better for multi-class classifications

In [8]:
hf_train_dataset = Dataset.from_pandas(train_df[['Email_Body', 'label']])
hf_test_dataset = Dataset.from_pandas(test_df[['Email_Body', 'label']])

# Step 5: Load tokenizer and model
model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocess dataset - tokenization
def preprocess_function(examples):
    return tokenizer(examples["Email_Body"], padding="max_length", truncation=True)

# Tokenize the datasets
tokenized_train_dataset = hf_train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = hf_test_dataset.map(preprocess_function, batched=True)

# Step 6: Load pre-trained model (OPT-350M for classification) - Number of labels = 3
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Step 7: Define a compute_metrics function to calculate accuracy
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Step 8: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Step 9: Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,  # Use test set for evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Add the accuracy metric
)

# Step 10: Train the model
trainer.train()

# Step 11: Save the model
model.save_pretrained("/content/drive/MyDrive/SmartSense_TA/opt_email_classifier")
tokenizer.save_pretrained("/content/drive/MyDrive/SmartSense_TA/opt_email_classifier")

# Step 12: Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)  # Print accuracy and other metrics

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/24 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9418,0.235731,0.916667,0.933333,0.916667,0.915344
2,0.1353,0.031685,1.0,1.0,1.0,1.0


{'eval_loss': 0.031685035675764084, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 4.5883, 'eval_samples_per_second': 5.231, 'eval_steps_per_second': 0.654, 'epoch': 2.0}


In [9]:
# Step 1: Load the fine-tuned model and tokenizer
model_name ="/content/drive/MyDrive/SmartSense_TA/opt_email_classifier"  # Path to the saved model directory
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Step 2: Set the model to evaluation mode
model.eval()

# Step 3: New email body to predict

def classify_email(new_email, model, tokenizer, label_encoder, max_length=128):
    """
    Classifies the given email into a category using the provided model and tokenizer.

    Parameters:
    - new_email (str): The email text to classify.
    - model: The trained model used for classification.
    - tokenizer: The tokenizer corresponding to the model.
    - label_encoder: The label encoder for converting labels back to categories.
    - max_length (int): The maximum length for tokenization.

    Returns:
    - str: The predicted category for the email.
    """

    # Step 4: Preprocess the new email
    inputs = tokenizer(new_email, padding="max_length", truncation=True, return_tensors="pt", max_length=max_length)

    # Step 5: Move inputs to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Step 6: Get the prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Step 7: Get the predicted label
    predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

    # Step 8: Convert predicted label back to category
    predicted_category = label_encoder.inverse_transform(predictions)[0]

    return predicted_category



In [13]:
# Example usage
new_email = "What is the syllabus for course X?"
predicted_category = classify_email(new_email, model, tokenizer, label_encoder)
print(f"The predicted category for the email is: {predicted_category}")

The predicted category for the email is: Student Inquiries
