# Import libraries

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
import time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate 

from sklearn.model_selection import train_test_split

from datasets import Dataset

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report, f1_score


# Loading Data

In [2]:
# Extracting only the subject heading and email body. This avoids unnessary text in the email like the sender's email address, date of the email etc.'

def extract_email(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    try:
        root_div = soup.body.find("div")
        inner_divs = root_div.find_all("div", recursive=False)

        # Exact match to /html/body/div/div[1]/p
        subject_p = inner_divs[0].find("p", recursive=False)
        subject = subject_p.get_text(" ", strip=True) if subject_p else ""

        # Exact match to /html/body/div/div[3]
        body_div = inner_divs[2]
        body = body_div.get_text(" ", strip=True)

    except (AttributeError, IndexError):
        subject = ""
        body = ""

    email = f"{subject} {body}".strip()
    return email

In [3]:
# Training set
train_dir = r"C:\Users\Shivan\Downloads\Email classifier-20260122T132946Z-1-001\Email classifier\train"

rows = []

for file in os.listdir(train_dir):
    if file.endswith(".html"):
        path = os.path.join(train_dir, file)
        email_text = extract_email(path)

        rows.append({
            "filename": file,
            "email": email_text
        })

train_df = pd.DataFrame(rows)
train_df.head()


Unnamed: 0,filename,email
0,email_1.html,Account Transfer Request Account Transfer Dear...
1,email_10.html,Job Application Response Employment Dear Appli...
2,email_11.html,System Maintenance Notice System Notice Dear C...
3,email_12.html,Pet Insurance Claim Pet Insurance Dear Pet Ins...
4,email_13.html,Asset Allocation Review Asset Allocation Dear ...


In [4]:
train_df.shape

(44, 2)

In [5]:
# Test set
test_dir = r"C:\Users\Shivan\Downloads\Email classifier-20260122T132946Z-1-001\Email classifier\test"

rows = []

for file in os.listdir(test_dir):
    if file.endswith(".html"):
        path = os.path.join(test_dir, file)
        email_text = extract_email(path)

        rows.append({
            "filename": file,
            "email": email_text
        })

test_df = pd.DataFrame(rows)
test_df.head()


Unnamed: 0,filename,email
0,email_1.html,Financial Education Workshop Education Event D...
1,email_10.html,Portfolio Review Request Portfolio Review Hell...
2,email_11.html,Personal Loan Application Personal Loan Dear L...
3,email_12.html,Account Fee Inquiry Fee Inquiry Dear Customer ...
4,email_2.html,Investment Performance Report Performance Repo...


In [6]:
test_df.shape

(12, 2)

In [7]:
# Train labels
train_labels = pd.read_csv(r"C:\Users\Shivan\Downloads\Email classifier-20260122T132946Z-1-001\Email classifier\train_labels.csv")
train_labels.head()

Unnamed: 0,filename,true_category
0,email_1.html,Account Management
1,email_2.html,Insurance Claims
2,email_3.html,Account Management
3,email_4.html,Investment Advisory
4,email_5.html,Investment Advisory


In [8]:
# merging training set with category labels
train_df = pd.merge(train_df, train_labels, on="filename")
train_df.head()

Unnamed: 0,filename,email,true_category
0,email_1.html,Account Transfer Request Account Transfer Dear...,Account Management
1,email_10.html,Job Application Response Employment Dear Appli...,Other
2,email_11.html,System Maintenance Notice System Notice Dear C...,Other
3,email_12.html,Pet Insurance Claim Pet Insurance Dear Pet Ins...,Insurance Claims
4,email_13.html,Asset Allocation Review Asset Allocation Dear ...,Investment Advisory


In [9]:
train_df["true_category"].unique()

array(['Account Management', 'Other', 'Insurance Claims',
       'Investment Advisory', 'Loan Processing'], dtype=object)

In [10]:
# merging test set with category labels
test_df = pd.merge(test_df, train_labels, on="filename")
test_df.head()

Unnamed: 0,filename,email,true_category
0,email_1.html,Financial Education Workshop Education Event D...,Account Management
1,email_10.html,Portfolio Review Request Portfolio Review Hell...,Other
2,email_11.html,Personal Loan Application Personal Loan Dear L...,Other
3,email_12.html,Account Fee Inquiry Fee Inquiry Dear Customer ...,Insurance Claims
4,email_2.html,Investment Performance Report Performance Repo...,Insurance Claims


In [11]:
test_df["true_category"].unique()

array(['Account Management', 'Other', 'Insurance Claims',
       'Investment Advisory'], dtype=object)

Note: Test data doesn't have Investment Advisory examples in it. I will redo the train/test train_test_split

In [12]:
combined_df = pd.concat([train_df, test_df]) # combining train and test df
combined_df.shape 

(56, 3)

In [15]:
le = LabelEncoder()

combined_df["label_id"] = le.fit_transform(combined_df["true_category"])
combined_df.head()

Unnamed: 0,filename,email,true_category,label_id
0,email_1.html,Account Transfer Request Account Transfer Dear...,Account Management,0
1,email_10.html,Job Application Response Employment Dear Appli...,Other,4
2,email_11.html,System Maintenance Notice System Notice Dear C...,Other,4
3,email_12.html,Pet Insurance Claim Pet Insurance Dear Pet Ins...,Insurance Claims,1
4,email_13.html,Asset Allocation Review Asset Allocation Dear ...,Investment Advisory,2


In [107]:
combined_df.to_csv("Email_Data.csv")

## Train/Test Split

In [3]:
df= pd.read_csv(r"C:\Users\Shivan\Downloads\Email classifier-20260122T132946Z-1-001\Email_Data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,filename,email,true_category,label_id
0,0,email_1.html,Account Transfer Request Account Transfer Dear...,Account Management,0
1,1,email_10.html,Job Application Response Employment Dear Appli...,Other,4
2,2,email_11.html,System Maintenance Notice System Notice Dear C...,Other,4
3,3,email_12.html,Pet Insurance Claim Pet Insurance Dear Pet Ins...,Insurance Claims,1
4,4,email_13.html,Asset Allocation Review Asset Allocation Dear ...,Investment Advisory,2


In [6]:
df['true_category'].value_counts()

true_category
Account Management     18
Investment Advisory    15
Insurance Claims        9
Other                   8
Loan Processing         6
Name: count, dtype: int64

In [None]:
df.isna().sum() # no null values

Unnamed: 0       0
filename         0
email            0
true_category    0
label_id         0
dtype: int64

In [None]:
df.duplicated().any() # Check if there are any duplicate rows in the entire DataFrame


False

In [7]:
df = df.rename(columns={'label_id': 'labels'})

In [19]:
# df = df[['email', 'true_category', 'labels']]
df = df[['email', 'labels']]

train_dataset, test_dataset = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['labels']  # preserves class distribution
)

print(train_dataset.head())
print(test_dataset.head())


                                                email  labels
18  Regulatory Update Compliance Notice Dear Clien...       4
50  Account Freeze Request Account Freeze Dear Sec...       2
22  Account Statement Dispute Statement Dispute De...       0
53  Direct Deposit Setup Direct Deposit Hello, I w...       2
0   Account Transfer Request Account Transfer Dear...       0
                                                email  labels
40  Account Statement Request Statement Request De...       0
8   Insurance Claim Submission Insurance Claim To ...       1
17  Umbrella Insurance Claim Umbrella Insurance De...       1
20  Wire Transfer Request Wire Transfer Dear Wire ...       0
12  Monthly Newsletter Newsletter Dear Valued Clie...       4


# BERT

- https://www.youtube.com/watch?v=EAIil0wD-1A
- https://medium.com/@prabhatzade/freezing-layers-and-fine-tuning-transformer-models-in-pytorch-a-simple-guide-119cad0980c6

## Orignal Data

### Fitting model

In [20]:
# Load tokenizer
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [21]:
# converting to hugging face dataset
train_dataset = Dataset.from_pandas(train_dataset)
eval_dataset = Dataset.from_pandas(test_dataset)
 
# BERT can’t understand raw text — it only understands numbers. Tokenization is the step that converts human language into numerical inputs that the model can actually process.
def preprocess_function(examples):
    # Tokenize emails
    encodings = tokenizer(examples["email"], truncation=True, padding="max_length")
    # Add labels
    encodings["labels"] = examples["labels"]
    return encodings

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [None]:
train_dataset

Dataset({
    features: ['email', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 44
})

In [None]:
num_labels = 5 # 5 categories

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

for param in model.base_model.parameters():
    param.requires_grad = False

# unfreezing last 4 layers
for name, param in model.bert.named_parameters():
    if any(f"encoder.layer.{i}" in name for i in range(8, 12)) or "pooler" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Only the top four encoder layers and the pooler layer were unfrozen, allowing the model to adapt higher-level semantic representations to the email classification task while preserving general language knowledge learned during pretraining

In [90]:
# Load the evaluation metric
accuracy_metric = evaluate.load("accuracy")

# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [94]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch", # The model is evaluated at the end of each epoch
    save_strategy="epoch",  # A model checkpoint is saved at the end of each epoch. 
    logging_strategy="epoch",  # Training logs (loss, metrics) are written once per epoch.
    logging_steps=10,                   # Logs every 10 steps
    learning_rate=5e-5, # Controls how much the model updates its weights during training.
    per_device_train_batch_size=16, # Number of training samples processed at once on each device (CPU/GPU).
    per_device_eval_batch_size=16, # Same idea as training batch size, but for evaluation.
    num_train_epochs=3, # The dataset is passed through 3 times.
    weight_decay=0.01, # This is regularisation. It slightly penalises large weights to: reduce overfitting and improve generalisation
    load_best_model_at_end=True, # best checkpoint is automatically restored
    metric_for_best_model="accuracy", # Accuracy is used to decide which checkpoint is “best”
    report_to="none" # Disables external logging tools 
)


In [95]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [96]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.6912,1.576397,0.166667
2,1.5622,1.488038,0.333333
3,1.4605,1.457734,0.333333




TrainOutput(global_step=9, training_loss=1.5712886386447482, metrics={'train_runtime': 298.5252, 'train_samples_per_second': 0.442, 'train_steps_per_second': 0.03, 'total_flos': 34731594805248.0, 'train_loss': 1.5712886386447482, 'epoch': 3.0})

### Testing model performace

In [107]:
data = pd.read_csv(r"C:\Users\Shivan\Downloads\Email classifier-20260122T132946Z-1-001\Email_Data.csv")
data[["true_category", "label_id"]].drop_duplicates()

Unnamed: 0,true_category,label_id
0,Account Management,0
1,Other,4
3,Insurance Claims,1
4,Investment Advisory,2
16,Loan Processing,3


In [108]:
# maps label to category
label_map = dict(zip(data["label_id"].unique(), data["true_category"].unique()))

In [109]:
# import torch

# def classify(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
#     inputs={k:v.to(model.device) for k,v in inputs.items()}
#     with torch.no_grad():
#         outputs = model(**inputs)
#     predicted_class_id = outputs.logits.argmax(dim=-1).item()
#     # return label_map(predicted_class_id)
#     return (predicted_class_id)


import torch
import torch.nn.functional as F

def classify(text):
    # Tokenize input
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding=True, 
        max_length=256
    )
    # Move inputs to the model device
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Get logits
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # shape: [1, num_classes]

    # Convert logits to probabilities
    probs = F.softmax(logits, dim=-1)  # shape: [1, num_classes]

    # Get predicted class and confidence
    predicted_class_id = probs.argmax(dim=-1).item()
    confidence = probs[0, predicted_class_id].item()

    # Map to human-readable label
    label = label_map[predicted_class_id]

    return label, confidence



In [None]:
print(classify("Good day. I'm just following up on my query about my investment.")) 

2


In [None]:
print(classify("Good day. I've attached the documents required for my Loan approval.")) 

0


In [None]:
print(classify("Loan processing.")) 

4


In [None]:
bert_eval_dataset = test_dataset.copy()  

# Apply classifier
results = bert_eval_dataset["email"].apply(classify)  
bert_eval_dataset["predicted_category"] = results.apply(lambda x: x[0])
bert_eval_dataset["confidence_score"] = results.apply(lambda x: x[1])

bert_eval_dataset


Unnamed: 0.1,Unnamed: 0,filename,email,true_category,labels,predicted_category,confidence_score
40,40,email_6.html,Account Statement Request Statement Request De...,Account Management,0,Account Management,0.291639
8,8,email_17.html,Insurance Claim Submission Insurance Claim To ...,Insurance Claims,1,Account Management,0.252583
17,17,email_25.html,Umbrella Insurance Claim Umbrella Insurance De...,Insurance Claims,1,Account Management,0.27314
20,20,email_28.html,Wire Transfer Request Wire Transfer Dear Wire ...,Account Management,0,Account Management,0.321209
12,12,email_20.html,Monthly Newsletter Newsletter Dear Valued Clie...,Other,4,Account Management,0.275104
41,41,email_7.html,Portfolio Rebalancing Portfolio Rebalancing De...,Investment Advisory,2,Account Management,0.279142
49,5,email_3.html,Retirement Planning Consultation Retirement Pl...,Account Management,0,Account Management,0.270023
26,26,email_33.html,Risk Assessment Update Risk Assessment Dear In...,Investment Advisory,2,Account Management,0.272211
36,36,email_42.html,Credit Card Application Credit Card Dear Credi...,Loan Processing,3,Account Management,0.299224
1,1,email_10.html,Job Application Response Employment Dear Appli...,Other,4,Account Management,0.29181


In [114]:
labels = [
    "Account Management",
    "Insurance Claims",
    "Investment Advisory",
    "Loan Processing",
    "Other"
]

cm = confusion_matrix(
    bert_eval_dataset["true_category"],
    bert_eval_dataset["predicted_category"],
    labels=labels
)

cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df

Unnamed: 0,Account Management,Insurance Claims,Investment Advisory,Loan Processing,Other
Account Management,4,0,0,0,0
Insurance Claims,2,0,0,0,0
Investment Advisory,3,0,0,0,0
Loan Processing,1,0,0,0,0
Other,2,0,0,0,0


In [115]:
print(
    classification_report(
        bert_eval_dataset["true_category"],
        bert_eval_dataset["predicted_category"]
    )
)


                     precision    recall  f1-score   support

 Account Management       0.33      1.00      0.50         4
   Insurance Claims       0.00      0.00      0.00         2
Investment Advisory       0.00      0.00      0.00         3
    Loan Processing       0.00      0.00      0.00         1
              Other       0.00      0.00      0.00         2

           accuracy                           0.33        12
          macro avg       0.07      0.20      0.10        12
       weighted avg       0.11      0.33      0.17        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The model is heavily biased toward the majority or most dominant pattern in the training data and did not generalise well.

The original training dataset was small and imbalanced.

BERT requires sufficient and diverse data to learn class-specific language patterns. Hence synthetic data will be created

## Creating synthetic data

### Fitting model

In [14]:
import pandas as pd
import random
from openai import OpenAI
from dotenv import load_dotenv
import os
from sklearn.preprocessing import LabelEncoder


In [4]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if not api_key:
    print("No API key was found")
else:
    print("API key found")

API key found


In [5]:
client = OpenAI()


In [79]:
print(df['true_category'].value_counts())


true_category
Account Management     18
Investment Advisory    15
Insurance Claims        9
Other                   8
Loan Processing         6
Name: count, dtype: int64


In [None]:
# creating synthetic data of 200 emails per category (1000 emails in total)
from openai import OpenAI 

client = OpenAI() 

CATEGORIES = [
    "Account Management",
    "Investment Advisory",
    "Loan Processing",
    "Insurance Claims",
    "Other"
]

def generate_email_text(category):
    prompt = f"""
Generate a short, realistic email related to {category}.
Keep it 1-3 sentences.
Do NOT include the category name in the text.
Make it varied and natural, as if a client wrote it to a company.
"""
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=60
    )
    return response.choices[0].message.content.strip()


# Generate 1000 rows
rows = []
for category in CATEGORIES:
    for _ in range(200):  # 200 emails per category
        email_text = generate_email_text(category)
        rows.append({"email": email_text, "category": category})
        time.sleep(0.3)   

df = pd.DataFrame(rows)
df.to_csv("synthetic_email_text_1000.csv", index=False)
print(df.head())


                                          email_text            category
0  Hi, I wanted to check if there’s an update on ...  Account Management
1  Hi, I wanted to check in on the status of my a...  Account Management
2  Hi, I wanted to check if there have been any u...  Account Management
3  Hi, I wanted to check if there are any updates...  Account Management
4  Hi, I wanted to check if there’s an update on ...  Account Management


In [13]:
df.shape

(1000, 2)

In [18]:
df = df.rename(columns={"email_text" : "email"})

In [21]:
df.to_csv("synthetic_email_text_1000.csv", index=False)


In [22]:
le = LabelEncoder()

df["label"] = le.fit_transform(df["category"])
df.head()

Unnamed: 0,email,category,label
0,"Hi, I wanted to check if there’s an update on ...",Account Management,0
1,"Hi, I wanted to check in on the status of my a...",Account Management,0
2,"Hi, I wanted to check if there have been any u...",Account Management,0
3,"Hi, I wanted to check if there are any updates...",Account Management,0
4,"Hi, I wanted to check if there’s an update on ...",Account Management,0


In [29]:
from sklearn.model_selection import train_test_split

df = df[['email', 'label']]

train_dataset, eval_dataset = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']  # preserves class distribution
)

print(train_dataset.head())
print(eval_dataset.head())


                                                 email  label
631  Hello, I wanted to check on the status of my c...      1
544  Hello, I wanted to check on the status of my l...      3
593  Hello, I wanted to check on the status of my l...      3
912  Hello, I wanted to check if you offer any disc...      4
861  Hello, I wanted to check if there are any upda...      4
                                                 email  label
367  Hello, I’m interested in reviewing my current ...      2
167  Hi, I wanted to check if there are any updates...      0
637  Hello, I wanted to check on the status of my c...      1
738  Dear Team, I wanted to check on the status of ...      1
587  Hello, I wanted to check on the status of my l...      3


In [26]:
# Load tokenizer
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [31]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_dataset)
eval_dataset = Dataset.from_pandas(eval_dataset)

# def preprocess_function(examples):
#     return tokenizer(examples["email"], truncation=True, padding="max_length")
def preprocess_function(examples):
    # Tokenize emails
    encodings = tokenizer(examples["email"], truncation=True, padding="max_length")
    # Add labels
    encodings["label"] = examples["label"]
    return encodings



train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Load model
num_labels = 5
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)


for param in model.base_model.parameters():
    param.requires_grad = False #768*4 params train 

# unfreezing last 4 layers
for name, param in model.bert.named_parameters():
    if any(f"encoder.layer.{i}" in name for i in range(8, 12)) or "pooler" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
import evaluate
import numpy as np

# Load the evaluation metric
accuracy_metric = evaluate.load("accuracy")

# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch", 
    logging_strategy="epoch",  
    logging_steps=10,                    
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)


In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0397,0.003232,1.0
2,0.0075,0.001469,1.0
3,0.0042,0.001212,1.0




TrainOutput(global_step=150, training_loss=0.017124083538850147, metrics={'train_runtime': 3399.2307, 'train_samples_per_second': 0.706, 'train_steps_per_second': 0.044, 'total_flos': 631483541913600.0, 'train_loss': 0.017124083538850147, 'epoch': 3.0})

In [23]:

df = pd.read_csv(r"C:\Users\Shivan\Downloads\synthetic_email_text_1000.csv")
df.head()
df[["category", "label"]].drop_duplicates()

Unnamed: 0,category,label
0,Account Management,0
200,Investment Advisory,2
400,Loan Processing,3
600,Insurance Claims,1
800,Other,4


In [24]:
label_map = dict(zip(df["label"].unique(), df["category"].unique()))

import torch
import torch.nn.functional as F

def classify(text):
    # Tokenize input
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding=True, 
        max_length=256
    )
    # Move inputs to the model device
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Get logits
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # shape: [1, num_classes]

    # Convert logits to probabilities
    probs = F.softmax(logits, dim=-1)  # shape: [1, num_classes]

    # Get predicted class and confidence
    predicted_class_id = probs.argmax(dim=-1).item()
    confidence = probs[0, predicted_class_id].item()

    # Map to human-readable label
    label = label_map[predicted_class_id]

    return label, confidence


In [68]:
print(classify("The stock market saw record gains today.")) 

('Investment Advisory', 0.9308914542198181)


In [69]:
print(classify("Good day. I'm just following up on my query about my investment.")) 

('Investment Advisory', 0.9912799000740051)


In [70]:
print(classify("Good day. I've attached the documents required for my Loan approval.")) 

('Loan Processing', 0.997502863407135)


In [71]:
print(classify("Loan processing.")) 

('Loan Processing', 0.9160423278808594)


In [72]:
print(classify("Good Day. The weather seems nice today")) 

('Other', 0.48585131764411926)


In [73]:
print(classify("Good Day. I wanted to look into investing into the stock market. Could we arrange a meeting, Kind Regards")) 

('Investment Advisory', 0.9940735697746277)


In [74]:
print(classify("Good Day. I would like to close my bank account. Kind Regards")) 

('Account Management', 0.9921267032623291)


In [75]:

print(classify("Good morning. I'd like to take a bond for a house.  Kind Regards")) 

('Loan Processing', 0.3889714479446411)


In [76]:
print(classify(
""" 
Hi there,

I’m following up on a few things and not really sure who handles what, so I hope this reaches the right team. I recently made some changes to my account and noticed that my monthly deductions look different, which might be linked to the investment portfolio we discussed earlier this year. At the same time, I’m still waiting for feedback on the financing option I applied for, as the repayment amount seems to affect my available balance.

I also submitted documentation last week after an incident involving my vehicle, and I was told it could impact my policy benefits or possibly my cash flow depending on how it’s processed. Since everything seems connected on the statement I received, I just want clarity on whether any of this is still pending or if further approval is needed on my side.

Please let me know if I should be speaking to one department or if this needs to be reviewed across multiple teams. Thanks in advance.
"""))

('Account Management', 0.9054928421974182)


In [28]:
# loading model form last checkpoint

from transformers import AutoModelForSequenceClassification

last_checkpoint = "./results\checkpoint-150"  

model = AutoModelForSequenceClassification.from_pretrained(
    last_checkpoint
)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(last_checkpoint)


  last_checkpoint = "./results\checkpoint-150"


### Testing model performace

In [20]:
test_dataset.shape

(12, 2)

In [None]:

bert_eval_dataset = test_dataset.copy()  

# Apply classifier
results = bert_eval_dataset["email"].apply(classify)
 
bert_eval_dataset["predicted_category"] = results.apply(lambda x: x[0])
bert_eval_dataset["confidence_score"] = results.apply(lambda x: x[1])

bert_eval_dataset


Unnamed: 0.1,Unnamed: 0,filename,email,true_category,labels,predicted_category,confidence_score
40,40,email_6.html,Account Statement Request Statement Request De...,Account Management,0,Account Management,0.994709
8,8,email_17.html,Insurance Claim Submission Insurance Claim To ...,Insurance Claims,1,Insurance Claims,0.99691
17,17,email_25.html,Umbrella Insurance Claim Umbrella Insurance De...,Insurance Claims,1,Insurance Claims,0.996233
20,20,email_28.html,Wire Transfer Request Wire Transfer Dear Wire ...,Account Management,0,Account Management,0.979418
12,12,email_20.html,Monthly Newsletter Newsletter Dear Valued Clie...,Other,4,Account Management,0.59272
41,41,email_7.html,Portfolio Rebalancing Portfolio Rebalancing De...,Investment Advisory,2,Investment Advisory,0.997597
49,5,email_3.html,Retirement Planning Consultation Retirement Pl...,Account Management,0,Investment Advisory,0.811022
26,26,email_33.html,Risk Assessment Update Risk Assessment Dear In...,Investment Advisory,2,Investment Advisory,0.994484
36,36,email_42.html,Credit Card Application Credit Card Dear Credi...,Loan Processing,3,Account Management,0.605143
1,1,email_10.html,Job Application Response Employment Dear Appli...,Other,4,Loan Processing,0.89595


In [39]:
labels = [
    "Account Management",
    "Insurance Claims",
    "Investment Advisory",
    "Loan Processing",
    "Other"
]

cm = confusion_matrix(
    bert_eval_dataset["true_category"],
    bert_eval_dataset["predicted_category"],
    labels=labels
)

cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df

Unnamed: 0,Account Management,Insurance Claims,Investment Advisory,Loan Processing,Other
Account Management,3,0,1,0,0
Insurance Claims,0,2,0,0,0
Investment Advisory,0,0,3,0,0
Loan Processing,1,0,0,0,0
Other,1,0,0,1,0


The confusion matrix shows strong performance on well-defined categories like Insurance Claims and Investment Advisory, with perfect classification. However, the model struggles with ambiguous categories such as ‘Other’ and shows confusion between Loan Processing and Account Management. The model relies heavily on learned patterns from structured data and tends to force uncertain emails into operational classes, leading to confident but incorrect prediction. This is one reason I explored LLM-based classifiers, as they showed better handling of ambiguous and multi-intent emails

- https://towardsdatascience.com/micro-macro-weighted-averages-of-f1-score-clearly-explained-b603420b292f/

In [50]:
print(
    classification_report(
        bert_eval_dataset["true_category"],
        bert_eval_dataset["predicted_category"]
    )
)


                     precision    recall  f1-score   support

 Account Management       0.60      0.75      0.67         4
   Insurance Claims       1.00      1.00      1.00         2
Investment Advisory       0.75      1.00      0.86         3
    Loan Processing       0.00      0.00      0.00         1
              Other       0.00      0.00      0.00         2

           accuracy                           0.67        12
          macro avg       0.47      0.55      0.50        12
       weighted avg       0.55      0.67      0.60        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# LLMS

## OpenAI - gpt-4.1 (Paid)

https://www.youtube.com/watch?v=THsGizLHrTs

### Fitting model 

In [52]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from math import exp


In [53]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if not api_key:
    print("No API key was found")
else:
    print("API key found")

API key found


In [59]:
client = OpenAI()

FULL_LABEL_MAP = {
    "ACCOUNT": "Account Management",
    "INSURANCE": "Insurance Claims",
    "INVESTMENT": "Investment Advisory",
    "LOAN": "Loan Processing",
    "OTHER": "Other"
}

# Note:
# Multi-token labels can introduce variability in the continuation of tokens (e.g. Loan Approval, Loan Approved, Loan Processing), each with different probabilities. A single-token label avoids this issue.
# Since the model only needs to predict one token, there is less uncertainty
# Log probabilities more reliable.
    
LABEL_MAP = {
    "ACCOUNT": 0,
    "INSURANCE": 1,
    "INVESTMENT": 2,
    "LOAN": 3,
    "OTHER": 4
}

def create_email_prompt(email_text):
    return f"""
You are a classification system. Classify this email into ONE of the categories below.
Respond with ONLY the category name (exactly one word, no punctuation, no quotes):

Categories: ACCOUNT, INSURANCE, INVESTMENT, LOAN, OTHER

Email:
\"\"\"{email_text}\"\"\"
"""


def classify_email(email_text):
    prompt = create_email_prompt(email_text)
    
    response = client.chat.completions.create(
        model="gpt-4.1-2025-04-14", # Specifying the date because later models dont allow access to logprobs
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=3,    
        logprobs=True,
        top_logprobs=5
    )
    
    # Combine tokens back into a single string
    tokens = response.choices[0].logprobs.content
    predicted_label = "".join([t.token for t in tokens]).strip()
    
    # Take logprob as confidence
    confidence = exp(tokens[0].logprob) 

    return predicted_label, confidence


In [None]:

email = "The stock market saw record gains today."
result = classify_email(email)
print(result)



('INVESTMENT', 0.9999545100305701)


In [30]:
print(classify_email("Good day. I'm just following up on my query about my investment."))

('INVESTMENT', 1.0)


In [31]:
print(classify_email("Good day. I've attached the documents required for my Loan approval."))

('LOAN', 1.0)


In [32]:
print(classify_email("Loan processing.")) 

('LOAN', 1.0)


The results are very accurate. The confidence it determined by the logprobs - log probability. The model predicts an output and assigns a probabilty to it. The model will choose the best output (the one that the model is most confident about, or in other words, the output with the the highest logprob)


### Testing model performace

In [None]:

gpt_eval_dataset = test_dataset.copy()  

# Apply classifier
results = gpt_eval_dataset["email"].apply(classify_email)
 
gpt_eval_dataset["predicted_category"] = results.apply(lambda x: x[0])
gpt_eval_dataset["confidence_score"] = results.apply(lambda x: x[1])

gpt_eval_dataset['predicted_category'] = gpt_eval_dataset['predicted_category'].map(FULL_LABEL_MAP)

gpt_eval_dataset


Unnamed: 0.1,Unnamed: 0,filename,email,true_category,labels,predicted_category,confidence_score
40,40,email_6.html,Account Statement Request Statement Request De...,Account Management,0,Account Management,1.0
8,8,email_17.html,Insurance Claim Submission Insurance Claim To ...,Insurance Claims,1,Insurance Claims,1.0
17,17,email_25.html,Umbrella Insurance Claim Umbrella Insurance De...,Insurance Claims,1,Insurance Claims,1.0
20,20,email_28.html,Wire Transfer Request Wire Transfer Dear Wire ...,Account Management,0,Other,0.939913
12,12,email_20.html,Monthly Newsletter Newsletter Dear Valued Clie...,Other,4,Other,0.999998
41,41,email_7.html,Portfolio Rebalancing Portfolio Rebalancing De...,Investment Advisory,2,Investment Advisory,1.0
49,5,email_3.html,Retirement Planning Consultation Retirement Pl...,Account Management,0,Investment Advisory,1.0
26,26,email_33.html,Risk Assessment Update Risk Assessment Dear In...,Investment Advisory,2,Investment Advisory,1.0
36,36,email_42.html,Credit Card Application Credit Card Dear Credi...,Loan Processing,3,Account Management,0.999569
1,1,email_10.html,Job Application Response Employment Dear Appli...,Other,4,Other,1.0


In [62]:
labels = [
    "Account Management",
    "Insurance Claims",
    "Investment Advisory",
    "Loan Processing",
    "Other"
]

cm = confusion_matrix(
    gpt_eval_dataset["true_category"],
    gpt_eval_dataset["predicted_category"],
    labels=labels
)

cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df

Unnamed: 0,Account Management,Insurance Claims,Investment Advisory,Loan Processing,Other
Account Management,2,0,1,0,1
Insurance Claims,0,2,0,0,0
Investment Advisory,0,0,3,0,0
Loan Processing,1,0,0,0,0
Other,0,0,0,0,2


In [63]:
print(
    classification_report(
        gpt_eval_dataset["true_category"],
        gpt_eval_dataset["predicted_category"]
    )
)


                     precision    recall  f1-score   support

 Account Management       0.67      0.50      0.57         4
   Insurance Claims       1.00      1.00      1.00         2
Investment Advisory       0.75      1.00      0.86         3
    Loan Processing       0.00      0.00      0.00         1
              Other       0.67      1.00      0.80         2

           accuracy                           0.75        12
          macro avg       0.62      0.70      0.65        12
       weighted avg       0.69      0.75      0.70        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The model performs strongly across most categories, with only a few misclassifications. 

It has good semantic understanding and good intent recognition.

Accuracy: 0.75 - The model predicts the correct category 75% of the time on this evaluation set.



Imagine the case where this email classifier application is being sold to client. Would you release the product and tell the client to top up their API cost in their Open AI account over time? Absolutely not. So then theres a few options:

1. We can bare the cost (it's not a lot)
2. Offer the app as a service and have them pay a monthly subscription fee to use the app. (We can pay for the OpenAI API usage with the motnhly fee)
3. Use an open source LLM. 

And that leads me to the next subsection. Using a free LLM will prevent us ,or the client, from topping up the API usage cost, whether we sell the app to them or have them pay a monthly subscription.


## Ollama - llama3.1:8b (free)

### Fitting model

In [None]:
import os
import requests
from dotenv import load_dotenv 
from IPython.display import Markdown, display
from openai import OpenAI 
import ollama
import pandas as pd
import requests

In [65]:
df = pd.read_csv(r"C:\Users\Shivan\Downloads\Email classifier-20260122T132946Z-1-001\Email_Data.csv") 
df.head()


Unnamed: 0.1,Unnamed: 0,filename,email,true_category,label_id
0,0,email_1.html,Account Transfer Request Account Transfer Dear...,Account Management,0
1,1,email_10.html,Job Application Response Employment Dear Appli...,Other,4
2,2,email_11.html,System Maintenance Notice System Notice Dear C...,Other,4
3,3,email_12.html,Pet Insurance Claim Pet Insurance Dear Pet Ins...,Insurance Claims,1
4,4,email_13.html,Asset Allocation Review Asset Allocation Dear ...,Investment Advisory,2


In [66]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if not api_key:
    print("No API key was found") 
else:
    print("API key found")

API key found


In [67]:
from openai import OpenAI
from math import exp

client = OpenAI()

In [68]:
requests.get("http://localhost:11434").content

b'Ollama is running'

In [76]:
LABELS = [
    "ACCOUNT",
    "INSURANCE",
    "INVESTMENT",
    "LOAN",
    "OTHER"
]

FULL_LABEL_MAP = {
    "ACCOUNT": "Account Management",
    "INSURANCE": "Insurance Claims",
    "INVESTMENT": "Investment Advisory",
    "LOAN": "Loan Processing",
    "OTHER": "Other"
} 

def create_email_prompt(email_text):
    return f"""
You are a classification system. Classify this email into ONE of the categories below.
Respond with ONLY the category name (exactly one word, no punctuation, no quotes):

Categories: ACCOUNT, INSURANCE, INVESTMENT, LOAN, OTHER

Email:
\"\"\"{email_text}\"\"\"
""" 

def classify_email(email_text, runs=7):
    prompt = create_email_prompt(email_text)

    payload = {
        "model": "llama3.1:8b",
        "stream": False,   
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": email_text}
        ],
        "options": {
            "temperature": 0,
            "num_predict": 3
        }
    }

    r = requests.post(
        "http://localhost:11434/api/chat",
        json=payload
    )

    data = r.json()
    label = data["message"]["content"].strip()

    if label not in LABELS:
        label = "OTHER"

    # Confindence Score
    votes = []
    
    for _ in range(runs):
        r = ollama.chat(
            model="llama3.1:8b",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": email_text}
            ],
            options={
                "temperature": 0.2, # allows the model to guess when unsure
                "top_p": 0.9
            }
        )
        votes.append(r["message"]["content"].strip())

    final_label = max(set(votes), key=votes.count)
    print(votes) # prints the classified category
    confidence = votes.count(final_label) / runs


    return FULL_LABEL_MAP[label], confidence


In [None]:
email = """
Hi, I want to check the balance on my account and update my personal details.
"""
label, confidence_score = classify_email(email)

print(label, confidence_score)


['ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT']
ACCOUNT MANAGEMENT 1.0


In [50]:
email = """
Good day. I'm just following up on my query about my investment.
"""

label, confidence_score = classify_email(email)

print(label, confidence_score)


['INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT']
INVESTMENT ADVISORY 1.0


In [51]:
email = """
Good day. I've attached the documents required for my Loan approval.
"""

label, confidence_score = classify_email(email)

print(label, confidence_score)


['LOAN', 'LOAN', 'LOAN', 'LOAN', 'LOAN', 'LOAN', 'LOAN']
LOAN PROCESSING 1.0


In [64]:
email = """ 
Hi there,

I’m following up on a few things and not really sure who handles what, so I hope this reaches the right team. I recently made some changes to my account and noticed that my monthly deductions look different, which might be linked to the investment portfolio we discussed earlier this year. At the same time, I’m still waiting for feedback on the financing option I applied for, as the repayment amount seems to affect my available balance.

I also submitted documentation last week after an incident involving my vehicle, and I was told it could impact my policy benefits or possibly my cash flow depending on how it’s processed. Since everything seems connected on the statement I received, I just want clarity on whether any of this is still pending or if further approval is needed on my side.

Please let me know if I should be speaking to one department or if this needs to be reviewed across multiple teams. Thanks in advance.
"""

label, confidence_score = classify_email(email)

print(label, confidence_score)

['ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT']
ACCOUNT MANAGEMENT 1.0


### Testing model performance

In [None]:
llama_eval_dataset = test_dataset.copy()  

# Apply classifier
results = llama_eval_dataset["email"].apply(classify_email)

llama_eval_dataset["predicted_category"] = results.apply(lambda x: x[0])
llama_eval_dataset["confidence_score"] = results.apply(lambda x: x[1])

llama_eval_dataset

['ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT']
['INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE']
['INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE', 'INSURANCE']
['ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT']
['OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'NEWSLETTER']
['INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT']
['INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT']
['INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT']
['ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT']
['OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER']
['ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT', 'ACCOUNT']
['INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'INVESTMENT', 'IN

Unnamed: 0.1,Unnamed: 0,filename,email,true_category,labels,predicted_category,confidence_score
40,40,email_6.html,Account Statement Request Statement Request De...,Account Management,0,Account Management,1.0
8,8,email_17.html,Insurance Claim Submission Insurance Claim To ...,Insurance Claims,1,Insurance Claims,1.0
17,17,email_25.html,Umbrella Insurance Claim Umbrella Insurance De...,Insurance Claims,1,Insurance Claims,1.0
20,20,email_28.html,Wire Transfer Request Wire Transfer Dear Wire ...,Account Management,0,Account Management,1.0
12,12,email_20.html,Monthly Newsletter Newsletter Dear Valued Clie...,Other,4,Other,0.857143
41,41,email_7.html,Portfolio Rebalancing Portfolio Rebalancing De...,Investment Advisory,2,Investment Advisory,1.0
49,5,email_3.html,Retirement Planning Consultation Retirement Pl...,Account Management,0,Investment Advisory,1.0
26,26,email_33.html,Risk Assessment Update Risk Assessment Dear In...,Investment Advisory,2,Investment Advisory,1.0
36,36,email_42.html,Credit Card Application Credit Card Dear Credi...,Loan Processing,3,Account Management,1.0
1,1,email_10.html,Job Application Response Employment Dear Appli...,Other,4,Other,1.0


In [85]:
labels = [
    "Account Management",
    "Insurance Claims",
    "Investment Advisory",
    "Loan Processing",
    "Other"
]

cm = confusion_matrix(
    llama_eval_dataset["true_category"],
    llama_eval_dataset["predicted_category"],
    labels=labels
)

cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df

Unnamed: 0,Account Management,Insurance Claims,Investment Advisory,Loan Processing,Other
Account Management,3,0,1,0,0
Insurance Claims,0,2,0,0,0
Investment Advisory,0,0,3,0,0
Loan Processing,1,0,0,0,0
Other,0,0,0,0,2


In [86]:
print(
    classification_report(
        llama_eval_dataset["true_category"],
        llama_eval_dataset["predicted_category"]
    )
)


                     precision    recall  f1-score   support

 Account Management       0.75      0.75      0.75         4
   Insurance Claims       1.00      1.00      1.00         2
Investment Advisory       0.75      1.00      0.86         3
    Loan Processing       0.00      0.00      0.00         1
              Other       1.00      1.00      1.00         2

           accuracy                           0.83        12
          macro avg       0.70      0.75      0.72        12
       weighted avg       0.77      0.83      0.80        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Conclusion

A BERT-based model was initially trained to classify emails into predefined categories. When trained on the original dataset, the model performed poorly on unseen data and did not generalise well due to the small and imbalanced nature of the dataset. To improve performance, synthetic data was generated, creating a balanced dataset with 200 emails per category. Retraining BERT on this data resulted in better accuracy; however, the model still struggled with certain categories.

Despite the improvements, BERT showed uneven performance across classes. The model also produced high-confidence incorrect predictions, particularly for less common or ambiguous email categories. This indicates that BERT remains sensitive to ambiguous data and does not generalise well to real-world email variation, especially when trained primarily on synthetic data.

To address these limitations, Large Language Models (LLMs) were evaluated. Two models were tested: OpenAI GPT-4.1 and Ollama LLaMA 3.1:8B. Both models performed better than BERT, especially when evaluated using macro F1-score, which was chosen because the evaluation dataset was imbalanced.

***Ollama (LLaMA 3.1:8B)***

- Achieved the highest macro F1-score

- Showed consistent performance across most categories

- Demonstrated strong understanding of email intent

- Free to use when self-hosted

***OpenAI GPT-4o***

- Performed well across categories

- Strong semantic understanding

- Requires paid API access

***BERT***

- Improved with synthetic data but still underperformed

- Lower macro F1-score shows difficulty handling minority classes

- Sensitive to clean, structured training data


***Final Conclusion***

Based on macro F1-score, LLaMA 3.1:8B was selected as the best-performing model. It provided more balanced and reliable classification across all email categories while remaining cost-effective. The results show that LLMs are well suited for email classification tasks with limited or imbalanced data, as they rely on pre-trained language understanding rather than large labelled datasets.

Although LLMs may still struggle with unclear or complex emails, BERT would face similar challenges while requiring much more training data to achieve comparable performance.