### Load and Prepare the Dataset

In [1]:
import pandas as pd

In [2]:
# Sample data - Replace this with your actual dataset
# Each entry pairs an email body with its category, so the two columns built
# below always have the same length and ordering.
_labelled_emails = [
    ("Hello, I want to know more about the features of Product A.", "Product A"),
    ("I need pricing information for Product B.", "Product B"),
    ("Can you provide specifications for Product C?", "Product C"),
    ("Does Product A come with a warranty? Please let me know.", "Product A"),
    ("I'm considering buying Product B in bulk. Do you offer discounts?", "Product B"),
    ("What are the delivery options for Product C?", "Product C"),
    (
        "Can you compare Product A and B for me? I'm trying to decide which one to buy.",
        "Product A / Product B",
    ),
    ("I want to integrate Product C into our system. Does it support API access?", "Product C"),
]

# Column-oriented view of the pairs above: one list per future DataFrame column.
data = {
    "Email Text": [email for email, _ in _labelled_emails],
    "Category": [category for _, category in _labelled_emails],
}

In [3]:
# Build the working DataFrame from the column-oriented `data` dictionary
df = pd.DataFrame(data=data)

# Preview the top five rows as this cell's rendered output
df.head(n=5)

Unnamed: 0,Email Text,Category
0,"Hello, I want to know more about the features ...",Product A
1,I need pricing information for Product B.,Product B
2,Can you provide specifications for Product C?,Product C
3,Does Product A come with a warranty? Please le...,Product A
4,I'm considering buying Product B in bulk. Do y...,Product B


### Text Tokenization & Label Encoding

In [4]:
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder

In [5]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize the email text
def tokenize_function(examples):
    """Tokenize the 'Email Text' entry of ``examples`` with the shared tokenizer.

    Args:
        examples: Mapping-like object whose ``'Email Text'`` entry is passed
            straight to the tokenizer (presumably a string or list of
            strings - confirm against the caller).

    Returns:
        Whatever ``tokenizer(...)`` returns, padded/truncated to 64 tokens -
        the same cap used by the other tokenizer calls in this notebook.
        Without an explicit ``max_length``, ``padding='max_length'`` pads to
        the model's maximum length instead.
    """
    return tokenizer(
        examples['Email Text'],
        padding='max_length',
        truncation=True,
        max_length=64,
    )

# Tokenize the text (convert to tokens)
# Stores one fixed-length (64) list of token ids per email in a new column.
df['tokens'] = df['Email Text'].apply(lambda x: tokenizer.encode(x, truncation=True, padding='max_length', max_length=64))

# Encode the categories/labels into numerical values
# The fitted encoder is kept so predictions can be mapped back to category names.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Category'])


### Model Development

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification  # No need to import AdamW from transformers anymore
from torch.optim import AdamW  # AdamW is now part of torch.optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.nn import CrossEntropyLoss  # Add this import
from torch.optim.lr_scheduler import ReduceLROnPlateau 

In [7]:
# Step 2.1: Hold out 20% of the emails for validation (fixed seed keeps the split repeatable)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Email Text'].tolist(),
    df['Label'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Step 2.2: Tokenize both splits with identical settings (use the tokenizer you have initialized)
_encode_kwargs = dict(truncation=True, padding=True, max_length=64)
train_encodings = tokenizer(train_texts, **_encode_kwargs)
val_encodings = tokenizer(val_texts, **_encode_kwargs)

In [8]:
# Step 2.3: Define a custom Dataset class for the tokenized data
class EmailDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` must expose ``.items()`` yielding ``(field, values)`` pairs
    where ``values[idx]`` is the data for sample ``idx``; ``labels`` is an
    indexable sequence whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence decides how many samples the dataset exposes.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the target under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The key is 'labels' (plural); the training code reads batch['labels'].
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [9]:
# Step 2.4: Create DataLoader for training and validation
train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Initialize the model
# Size the classification head from the fitted label encoder instead of a
# hard-coded count, so adding or removing categories in the dataset cannot
# leave the head with the wrong number of outputs.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Step 2.5: Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Step 2.6: Set up the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Step 2.7: Initialize the learning rate scheduler
# Lowers the learning rate once the monitored value (passed to
# scheduler.step) has stopped decreasing for more than `patience` epochs.
# `verbose` is deliberately not passed: it is deprecated in recent PyTorch
# releases. Read the current rate from optimizer.param_groups[0]['lr'].
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=2)

# Step 2.8: Move model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Model Training and Evaluation

In [11]:
# Step 3: Define your training and evaluation loop
def train_epoch(model, data_loader, optimizer, loss_fn, device):
    """Run one optimisation pass over ``data_loader`` and report its metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits`` (and ``.loss`` when ``labels=`` is
            also supplied).
        data_loader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar
            tensor. Pass ``None`` to fall back to the loss the model computes
            itself from ``labels``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, avg_loss)``: the fraction of correctly classified
        samples and the per-sample average loss. The average weights each
        batch by its size, which assumes the loss is a per-batch mean.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct_preds = 0
    total_preds = 0

    for batch in data_loader:
        optimizer.zero_grad()

        # Move batch to the correct device (GPU or CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass: honour the caller's loss function when one is given,
        # otherwise let the model compute its own loss from the labels.
        if loss_fn is None:
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
        logits = outputs.logits

        # Backward pass
        loss.backward()
        optimizer.step()

        # Track loss and accuracy; weight the loss by batch size so a short
        # final batch does not count as much as a full one.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        preds = torch.argmax(logits, dim=-1)
        correct_preds += (preds == labels).sum().item()
        total_preds += batch_size

    if total_preds == 0:
        raise ValueError("data_loader yielded no samples")

    avg_loss = loss_sum / total_preds
    accuracy = correct_preds / total_preds
    return accuracy, avg_loss

def eval_epoch(model, data_loader, loss_fn, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits`` (and ``.loss`` when ``labels=`` is
            also supplied).
        data_loader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar
            tensor. Pass ``None`` to fall back to the loss the model computes
            itself from ``labels``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, avg_loss)``: the fraction of correctly classified
        samples and the per-sample average loss. The average weights each
        batch by its size, which assumes the loss is a per-batch mean.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct_preds = 0
    total_preds = 0

    # Gradients are not needed for scoring, so skip tracking them.
    with torch.no_grad():
        for batch in data_loader:
            # Move batch to the correct device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass: honour the caller's loss function when one is
            # given, otherwise use the loss the model derives from the labels.
            if loss_fn is None:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            else:
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
            logits = outputs.logits

            # Track loss and accuracy; weight the loss by batch size so a
            # short final batch does not count as much as a full one.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            preds = torch.argmax(logits, dim=-1)
            correct_preds += (preds == labels).sum().item()
            total_preds += batch_size

    if total_preds == 0:
        raise ValueError("data_loader yielded no samples")

    avg_loss = loss_sum / total_preds
    accuracy = correct_preds / total_preds
    return accuracy, avg_loss

# Training loop: a fixed number of passes, each followed by a validation check
EPOCHS = 10
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")

    # Optimise on the training split and report its metrics
    train_accuracy, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )
    print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}")

    # Score the held-out split (no weight updates happen here)
    val_accuracy, val_loss = eval_epoch(
        model=model,
        data_loader=val_loader,
        loss_fn=loss_fn,
        device=device,
    )
    print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")

    # Feed the validation loss to the scheduler, which uses it to decide
    # whether to adjust the learning rate
    scheduler.step(val_loss)


Epoch 1/10
Train Loss: 1.3981361389160156, Train Accuracy: 0.3333333333333333
Validation Loss: 1.1119264364242554, Validation Accuracy: 0.5
Epoch 2/10
Train Loss: 1.2753655910491943, Train Accuracy: 0.3333333333333333
Validation Loss: 1.0803346633911133, Validation Accuracy: 0.5
Epoch 3/10
Train Loss: 1.1570532321929932, Train Accuracy: 0.6666666666666666
Validation Loss: 1.0844029188156128, Validation Accuracy: 0.5
Epoch 4/10
Train Loss: 1.0155190229415894, Train Accuracy: 0.8333333333333334
Validation Loss: 1.0556362867355347, Validation Accuracy: 0.5
Epoch 5/10
Train Loss: 0.8900400996208191, Train Accuracy: 0.8333333333333334
Validation Loss: 1.0285271406173706, Validation Accuracy: 0.5
Epoch 6/10
Train Loss: 0.9928539395332336, Train Accuracy: 0.6666666666666666
Validation Loss: 1.014777421951294, Validation Accuracy: 0.5
Epoch 7/10
Train Loss: 0.9684343338012695, Train Accuracy: 0.6666666666666666
Validation Loss: 1.0097520351409912, Validation Accuracy: 0.5
Epoch 8/10
Train Loss

### Predicting the Category of a New Email

In [16]:
# Function for making predictions on new email text
def predict(model, text, tokenizer, device):
    """Return the predicted class index for a single piece of text.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        text: Input passed to ``tokenizer``; the result must describe exactly
            one sample, because the prediction is unwrapped with ``.item()``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors when asked for ``return_tensors='pt'``.
        device: Device the encoded tensors are moved to.

    Returns:
        The index of the highest-scoring logit, as a plain Python number.
    """
    # Inference only: switch the model to evaluation mode.
    model.eval()

    # Encode with the same length cap (64) the training cells use.
    batch = tokenizer(text, truncation=True, padding=True, max_length=64, return_tensors='pt')

    # No gradients are needed to read off a prediction.
    with torch.no_grad():
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

    return logits.argmax(dim=-1).item()

# Example usage: read the email text to classify from the user
new_email = input("Enter the email text: ")

# Classify it with the 'model', 'tokenizer', and 'device' initialized in the earlier cells
category = predict(model=model, text=new_email, tokenizer=tokenizer, device=device)

# Map the integer class id back to its original category label
predicted_category = label_encoder.inverse_transform([category])
print(f"Predicted category: {predicted_category[0]}")


Enter the email text:  Can you provide specifications for Product A?


Predicted category: Product C


### Saving the model 

In [13]:
# Save the model
# Model weights/config and tokenizer files go to the same directory so they
# can be restored together.
MODEL_DIR = 'fine_tuned_bert'
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Load the model later for inference
# from_pretrained() loads onto the CPU by default, so move the reloaded model
# back to `device`; predict() sends its inputs to `device`, and leaving the
# model on the CPU would cause a device mismatch on a GPU machine.
model = BertForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
