In [1]:
!pip install transformers
!pip install datasets
!pip install scikit-learn


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [2]:
import pandas as pd

data = {
    'Email Text': [
        "Hello, I want to know more about the features of Product A. Can you send me details?",
        "I need pricing information for Product B. Please share a quote.",
        "Can you provide specifications for Product C? I am interested in its capabilities.",
        "Does Product A come with a warranty? Please let me know.",
        "I'm considering buying Product B in bulk. Do you offer discounts?",
        "What are the delivery options for Product C?",
        "Can you compare Product A and B for me? I'm trying to decide which one to buy.",
        "I want to integrate Product C into our system. Does it support API access?"
    ],
    'Category': [
        "Product A",
        "Product B",
        "Product C",
        "Product A",
        "Product B",
        "Product C",
        "Product A",
        "Product C"
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,Email Text,Category
0,"Hello, I want to know more about the features ...",Product A
1,I need pricing information for Product B. Plea...,Product B
2,Can you provide specifications for Product C? ...,Product C
3,Does Product A come with a warranty? Please le...,Product A
4,I'm considering buying Product B in bulk. Do y...,Product B
5,What are the delivery options for Product C?,Product C
6,Can you compare Product A and B for me? I'm tr...,Product A
7,I want to integrate Product C into our system....,Product C


In [3]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

#encoding
le = LabelEncoder()
df['label'] = le.fit_transform(df['Category'])

#tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
import torch
from torch.utils.data import Dataset

class EmailDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.texts[idx], padding='max_length', truncation=True,
                                max_length=self.max_len, return_tensors="pt")
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(self.labels[idx])
        }

dataset = EmailDataset(df['Email Text'].tolist(), df['label'].tolist(), tokenizer)


In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,1.0593


TrainOutput(global_step=10, training_loss=1.0592543601989746, metrics={'train_runtime': 91.8629, 'train_samples_per_second': 0.435, 'train_steps_per_second': 0.109, 'total_flos': 2631134177280.0, 'train_loss': 1.0592543601989746, 'epoch': 5.0})

In [7]:
def classify_email(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return le.inverse_transform([pred])[0]

# Test
classify_email("Can you tell me about Product B's features and pricing?")


'Product A'