In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support





In [2]:
df = pd.read_csv("processed_4class_emails.csv")

df.head()


Unnamed: 0,text,new_label
0,"Go until jurong point, crazy.. Available only ...",normal
1,Ok lar... Joking wif u oni...,normal
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,normal
4,"Nah I don't think he goes to usf, he lives aro...",normal


In [3]:
label_mapping = {
    "spam": 0,
    "important": 1,
    "follow_up": 2,
    "normal": 3
}

df["label"] = df["new_label"].map(label_mapping)

df.head()


Unnamed: 0,text,new_label,label
0,"Go until jurong point, crazy.. Available only ...",normal,3
1,Ok lar... Joking wif u oni...,normal,3
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,0
3,U dun say so early hor... U c already then say...,normal,3
4,"Nah I don't think he goes to usf, he lives aro...",normal,3


In [4]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42
)


In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [6]:
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_labels
})

test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": test_labels
})


In [7]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }


In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)




In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


In [11]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2491,0.091186,0.978814,0.977585,0.977455,0.978814
2,0.097,0.061725,0.988983,0.988133,0.987329,0.988983
3,0.031,0.048379,0.992373,0.992346,0.992352,0.992373


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


TrainOutput(global_step=1770, training_loss=0.10833165012510483, metrics={'train_runtime': 411.0202, 'train_samples_per_second': 34.422, 'train_steps_per_second': 4.306, 'total_flos': 871217408625984.0, 'train_loss': 0.10833165012510483, 'epoch': 3.0})

In [12]:
trainer.evaluate()


{'eval_loss': 0.04837877303361893,
 'eval_accuracy': 0.9923728813559322,
 'eval_f1': 0.9923464110149156,
 'eval_precision': 0.9923515441067483,
 'eval_recall': 0.9923728813559322,
 'eval_runtime': 8.0251,
 'eval_samples_per_second': 147.039,
 'eval_steps_per_second': 18.442,
 'epoch': 3.0}

In [13]:
model.save_pretrained("transformer_classifier")
tokenizer.save_pretrained("transformer_classifier")


('transformer_classifier\\tokenizer_config.json',
 'transformer_classifier\\special_tokens_map.json',
 'transformer_classifier\\vocab.txt',
 'transformer_classifier\\added_tokens.json',
 'transformer_classifier\\tokenizer.json')