In [1]:
import numpy as np
import pandas as pd
import evaluate
import torch
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, Pipeline, AutoModel, AutoModelForSequenceClassification, AutoTokenizer, pipeline
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# Read the e-mail corpus and give the two columns the lower-case names
# ("text", "label") that the rest of the notebook refers to.
df = (
    pd.read_csv("fraud_email_dataset.csv")
    .rename(columns={"Text": "text", "Class": "label"})
)
df

Unnamed: 0,text,label
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0
...,...,...
11924,Travel well. I'll look forward to hearing your...,0
11925,"Dear friend, I wish to begin by way of introdu...",1
11926,Follow Up Flag: Follow upFlag Status: FlaggedM...,0
11927,sbwhoeop B6Saturday January 23 2010 4:09 PMRe:...,0


In [3]:
# Make every entry of the "text" column a real string before tokenizing.
# Missing values are replaced with an empty string first: casting them
# directly with str() would produce the literal text "nan", which would then
# be tokenized as if it were the body of an e-mail.
df["text"] = df["text"].fillna("").astype(str)

In [4]:
# Tokenizer for the "roberta-base" checkpoint; the classification model
# further down is loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [5]:
def preprocess_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: Value forwarded unchanged as the first argument of
            ``tokenizer``. In this notebook it is one e-mail body at a time
            (the function is applied through ``Series.map``).

    Returns:
        Whatever ``tokenizer`` returns when called with ``truncation=True``
        and ``padding=True``.
    """
    tokenizer_options = {"truncation": True, "padding": True}
    return tokenizer(examples, **tokenizer_options)

In [6]:
# y: the class label of every row.
# X: one tokenizer output per row, produced by running preprocess_function
#    on each e-mail body individually.
y = df["label"]
X = df["text"].map(preprocess_function)

In [7]:
# Hold out 25 % of the rows for evaluation.
#   * random_state pins the shuffle so a fresh "Restart & Run All" produces
#     the same partition (and therefore comparable metrics).
#   * stratify=y keeps the label proportions of the full data set in both
#     partitions.
train_encodings, test_encodings, train_labels, test_labels = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [8]:
def Text_dict(X):
    """Turn an iterable of per-example encodings into a dict of columns.

    Args:
        X: Iterable whose items support ``item["input_ids"]`` and
            ``item["attention_mask"]``. It is traversed exactly once.

    Returns:
        ``{"input_ids": [...], "attention_mask": [...]}`` where each list
        holds the corresponding entry of every item, in iteration order.
    """
    ids_column, mask_column = [], []
    for encoding in X:
        ids_column.append(encoding["input_ids"])
        mask_column.append(encoding["attention_mask"])
    return {"input_ids": ids_column, "attention_mask": mask_column}

In [9]:
def Labels_list(y):
    """Copy the items of ``y`` into a new plain list.

    Args:
        y: Any iterable of labels.

    Returns:
        A new ``list`` with the items of ``y`` in iteration order.
    """
    return [label for label in y]

In [10]:
# Convert each split from a collection of per-example encodings into a single
# dict of columns. The original names are re-bound to the converted objects.
# NOTE(review): because the names are re-bound, running this cell a second
# time feeds the dict back into Text_dict, which then iterates its string
# keys and fails; re-run the split cell first.
train_encodings = train_encodings_ = Text_dict(train_encodings)
test_encodings = test_encodings_ = Text_dict(test_encodings)

In [11]:
# Copy the label containers of both splits into plain Python lists so they
# can be indexed by position (0 .. len-1) from the dataset class.
# The original names are re-bound to the new lists.
train_labels = train_labels_ = Labels_list(train_labels)
test_labels = test_labels_ = Labels_list(test_labels)

In [12]:
class Fraud_EMail_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized e-mails with their class labels.

    Args:
        encodings: Mapping with an ``"input_ids"`` entry and, optionally, an
            ``"attention_mask"`` entry. Each entry holds one sequence per
            example and is indexed by position.
        labels: Position-indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict always contains ``"input_ids"`` and ``"labels"``. When the
        encodings carry an attention mask it is returned as well, so the mask
        computed at tokenization time is not silently discarded.
        """
        item = {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "labels": torch.tensor(self.labels[idx]),
        }
        if "attention_mask" in self.encodings:
            item["attention_mask"] = torch.tensor(self.encodings["attention_mask"][idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Wrap each split (column dict + label list) in the dataset class so the
# Trainer can index individual examples.
train = Fraud_EMail_Dataset(train_encodings, train_labels)
test = Fraud_EMail_Dataset(test_encodings, test_labels)

In [13]:
# Batch collator built around the notebook's tokenizer; it is handed to the
# Trainer below as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics
# calls its .compute() method.
accuracy = evaluate.load("accuracy")

In [15]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds
            one row of per-class scores per example; ``labels`` holds the
            reference class ids.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of every row
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [16]:
# Human-readable names for the two class ids stored in the model config.
# In the data set preview, rows labelled 1 are scam-style solicitations and
# rows labelled 0 are ordinary correspondence, so id 1 must be "FRAUD" and
# id 0 "GENUINE". The previous mapping had the two names swapped, which made
# the saved model report the opposite class name for every prediction.
id2label = {0: "GENUINE", 1: "FRAUD"}
# Derived from id2label so the two mappings cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

In [17]:
# Sequence-classification model on top of the "roberta-base" checkpoint with
# a two-class head; the id/name mappings are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Run a garbage-collection pass and ask torch to empty its CUDA cache before
# training starts.
import gc
gc.collect()
torch.cuda.empty_cache()

In [19]:
# Training configuration. Checkpoints and the final model go under
# "email_fraud_detector"; evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="email_fraud_detector",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    # Batching
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    # Evaluation / checkpoint schedule
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Keep everything local
    push_to_hub=False,
)

# NOTE(review): `test` serves as eval_dataset here, so the same held-out
# split is used both for picking the best checkpoint and for the reported
# accuracy; consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/3580 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.0847, 'learning_rate': 1.7206703910614527e-05, 'epoch': 0.28}
{'loss': 0.0565, 'learning_rate': 1.4413407821229052e-05, 'epoch': 0.56}
{'loss': 0.0352, 'learning_rate': 1.1620111731843577e-05, 'epoch': 0.84}


  0%|          | 0/597 [00:00<?, ?it/s]

{'eval_loss': 0.013247514143586159, 'eval_accuracy': 0.9976533690915186, 'eval_runtime': 51.652, 'eval_samples_per_second': 57.752, 'eval_steps_per_second': 11.558, 'epoch': 1.0}
{'loss': 0.0188, 'learning_rate': 8.826815642458101e-06, 'epoch': 1.12}
{'loss': 0.0012, 'learning_rate': 6.033519553072626e-06, 'epoch': 1.4}
{'loss': 0.0005, 'learning_rate': 3.240223463687151e-06, 'epoch': 1.68}
{'loss': 0.0021, 'learning_rate': 4.46927374301676e-07, 'epoch': 1.96}


  0%|          | 0/597 [00:00<?, ?it/s]

{'eval_loss': 0.012243790552020073, 'eval_accuracy': 0.9986590680522963, 'eval_runtime': 51.9258, 'eval_samples_per_second': 57.447, 'eval_steps_per_second': 11.497, 'epoch': 2.0}
{'train_runtime': 1188.7244, 'train_samples_per_second': 15.051, 'train_steps_per_second': 3.012, 'train_loss': 0.027849117291872727, 'epoch': 2.0}


TrainOutput(global_step=3580, training_loss=0.027849117291872727, metrics={'train_runtime': 1188.7244, 'train_samples_per_second': 15.051, 'train_steps_per_second': 3.012, 'train_loss': 0.027849117291872727, 'epoch': 2.0})

In [20]:
# Persist the trained model through the Trainer. The next cell loads it back
# from "./email_fraud_detector/", the output_dir configured in
# TrainingArguments.
trainer.save_model()

In [26]:
# Build an inference pipeline from the saved model directory and serialise
# the whole pipeline object to a single file with joblib.
# NOTE(review): the .pkl is a pickle; only load it from trusted sources, and
# presumably with the same library versions that wrote it (verify before
# shipping it to another environment).
classifier = pipeline(task="text-classification", model="./email_fraud_detector/")
joblib.dump(classifier, "Email_Fraud_Detector_Model.pkl")

['Email_Fraud_Detector_Model.pkl']