# Importing libraries, loading and transforming data

In [None]:
!pip install -q mlflow nlp

[0m

In [None]:
#imports
import pandas as pd
import gc
import re
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, \
pipeline, Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from nlp import Dataset
from imblearn.over_sampling import RandomOverSampler
import datasets
from transformers import pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()



In [None]:
# set parameters
train_fraction = 0.8 # fraction of a dataset used for training (the rest used for validation)
num_train_epochs = 3 # epochs to train
batch_size = 16 # batch size for training and validation
warmup_steps = 50
weight_decay = 0.02
BERT_MODEL = "distilbert-base-cased"
output_dir = "./phishing-email-detection"

In [None]:
df = pd.read_csv("/kaggle/input/phishingemails/Phishing_Email.csv").drop(['Unnamed: 0'], axis=1).drop_duplicates()
print(df.shape)
df.sample(5).T

(17539, 2)


Unnamed: 0,12206,2702,11296,564,11429
Email Text,On Monday 22 July 2002 07:00 CET John Rudd wro...,**Dublin**: something from the archives. Daev ...,fw : cms rfp response fyi - - - - - - original...,Joseph S. Barrera III:\n>I just use the free/a...,"re : cp & l daren : when you get a chance , st..."
Email Type,Safe Email,Safe Email,Safe Email,Safe Email,Safe Email


In [None]:
# create target
df['label'] = (df['Email Type']=="Phishing Email").astype(int)
df['title'] = df['Email Text']
df = df[['title', 'label']]

print(df.shape)
df.sample(20)

(17539, 2)


Unnamed: 0,title,label
14439,"I've been testing Razor, invoked from sendmail...",0
4179,only if you are serious . . . will i help you ...,1
18158,"fw : can you check a deal for me bill , this e...",0
11429,"re : cp & l daren : when you get a chance , st...",0
12167,What I understood was that the activists on th...,0
13000,"re : xbtkvi , the ala admitted banned cd gover...",1
9157,How do I install / add an additional service p...,0
138,re [ 10 ] : i trust you at six male paranormal...,1
15245,macromedia dreamweaver mx 2004 plus templates ...,1
12336,largest collection of dowlnoadable porn d \ / ...,1


In [None]:
# drop null records
df.dropna(inplace=True)

In [None]:
df['label'].mean()

0.37393089291823467

In [None]:
# random oversampling of minority class - not needed because of adding weights to minority class
y = df[['label']]
df = df.drop(['label'], axis=1)
ros = RandomOverSampler(random_state=83)
df, y_resampled = ros.fit_resample(df, y)
del y
df['label'] = y_resampled
print(df.shape)
del y_resampled
gc.collect()

(21960, 2)


60

In [None]:
medium_dataset = Dataset.from_pandas(df)

In [None]:
del df
gc.collect()

0

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, use_fast=True, low_cpu_mem_usage=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["title"], truncation=True)

medium_dataset = medium_dataset.map(preprocess_function, batched=True)

  0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
# Dataset has a built in train test split method
medium_dataset = medium_dataset.train_test_split(test_size=1-train_fraction)

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
medium_dataset

{'train': Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 17568),
 'test': Dataset(features: {'title': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 4392)}

In [None]:
medium_dataset.remove_column_("title")  # remove the text column because we don't need to keep it in memory anymore
# this is not required but speeds things up a bit

In [None]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the
#  length of the longest element in the batch, making them all the same length.
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
tokenizer.decode(medium_dataset['train'][0]['input_ids'])

"[CLS] <! - - a { text - decoration : none } - - > V. I. P Animal lovers club invite new members! No shit! Only REAL ANIMAL porn! Our super active members send home video and photos every day! Don't miss this offer! CLICK to JOIN US! [ remove my email from mail list ] http : / / xent. com / mailman / listinfo / fork [SEP]"

# Loading and training model

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False # Whether the model returns all hidden-states.
)

model.config.id2label = {0: 'SAVE EMAIL', 1: 'PHISHING EMAIL'}

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bia

In [None]:
# number of trainable parameters
print(model.num_parameters(only_trainable=True)/1e6)

65.783042


In [None]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir='./logs',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    logging_first_step=True,
    load_best_model_at_end=True,
    logging_steps=1,
    evaluation_strategy='epoch',
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    eval_steps=1,
    save_strategy='epoch',
    report_to="mlflow",  # log to mlflow
)

# Define the trainer:
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=medium_dataset['train'],
    eval_dataset=medium_dataset['test'],
    data_collator=data_collator
)

In [None]:
# Get initial metrics
trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.694774866104126,
 'eval_accuracy': 0.4822404371584699,
 'eval_runtime': 42.5773,
 'eval_samples_per_second': 103.154,
 'eval_steps_per_second': 6.459}

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.002,0.04659,0.988843
2,0.0002,0.037073,0.993625
3,0.0001,0.038644,0.992714


TrainOutput(global_step=3294, training_loss=0.04472334078184669, metrics={'train_runtime': 1556.455, 'train_samples_per_second': 33.862, 'train_steps_per_second': 2.116, 'total_flos': 6976623228559680.0, 'train_loss': 0.04472334078184669, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.03707250580191612,
 'eval_accuracy': 0.9936247723132969,
 'eval_runtime': 41.3827,
 'eval_samples_per_second': 106.131,
 'eval_steps_per_second': 6.645,
 'epoch': 3.0}

# Saving the model and checking its performance with a sample title

In [None]:
trainer.save_model()

In [None]:
tokenizer.save_vocabulary(save_directory=output_dir)

('./phishing-email-detection/vocab.txt',)

In [None]:
# make a classification pipeline
pipe = pipeline("text-classification", output_dir, tokenizer=BERT_MODEL)
sample_title = '''Why do employees leave companies — analysis of IBM employee data'''
pipe(sample_title, top_k=None)

[{'label': 'SAVE EMAIL', 'score': 0.9475719332695007},
 {'label': 'PHISHING EMAIL', 'score': 0.05242803692817688}]

In [None]:
# use the upper cased version
sample_title2 = sample_title.upper()
pipe(sample_title2, top_k=None)

[{'label': 'PHISHING EMAIL', 'score': 0.9951817393302917},
 {'label': 'SAVE EMAIL', 'score': 0.00481817964464426}]

In [None]:
# more refined version
sample_title3 = '''Why do employees leave companies?'''
pipe(sample_title3, top_k=None)

[{'label': 'SAVE EMAIL', 'score': 0.606553852558136},
 {'label': 'PHISHING EMAIL', 'score': 0.3934462070465088}]

In [None]:
# finally, save the model to Huggingface
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import create_repo
create_repo("dima806/phishing-email-detection", repo_type="model")

RepoUrl('https://huggingface.co/dima806/phishing-email-detection', endpoint='https://huggingface.co', repo_type='model', repo_id='dima806/phishing-email-detection')

In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_folder(
    folder_path=output_dir,
    path_in_repo = ".",
    repo_id="dima806/phishing-email-detection",
    repo_type="model"
)

pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/627 [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/526M [00:00<?, ?B/s]

Upload 17 LFS files:   0%|          | 0/17 [00:00<?, ?it/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/526M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/627 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/526M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/627 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

'https://huggingface.co/dima806/phishing-email-detection/tree/main/.'