In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does not
# depend on IPython automagic resolving a bare `pip`.
%pip install -U accelerate

In [None]:
# Explicit %pip magic; the extras spec is quoted so the shell cannot treat the
# square brackets as a glob pattern.
%pip install "transformers[torch]"

In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does not
# depend on IPython automagic resolving a bare `pip`.
%pip install evaluate

In [None]:
import inspect
import re

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from transformers import Trainer, TrainingArguments

In [None]:
# Raw phishing-email table; the path is the Colab upload location.
csv_path = '/content/Phishing_Email.csv'
dataset = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows to confirm the columns loaded as expected.
dataset.head(n=5)

In [None]:
# Missing-value count per column.
dataset.isna().sum()

In [None]:
def clean_text(text):
    """Normalise one email body before tokenisation.

    The value is converted to ``str``, lower-cased, and every run of
    non-word characters (regex ``\\W+``) is collapsed into a single space.

    Args:
        text: The raw email body. Non-string values are stringified.

    Returns:
        The normalised string. Missing values (``None`` or a float NaN, which
        is how pandas represents an empty CSV cell) yield ``''`` rather than
        the literal words ``'none'`` / ``'nan'``.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    lowered = str(text).lower()
    return re.sub(r'\W+', ' ', lowered)

In [None]:
# Rows without an email body carry no signal for the classifier; drop them instead of
# letting the missing value be stringified and trained on as if it were text.
# The chain builds a new frame, so re-running this cell gives the same result.
dataset = (
    dataset
    .dropna(subset=['Email Text'])
    .assign(Email_Text=lambda frame: frame['Email Text'].apply(clean_text))
    .reset_index(drop=True)
)

In [None]:
# Map each distinct 'Email Type' string to an integer id. The fitted encoder is kept
# so that predicted ids can be translated back to the original label strings later.
label_encoder = LabelEncoder().fit(dataset['Email Type'])
dataset['Email_Type'] = label_encoder.transform(dataset['Email Type'])

In [None]:
# 70 / 15 / 15 train / validation / test split.
# Both splits are stratified on the label so every subset keeps the class
# proportions of the full dataset; random_state fixes the shuffle for reproducibility.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    dataset['Email_Text'],
    dataset['Email_Type'],
    test_size=0.3,
    random_state=42,
    stratify=dataset['Email_Type'],
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

In [None]:
# Tokenizer for the uncased BERT checkpoint; clean_text already lower-cases the emails.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

In [None]:
# One shared set of tokenizer options for all three splits: truncate to at most 512
# tokens and pad each split to the length of its longest sequence.
encode_options = dict(truncation=True, padding=True, max_length=512)

tokenize_train_dataset = tokenizer(train_texts.tolist(), **encode_options)
tokenize_val_dataset = tokenizer(val_texts.tolist(), **encode_options)
tokenize_test_dataset = tokenizer(test_texts.tolist(), **encode_options)

In [None]:
class ConvertToEmailDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping of feature name to a per-example sequence; it is
            read through ``.items()`` and indexed by example position.
        labels: Sequence of labels, indexable by example position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and labels in a torch Dataset. Labels are converted to
# plain lists so they are indexed by position rather than by the pandas index.
train_dataset, val_dataset, test_dataset = (
    ConvertToEmailDataset(split_encodings, split_labels.tolist())
    for split_encodings, split_labels in (
        (tokenize_train_dataset, train_labels),
        (tokenize_val_dataset, val_labels),
        (tokenize_test_dataset, test_labels),
    )
)

In [None]:
# Size the classification head to the number of distinct encoded labels.
n_classes = dataset['Email_Type'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_classes)

In [None]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    Args:
        eval_pred: A pair that unpacks to ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

In [None]:
# Two keyword names differ between transformers releases: TrainingArguments'
# `evaluation_strategy` was renamed `eval_strategy`, and Trainer's `tokenizer` was
# renamed `processing_class`. The installs above are unpinned, so pick whichever
# spelling the installed version accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

args = TrainingArguments(
    output_dir='/content/results',
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/logs',
    logging_strategy="epoch",
    # Checkpoint once per epoch (matching evaluation) and keep only two on disk,
    # rather than accumulating a full checkpoint at the default step interval.
    save_strategy="epoch",
    save_total_limit=2,
    # After training, restore the epoch with the best validation accuracy so that the
    # later evaluate / save_model cells use it rather than simply the final epoch.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    **{_eval_strategy_key: "epoch"},
)

# NOTE: despite its name, `BertModel` is the Trainer instance, not a model class.
# The name is kept because the cells below refer to it.
BertModel = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# `BertModel` is the Trainer instance built from `model`, `args` and the train /
# validation datasets; this call runs the fine-tuning loop.
BertModel.train()

In [None]:
# Score the fine-tuned model on the validation split (the Trainer's eval_dataset).
results = BertModel.evaluate(eval_dataset=val_dataset)
print(results)

In [None]:
# Score the held-out test split. The "test" prefix makes the metric keys read
# test_loss / test_accuracy, so they cannot be mistaken for the validation
# metrics, which use the default "eval" prefix.
test_results = BertModel.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print(test_results)

In [None]:
# Persist the trainer's model under /content/BertModel so it can be reloaded later.
BertModel.save_model('/content/BertModel')

In [None]:
# Two sample emails used as a smoke test of the fine-tuned classifier. Both strings are
# already lower-case with punctuation removed, i.e. the form that clean_text produces.
new_texts = ['new book japanese linguistics japanese linguistics hamano shoko the george washington university the sound symbolic system of japanese isbn 1 57586 144 5 paper 1 57586 144 5 cloth csli publications 1998 http csli www stanford edu publications email pubs roslin stanford edu this book is the first theoretical study of sound symbolic expressions in japanese commonly known as mimetic words it identifies stringent linguistic constraints on these expressions and demonstrates that they form an intricate linguistic system rather than a collection of ad hoc expressions it then carefully identifies the sound symbolic meanings of sound units so as to make the elusive meaning of each sound symbolic expression fully comprehensible in addition this book describes a number of interesting facts about the history of the japanese language which mimetic words reveal csli publications ventura hall stanford university stanford ca 94305 4115 telephone 650 723 1839 fax 650 725 2166 http csli www stanford edu publications',
             'hot stock tip your broker won t share now that oi and gas has entered a long term bul market our speciaity in pinpointing the hottest companies of the few remaining undervaiued energy piays has produced soaring returns montana oi and gas inc mogi to expiore further opportunities in alberta canada a is an energy developer in canada s most highly coveted reservoirs with generating potentia of mi ions per week symbo mogi price 47 increased 11 last three day rating strongbuy how much it wiil up again the vaiue of mogi s shares wil skyrocket 1 price charts confirm oi prices are experiencing the strongest bul market in a generation 2 natural gas prices have tripled in the ast two years 3 with multiple projects in high gear and the expanding production on reserves worth muiti miilions mogi is seiling for ess than 1 4 the vaiue of its assets 4 montana oil and gas specializes in using new technoiogy to turn unproductive oil and gas deposits into profitable enterprises aiready shares in the oil and gas sector are rising faster than the overa market']

# Run the same normalisation that was applied to the training data before tokenising,
# so that raw emails added to new_texts are seen by the model in the form it was
# trained on. For text that is already normalised this is a no-op.
new_text_tokenize = tokenizer(
    [clean_text(raw_text) for raw_text in new_texts],
    truncation=True,
    padding=True,
    max_length=512,
)

class ConvertToPredictDataset(torch.utils.data.Dataset):
    """Label-free torch dataset over tokenizer encodings, for inference.

    Args:
        encodings: Mapping of feature name to a per-example sequence. It must
            contain an ``'input_ids'`` entry, which determines the length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding feature."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, i.e. the number of ``input_ids`` rows."""
        # Key lookup (not attribute access) so that a plain dict of encodings works
        # here exactly as it already does in __getitem__.
        return len(self.encodings['input_ids'])

new_dataset = ConvertToPredictDataset(new_text_tokenize)
predictions = BertModel.predict(new_dataset)

# Highest-scoring class id per email, mapped back to the original label strings.
predicted_class_ids = np.argmax(predictions.predictions, axis=-1)
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)
print(predicted_labels)