In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Location of the labelled e-mail CSV. Set the PHISHING_EMAIL_CSV environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'PHISHING_EMAIL_CSV',
    '/Users/sachinkarthikeya/Desktop/Projects/ARPA-LLM/Phishing_Email.csv',
)
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the freshly loaded frame.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [4]:
# Count missing values per column (isna is the canonical name of isnull).
dataset.isna().sum()

Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64

In [5]:
# Remove every row that has a missing value in any column, modifying the
# frame in place, then preview the first five remaining rows.
dataset.dropna(axis=0, how='any', inplace=True)
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [6]:
# (rows, columns) of the frame after rows with missing values were dropped.
dataset.shape

(18634, 3)

In [7]:
# Encode the target column as integers: 1 = phishing, 0 = safe.
label_codes = {'Phishing Email': 1, 'Safe Email': 0}

# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time on the already-encoded 0/1 column would
# wipe out all labels. Skip the mapping when the column is already encoded.
already_encoded = dataset['Email Type'].isin(list(label_codes.values())).all()
if not already_encoded:
    # Fail loudly on labels the mapping does not know instead of letting
    # them become NaN and break the integer cast further down.
    unexpected = set(dataset['Email Type'].unique()) - set(label_codes)
    if unexpected:
        raise ValueError(
            f'Unexpected Email Type labels: {sorted(map(str, unexpected))}'
        )
    dataset['Email Type'] = dataset['Email Type'].map(label_codes)

In [8]:
# Preview the first five rows to check that Email Type now holds the 0/1 codes.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,1,the other side of * galicismos * * galicismo *...,0
2,2,re : equistar deal tickets are you still avail...,0
3,3,\nHello I am your hot lil horny toy.\n I am...,1
4,4,software at incredibly low prices ( 86 % lower...,1


In [9]:
# Features are the raw e-mail text; the target is the encoded e-mail type.
X, y = dataset['Email Text'], dataset['Email Type']

In [10]:
# Print the feature Series (the Email Text column) as a quick sanity check.
print(X)

0        re : 6 . 1100 , disc : uniformitarianism , re ...
1        the other side of * galicismos * * galicismo *...
2        re : equistar deal tickets are you still avail...
3        \nHello I am your hot lil horny toy.\n    I am...
4        software at incredibly low prices ( 86 % lower...
                               ...                        
18645    date a lonely housewife always wanted to date ...
18646    request submitted : access request for anita ....
18647    re : important - prc mtg hi dorn & john , as y...
18648    press clippings - letter on californian utilit...
18649                                                empty
Name: Email Text, Length: 18634, dtype: object


In [11]:
# Print the target Series (the encoded Email Type column) as a quick sanity check.
print(y)

0        0
1        0
2        0
3        1
4        1
        ..
18645    1
18646    0
18647    0
18648    0
18649    1
Name: Email Type, Length: 18634, dtype: int64


In [12]:
# Hold out 20% of the e-mails for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# TF-IDF features over lower-cased text with English stop words removed.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit the vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the held-out texts.
X_train_transformed = feature_extraction.fit_transform(X_train)
X_test_transformed = feature_extraction.transform(X_test)

# Make sure both label vectors are integer typed before fitting and scoring.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [14]:
# A random forest draws bootstrap samples and random feature subsets, so an
# unseeded model gives different trees (and different accuracies) on every
# run. Fix the seed so Restart & Run All reproduces the same model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_transformed, y_train)

In [15]:
# In-sample check: score the fitted model on the data it was trained on.
training_prediction = model.predict(X_train_transformed)
training_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=training_prediction,
)
print(f'Training Accuracy: {training_accuracy}')

Training Accuracy: 0.9891326222579996


In [16]:
# Out-of-sample check: score the fitted model on the held-out e-mails.
testing_prediction = model.predict(X_test_transformed)
testing_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=testing_prediction,
)
print(f'Testing Accuracy: {testing_accuracy}')

Testing Accuracy: 0.9664609605580896


In [17]:
# Smoke test: classify one hand-written sample with the fitted vectorizer
# and model. The sample is wrapped in a list because transform() expects an
# iterable of documents.
input_mail = ["""URL: http://www.livejournal.com/talkread.bml?journal=jwz&itemid=63309
Date: Not suppliedhttp://www.livejournal.com/talkread.bml?journal=jwz&itemid=63309"""]

input_mail_transformed = feature_extraction.transform(input_mail)
prediction = model.predict(input_mail_transformed)

# Label 1 was assigned to phishing e-mails when the target was encoded.
print('This is a Phishing mail' if prediction[0] == 1 else 'This is a Safe mail')

This is a Safe mail


In [18]:
# Persist the classifier and the fitted vectorizer; both files are needed
# together to score new e-mails. The second call stays the last expression
# so the cell still displays the list of written file names.
joblib.dump(value=model, filename='phishing_email_model.pkl')
joblib.dump(value=feature_extraction, filename='email_tfidf_vectorizer.pkl')

['email_tfidf_vectorizer.pkl']