In [116]:
import pandas as pd
import string
import nltk
import os
import pickle
import re
from nltk.tokenize import RegexpTokenizer
import sklearn.feature_extraction.text as skft
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as skmetrics
import sklearn.pipeline as skpipe
import sklearn.decomposition as skd
import sklearn.naive_bayes as sknb
from nltk.corpus import stopwords
import wordcloud
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score


Load the dataset and create separate DataFrames for phishing and non-phishing emails.

In [132]:
def create_phish_df(my_dir):
    """Load every readable email file of a phishing folder into a DataFrame.

    Parameters
    ----------
    my_dir : str
        Folder holding one email per file. It is joined onto ``'phish'`` with
        ``os.path.join``, so a relative name is resolved under ``phish/``
        while an absolute path is used as-is (``os.path.join`` discards the
        components that precede an absolute one).

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``title`` and ``content``; ``label`` is always 1
        (phish), ``title`` is the file name and ``content`` the file text.

    Raises
    ------
    OSError
        If the folder itself cannot be listed.
    """
    folder = os.path.join('phish', my_dir)
    titles = []
    contents = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible across runs and platforms.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            continue  # sub-directories cannot be read as emails
        try:
            with open(path, 'r') as reader:
                text = reader.read()
        except (UnicodeDecodeError, OSError):
            # Best effort: skip emails that cannot be opened or decoded.
            continue
        titles.append(name)
        contents.append(text)

    return pd.DataFrame({'title': titles, 'content': contents, 'label': 1},
                        columns=['label', 'title', 'content'])
   

In [133]:
# Folders that hold the phishing corpus, one email per file.
phish_email_list = [
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\phish\20051114",
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\phish\phishing0",
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\phish\phishing1",
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\phish\phishing2",
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\phish\phishing3",
]


In [134]:
# One DataFrame per phishing folder, in the order the folders are listed.
phish_lst = [create_phish_df(phish_folder) for phish_folder in phish_email_list]


In [135]:
# Stack the per-folder frames and keep at most the first 5000 rows.
df_phish = pd.concat(phish_lst).iloc[:5000]

In [121]:
# Enron folders; each one is expected to contain a 'ham' sub-folder.
ham_email_list = [
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\enron3",
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\enron4",
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\enron5",
    r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\enron6",
]


In [125]:
def create_ham_df(my_dir):
    """Load every readable email file of ``my_dir/ham`` into a DataFrame.

    "Ham" means a legitimate, non-phishing email.

    Parameters
    ----------
    my_dir : str
        Folder that contains a ``ham`` sub-folder with one email per file.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``title`` and ``content``; ``label`` is always 0
        (non-phish), ``title`` is the file name and ``content`` the file text.

    Raises
    ------
    OSError
        If the ``ham`` folder itself cannot be listed.
    """
    folder = os.path.join(my_dir, 'ham')
    titles = []
    contents = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible across runs and platforms.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            continue  # sub-directories cannot be read as emails
        try:
            with open(path, 'r') as reader:
                text = reader.read()
        except (UnicodeDecodeError, OSError):
            # Best effort: skip emails that cannot be opened or decoded.
            continue
        titles.append(name)
        contents.append(text)

    return pd.DataFrame({'title': titles, 'content': contents, 'label': 0},
                        columns=['label', 'title', 'content'])

In [136]:
# One DataFrame per Enron folder, in the order the folders are listed.
ham_list = [create_ham_df(ham_folder) for ham_folder in ham_email_list]


In [137]:
# Stack the per-folder frames and keep at most the first 5000 rows.
df_ham = pd.concat(ham_list).iloc[:5000]

In [138]:
# Combine ham (label 0) and phishing (label 1) emails into one dataset.
# ignore_index is not set, so rows keep the index values of their source frames.
df_emails = pd.concat([df_ham, df_phish])


In [140]:
# Hold out 30% of the emails for testing; the fixed seed makes the split repeatable.
df_emails_train, df_emails_test = train_test_split(
    df_emails,
    test_size=0.3,
    random_state=0,
)

 First, the text data is cleaned by tokenising and lemmatising the text using WordNet, removing stop words, and removing non-alphabetic strings. A bag-of-words (BOW) approach is used: we look at the histogram of the words within the text, i.e. each word count is treated as a feature. The intuition is that documents are similar if they have similar content, and that from the content alone we can learn something about the meaning of the document.

In [142]:
# Word-frequency distributions: first the training split, then all phishing emails.
tokenizer = RegexpTokenizer(r'\w+')  # keeps runs of word characters only
stop_words = set(stopwords.words('english'))  # common words such as a, an, for, the


def _keep_token(word):
    """Return True for tokens that are neither stop words nor 'font'/'subject'."""
    return not (word in stop_words or word == 'font' or word == 'subject')


text_all = '\n'.join(df_emails_train.content).lower()
tokens_all = list(filter(_keep_token, tokenizer.tokenize(text_all)))
fd = nltk.probability.FreqDist(tokens_all)

phish_text_all = '\n'.join(df_phish.content).lower()
phish_tokens_all = list(filter(_keep_token, tokenizer.tokenize(phish_text_all)))
fd_phish = nltk.probability.FreqDist(phish_tokens_all)

# Text classifier: raw term counts -> TF-IDF weights -> multinomial naive Bayes.
pipeline_steps = [
    ('vect', skft.CountVectorizer(max_df=0.7)),  # text to numerical count vectors
    ('tfidf', skft.TfidfTransformer()),          # term frequency, inverse document frequency
    ('clf', sknb.MultinomialNB()),               # multinomial naive Bayes classification
]
pipeline = skpipe.Pipeline(steps=pipeline_steps)

# Same arguments and seed as the split made in the earlier cell.
df_emails_train, df_emails_test = train_test_split(
    df_emails, test_size=0.3, random_state=0
)
pipeline.fit(df_emails_train.content, df_emails_train.label)

nb_test_predicted = pipeline.predict(df_emails_test.content)

# Score the fitted pipeline on the Enron spam folder. Every spam email is given
# label 0 (non-phish), so each 1 predicted below counts as a false positive.
spam_dir = os.path.join(r"C:\Users\hp\Downloads\Phishing-Detection-master\Phishing-Detection-master\enron5", 'spam')

titles = []
contents = []
labels = []

# sorted(): os.listdir order is arbitrary; sorting keeps the row order reproducible.
for f in sorted(os.listdir(spam_dir)):
    spam_path = os.path.join(spam_dir, f)
    if not os.path.isfile(spam_path):
        continue  # sub-directories cannot be read as emails
    try:
        with open(spam_path, 'r') as reader:
            c = reader.read()
    except (UnicodeDecodeError, OSError):
        # Best effort: skip emails that cannot be opened or decoded.
        continue
    contents.append(c)
    titles.append(f)
    labels.append(0)

df_spam = pd.DataFrame({'title': titles, 'content': contents, 'label': labels},
                       columns=['label', 'title', 'content'])

predictions = pipeline.predict(df_spam.content)

df_spam['predicted_label'] = predictions

print(df_spam)

      label                               title  \
0         0  0002.2001-05-25.SA_and_HP.spam.txt   
1         0  0004.2001-06-12.SA_and_HP.spam.txt   
2         0  0005.2001-06-23.SA_and_HP.spam.txt   
3         0  0006.2001-06-25.SA_and_HP.spam.txt   
4         0  0008.2001-06-25.SA_and_HP.spam.txt   
5         0  0009.2001-06-26.SA_and_HP.spam.txt   
6         0  0010.2001-06-28.SA_and_HP.spam.txt   
7         0  0011.2001-06-29.SA_and_HP.spam.txt   
8         0  0013.2001-06-30.SA_and_HP.spam.txt   
9         0  0014.2001-07-04.SA_and_HP.spam.txt   
10        0  0015.2001-07-05.SA_and_HP.spam.txt   
11        0  0016.2001-07-06.SA_and_HP.spam.txt   
12        0  0018.2001-07-13.SA_and_HP.spam.txt   
13        0  0020.2001-07-28.SA_and_HP.spam.txt   
14        0  0022.2001-08-01.SA_and_HP.spam.txt   
15        0  0023.2001-08-01.SA_and_HP.spam.txt   
16        0  0024.2001-08-01.SA_and_HP.spam.txt   
17        0  0025.2001-08-01.SA_and_HP.spam.txt   
18        0  0028.2001-08-02.SA

In [143]:
# Show the labels predicted for the spam emails (NumPy abbreviates long arrays with "...").
print(predictions)

[1 0 0 ... 0 0 0]


In [144]:
# x: true labels of the spam set (all 0); y: labels predicted by the pipeline.
x = list(df_spam.label)
y = list(predictions)
print(len(x))
print(len(y))

# Pin the class order so the matrix is always 2x2 (rows = true, columns =
# predicted), even when one of the classes never appears in x or y.
class_labels = [0, 1]
results = confusion_matrix(x, y, labels=class_labels)
print(results)
print('Accuracy Score :', accuracy_score(x, y))
# Class 1 has no true samples here, so its precision/recall carry no information.
print(classification_report(x, y, labels=class_labels))

3675
3675
[[2700  975]
 [   0    0]]
Accuracy Score : 0.7346938775510204
             precision    recall  f1-score   support

          0       1.00      0.73      0.85      3675
          1       0.00      0.00      0.00         0

avg / total       1.00      0.73      0.85      3675



In [80]:
# Evaluate the pipeline on the held-out test split.
test_label = list(df_emails_test.label)
predictions1 = list(nb_test_predicted)

results = confusion_matrix(test_label, predictions1)
print(results)

print('Accuracy Score :', accuracy_score(test_label, predictions1))
print(classification_report(test_label, predictions1))

[[1482    3]
 [ 114 1395]]
Accuracy Score : 0.9609218436873748
             precision    recall  f1-score   support

        ham       0.93      1.00      0.96      1485
      phish       1.00      0.92      0.96      1509

avg / total       0.96      0.96      0.96      2994



In [146]:
# Persist the fitted pipeline so it can be reused without retraining.
# (pickle is already imported in the first cell.)
filename = 'finalized1_text_model.sav'
with open(filename, 'wb') as model_file:  # the context manager closes the file handle
    pickle.dump(pipeline, model_file)

# some time later...

# load the model from disk
# NOTE: only unpickle files you trust; pickle.load can execute arbitrary code.
#with open(filename, 'rb') as model_file:
#    loaded_model = pickle.load(model_file)
#result = loaded_model.score(X_test, Y_test)
#print(result)