### Importing libraries

In [81]:
# Data Wrangling
import numpy as np
import pandas as pd
import re

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NLP Libraries
import nltk
from nltk.corpus import stopwords

# Data visualization
import matplotlib.pyplot as mplt
import seaborn as sb

### Importing Dataset

In [82]:
# Read Phishing Email Data Set from Kaggle
phishing_dt = pd.read_csv('./Phishing_Email.csv')
phishing_dt.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


### EDA

In [83]:
# Print total rows and columns in the data set
rows = len(phishing_dt.index)
columns = len(phishing_dt.columns)

print("Total Rows: ",rows)
print("Total Columns: ",columns)

Total Rows:  18650
Total Columns:  3


In [84]:
# Checking data type on every attribute
phishing_dt.dtypes

Unnamed: 0     int64
Email Text    object
Email Type    object
dtype: object

In [85]:
# Generate descriptive statistics of dataset
phishing_dt.describe(include = 'all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0,18650.0,,,,9325.154477,5384.327293,0.0,4662.25,9325.5,13987.75,18650.0
Email Text,18634.0,17537.0,empty,533.0,,,,,,,
Email Type,18650.0,2.0,Safe Email,11322.0,,,,,,,


Checking for missing values

In [86]:
# Missing values in data set
print('Summary of missing values for every attribute:')
print(phishing_dt.isnull().sum())
print()

missing_val = pd.DataFrame(phishing_dt.isnull().sum())

missing_data = 0
for i in phishing_dt.isnull().sum():
    missing_data += i

if missing_data == 0:
    print('No Missing Values in Data')
else:
    print('List of attributes with missing values:')
    dataset_null_val = dict(phishing_dt.isnull().sum())
    for i in dataset_null_val:
        if dataset_null_val[i]>0:
            print(i, ':', dataset_null_val[i])

Summary of missing values for every attribute:
Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64

List of attributes with missing values:
Email Text : 16


Finding other unnecessary values

In [87]:
# Check if "Email Text" column has body = 'empty'
empty_email_text_rows = phishing_dt[phishing_dt['Email Text'] == 'empty'].shape[0]

if empty_email_text_rows > 0:
    print(f'There are {empty_email_text_rows} rows with an empty value in the "Email Text" column.')
else:
    print('No rows have an empty value in the "Email Text" column.')

There are 533 rows with an empty value in the "Email Text" column.


### Data Preprocessing

We are cleaning data by dropping the rows that have null value.

In [88]:
phishing_dt.fillna(pd.NA, inplace=True)

cleaned_phishing_dt = phishing_dt.dropna()

cleaned_phishing_dt

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...,...
18645,18646,date a lonely housewife always wanted to date ...,Phishing Email
18646,18647,request submitted : access request for anita ....,Safe Email
18647,18648,"re : important - prc mtg hi dorn & john , as y...",Safe Email
18648,18649,press clippings - letter on californian utilit...,Safe Email


Dropping the rows that have value 'empty' in the Email Text Column

In [89]:
cleaned_phishing_dt[cleaned_phishing_dt['Email Text'] == 'empty'] = pd.NA

final_cleaned_phishing_dt = cleaned_phishing_dt.dropna()

final_cleaned_phishing_dt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_phishing_dt[cleaned_phishing_dt['Email Text'] == 'empty'] = pd.NA


Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0.0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1.0,the other side of * galicismos * * galicismo *...,Safe Email
2,2.0,re : equistar deal tickets are you still avail...,Safe Email
3,3.0,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4.0,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...,...
18644,18645.0,\nRick Moen a Ã©crit:> > I'm confused. I thou...,Safe Email
18645,18646.0,date a lonely housewife always wanted to date ...,Phishing Email
18646,18647.0,request submitted : access request for anita ....,Safe Email
18647,18648.0,"re : important - prc mtg hi dorn & john , as y...",Safe Email


Importing some NLP for grammar analysis and preprocessing

In [90]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Column cleaning helper function


In [91]:
def clean_column(data):
    if data is not None:
        stopwords_list = stopwords.words('english')
        #exclusions = ['RE:', 'Re:', 're:']
        #exclusions = '|'.join(exclusions)
        data =  data.lower()
        data = re.sub('re:', '', data)
        data = re.sub('re :', '', data)
        data = re.sub('-', '', data)
        data = re.sub('_', '', data)
        # Remove data between square brackets
        data =re.sub('\[[^]]*\]', '', data)
        # removes punctuation
        data = re.sub(r'[^\w\s]','',data)
        data = re.sub(r'\n',' ',data)
        data = re.sub(r'[0-9]+','',data)
        # strip html
        p = re.compile(r'<.*?>')
        data = re.sub(r"\'ve", " have ", data)
        data = re.sub(r"can't", "cannot ", data)
        data = re.sub(r"n't", " not ", data)
        data = re.sub(r"I'm", "I am", data)
        data = re.sub(r" m ", " am ", data)
        data = re.sub(r"\'re", " are ", data)
        data = re.sub(r"\'d", " would ", data)
        data = re.sub(r"\'ll", " will ", data)

        data = p.sub('', data)
        if 'forwarded by:' in data:
            data = data.split('subject')[1]
        data = data.strip()
        return data
    return 'No Subject'

Creating Final Dataset with all columns for similarity calculation

In [92]:
final_cleaned_phishing_dt['Email Text New'] = final_cleaned_phishing_dt['Email Text'].apply(clean_column)

final_cleaned_phishing_dt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_cleaned_phishing_dt['Email Text New'] = final_cleaned_phishing_dt['Email Text'].apply(clean_column)


Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,Email Text New
0,0.0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,disc uniformitarianism sex lang dick hud...
1,1.0,the other side of * galicismos * * galicismo *...,Safe Email,the other side of galicismos galicismo is ...
2,2.0,re : equistar deal tickets are you still avail...,Safe Email,equistar deal tickets are you still available ...
3,3.0,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,hello i am your hot lil horny toy i am the...
4,4.0,software at incredibly low prices ( 86 % lower...,Phishing Email,software at incredibly low prices lower d...
...,...,...,...,...
18644,18645.0,\nRick Moen a Ã©crit:> > I'm confused. I thou...,Safe Email,rick moen a ãcrit im confused i thought it w...
18645,18646.0,date a lonely housewife always wanted to date ...,Phishing Email,date a lonely housewife always wanted to date ...
18646,18647.0,request submitted : access request for anita ....,Safe Email,request submitted access request for anita d...
18647,18648.0,"re : important - prc mtg hi dorn & john , as y...",Safe Email,important prc mtg hi dorn john as you disco...


In [93]:
safe_emails = final_cleaned_phishing_dt[final_cleaned_phishing_dt['Email Type'] == 'Safe Email']
phishing_emails = final_cleaned_phishing_dt[final_cleaned_phishing_dt['Email Type'] == 'Phishing Email']

Vectorization for generating tokens for Similarity analysis

In [94]:
vectorizer = TfidfVectorizer()
tfidf_matrix_all = vectorizer.fit_transform(final_cleaned_phishing_dt['Email Text New'])
tfidf_matrix_phishing = vectorizer.transform(final_cleaned_phishing_dt[final_cleaned_phishing_dt['Email Type'] == 'Phishing Email']['Email Text New'])

Performing Cosine Similarity for feature extraction

In [95]:
cosine_similarities = cosine_similarity(tfidf_matrix_all, tfidf_matrix_phishing)

final_cleaned_phishing_dt['Phishing Similarity'] = cosine_similarities[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_cleaned_phishing_dt['Phishing Similarity'] = cosine_similarities[:, 1]


Final processed form of dataset

In [100]:
final_cleaned_phishing_dt

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,Email Text New,Phishing Similarity
0,0.0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,disc uniformitarianism sex lang dick hud...,0.011557
1,1.0,the other side of * galicismos * * galicismo *...,Safe Email,the other side of galicismos galicismo is ...,0.019583
2,2.0,re : equistar deal tickets are you still avail...,Safe Email,equistar deal tickets are you still available ...,0.005868
3,3.0,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,hello i am your hot lil horny toy i am the...,0.020386
4,4.0,software at incredibly low prices ( 86 % lower...,Phishing Email,software at incredibly low prices lower d...,1.000000
...,...,...,...,...,...
18644,18645.0,\nRick Moen a Ã©crit:> > I'm confused. I thou...,Safe Email,rick moen a ãcrit im confused i thought it w...,0.014114
18645,18646.0,date a lonely housewife always wanted to date ...,Phishing Email,date a lonely housewife always wanted to date ...,0.002804
18646,18647.0,request submitted : access request for anita ....,Safe Email,request submitted access request for anita d...,0.000000
18647,18648.0,"re : important - prc mtg hi dorn & john , as y...",Safe Email,important prc mtg hi dorn john as you disco...,0.026194


Exporting the processed dataset

In [96]:
final_cleaned_phishing_dt.to_csv('./Phishing_Email_Sim.csv', index=False)