<a href="https://colab.research.google.com/github/TaniyaAgrawal17/dev-ada-phising/blob/main/Phishing_Email_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import ssl
import os
from bs4 import BeautifulSoup


# Fetch the NLTK data packages at runtime. Per the download log these unpack as
# tokenizers/punkt, corpora/stopwords and tokenizers/punkt_tab.
# NOTE(review): presumably these back the `word_tokenize` and `stopwords`
# imports above; neither is called in the cells visible here - confirm they
# are still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Download dataset

In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
import opendatasets as od
# Downloads the Kaggle "phishing-email-dataset" archive into
# ./phishing-email-dataset (relative to the current working directory).
# The call prompts interactively for a Kaggle username and API key.
od.download("https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset?select=phishing_email.csv")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: carolineef
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset
Downloading phishing-email-dataset.zip to ./phishing-email-dataset


100%|██████████| 77.1M/77.1M [00:00<00:00, 125MB/s]





In [None]:
# Sanity-check the download: list the working directory and the dataset folder
# created beneath it. The download target was "./phishing-email-dataset", so
# relative paths are used here; a hard-coded "/content" prefix only exists on
# Colab (where it is the default working directory) and fails elsewhere.
print(os.listdir("."))
print(os.listdir("phishing-email-dataset"))

['.config', 'phishing-email-dataset', 'sample_data']
['Ling.csv', 'CEAS_08.csv', 'phishing_email.csv', 'SpamAssasin.csv', 'Enron.csv', 'Nigerian_Fraud.csv', 'Nazario.csv']


In [None]:
# Load the combined CSV from the folder the download step created under the
# current working directory (on Colab this resolves to
# /content/phishing-email-dataset), then drop exact duplicate rows.
# drop_duplicates() is chained instead of using inplace=True so the cell builds
# `df` in a single expression and re-running it always starts from the file.
df = pd.read_csv("phishing-email-dataset/phishing_email.csv").drop_duplicates()

# Cleaning data

In [None]:
def preprocess_text(text, unwanted_terms=()):
    """Normalize one email body into a lowercase, punctuation-free string.

    Steps, in order:
      1. Missing values (None / NaN) become an empty string.
      2. Lowercase the text.
      3. Strip HTML markup with BeautifulSoup's 'html.parser'.
      4. Drop every token that contains a digit.
      5. Drop three-letter weekday and month abbreviations (whole words only).
      6. Drop each entry of ``unwanted_terms`` (whole words, case-insensitive).
      7. Drop punctuation (anything that is neither a word character nor
         whitespace).
      8. Drop words made only of underscores.
      9. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw email text. Non-string values are converted with ``str``;
            None and NaN yield ''.
        unwanted_terms: Iterable of literal terms to remove. Defaults to none.

    Returns:
        The cleaned text as a str.
    """
    # Empty CSV cells are read as NaN (a float), which has no .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()

    # HTML must be stripped before digit tokens are removed: deleting the "h1"
    # from "<h1 style=...>" leaves "< style=...>", which the parser no longer
    # recognizes as a tag, so attribute text would leak into the output.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Removes any token containing a digit (plain numbers and mixed tokens alike)
    text = re.sub(r'\b\w*\d\w*\b', '', text)

    # Removes date fragments: weekday and month abbreviations as whole words
    text = re.sub(r'\b(?:mon|tue|wed|thu|fri|sat|sun)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b', '', text, flags=re.IGNORECASE)

    # Removes terms that frequently show up but have no meaning
    for term in unwanted_terms:
        text = re.sub(rf'\b{re.escape(term)}\b', '', text, flags=re.IGNORECASE)

    # Removes punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Removes words that consist only of underscores ("_" counts as a word
    # character, so the punctuation step above leaves them in place)
    text = re.sub(r'\b_+\b', '', text)

    # Whitespace is normalized last so the gaps left by every removal above
    # (including the underscore words) are collapsed and trimmed.
    return re.sub(r'\s+', ' ', text).strip()

# Tokens that show up frequently in this corpus but carry no meaning for the
# analysis; they are stripped from every email in addition to the generic
# cleaning steps.
unwanted_terms = [
    'enron',
    'hpl',
    'nom',
    'forwarded',
    '2008',
    '10',
    'hplno',
    'xls',
]
# Overwrite the text column with its cleaned version, one email at a time.
df['text_combined'] = df['text_combined'].apply(preprocess_text, args=(unwanted_terms,))

def get_top_n_words(text, n=10):
    """Return the ``n`` most frequent words across a collection of documents.

    Words are counted with a CountVectorizer configured with its built-in
    English stop-word list, and the counts are summed over all documents.

    Args:
        text: Iterable of document strings.
        n: Number of (word, count) pairs to return. Defaults to 10.

    Returns:
        A list of ``(word, total_count)`` tuples ordered by count, highest
        first; ties keep the vectorizer's feature order.
    """
    counter = CountVectorizer(stop_words='english')
    doc_term_counts = counter.fit_transform(text)

    vocabulary = counter.get_feature_names_out()
    # Column sums give one total per vocabulary entry; flatten to 1-D.
    totals = np.asarray(doc_term_counts.sum(axis=0)).ravel()

    ranked = sorted(zip(vocabulary, totals), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Split the cleaned emails by class: rows with label 1 are treated as
# phishing, rows with label 0 as non-phishing.
labels = df['label']
phishing_text = df.loc[labels == 1, 'text_combined']
non_phishing_text = df.loc[labels == 0, 'text_combined']

# Ten most frequent words per class, as (word, count) pairs
phishing_top_words = get_top_n_words(phishing_text, n=10)
non_phishing_top_words = get_top_n_words(non_phishing_text, n=10)

# Tabular view of the same pairs
top_word_columns = ['Word', 'Frequency']
phishing_df = pd.DataFrame(phishing_top_words, columns=top_word_columns)
non_phishing_df = pd.DataFrame(non_phishing_top_words, columns=top_word_columns)

# print("Top 10 Words in Phishing Emails:")
# for word, freq in phishing_top_words:
#     print(f"{word}: {freq}")

# print("\nTop 10 Words in Non-Phishing Emails:")
# for word, freq in non_phishing_top_words:
#     print(f"{word}: {freq}")

# # Word Cloud for Phishing Emails
# wordcloud_phishing = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(dict(phishing_top_words))
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud_phishing, interpolation="bilinear")
# plt.axis("off")
# plt.title("Word Cloud - Phishing Emails")
# plt.show()

# # Word Cloud for Non-Phishing Emails
# wordcloud_non_phishing = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(dict(non_phishing_top_words))
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud_non_phishing, interpolation="bilinear")
# plt.axis("off")
# plt.title("Word Cloud - Non-Phishing Emails")
# plt.show()

# TF-IDF

In [None]:
# TF-IDF features over the cleaned text: English stop words are dropped and the
# vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# X is a sparse input matrix where each row represents an email and each column
# represents a word's importance. It is fitted on every row of df, i.e. before
# any train/test split.
X = vectorizer.fit_transform(df['text_combined'])
y = df['label']

# print(df['text_combined'].head(10))
# print(vectorizer.get_feature_names_out()[:50])

# Data frame view of the TF-IDF matrix for inspection. It is built from the
# sparse matrix directly: X.toarray() would allocate a dense rows x 5000 float
# array, which for tens of thousands of emails runs into gigabytes.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())


# Training

In [None]:
# Split the cleaned text (not the already-vectorized X) into training (80%) and
# testing (20%) sets, stratified on the label. Splitting before vectorizing
# keeps the TF-IDF vocabulary and IDF weights from being learned on test
# emails; fitting TF-IDF on all rows first leaks test-set statistics into the
# features and can bias the accuracy reported below.
text_train, text_test, y_train, y_test = train_test_split(
    df['text_combined'], y, test_size=0.2, random_state=42, stratify=y
)

# A fresh vectorizer with the same settings as `vectorizer`, fitted on the
# training emails only; the test emails are transformed with that fitted
# vocabulary. Use `train_vectorizer` to featurize any text passed to svm_model.
train_vectorizer = TfidfVectorizer(**vectorizer.get_params())
X_train = train_vectorizer.fit_transform(text_train)
X_test = train_vectorizer.transform(text_test)

# Train the model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate the model: mean accuracy on the held-out 20%
accuracy = svm_model.score(X_test, y_test)
print(accuracy)

0.9809941520467836
