In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Implement Naive Bayes

Here, we will use Naive Bayes as our first classifier. We’ll train it on the email features and see how well it performs at detecting phishing emails (or potential human trafficking communications).

In [9]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/cleaned_phishing_emails.csv')

# Use TfidfVectorizer to convert email text (subject and body) to numerical data
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Combine 'subject' and 'body' columns into one text column
df['text'] = df['subject'] + " " + df['body']

# Fill missing values in the 'text' column with an empty string
df['text'].fillna('', inplace=True)

# Transform text data into TF-IDF features
X = vectorizer.fit_transform(df['text'])

# Label column
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize Naive Bayes classifier
nb = MultinomialNB()

# Train the classifier
nb.fit(X_train, y_train)

# Predict on the test set
y_pred = nb.predict(X_test)

# Print the classification report and accuracy
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna('', inplace=True)


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5198
           1       1.00      0.96      0.98      6549

    accuracy                           0.98     11747
   macro avg       0.98      0.98      0.98     11747
weighted avg       0.98      0.98      0.98     11747

Accuracy: 0.9781220737209501


Naive Bayes Confusion Matrix

# Apply Sentiment Analysis

We'll use VADER for sentiment analysis and add the results as new features for further model building.

In [7]:
# Install vaderSentiment library
!pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to extract sentiment scores
def get_sentiment(text):
    sentiment_score = analyzer.polarity_scores(text)
    return sentiment_score['compound']  # Use compound score for polarity

# Apply sentiment analysis on 'text' column (combined subject and body)
df['sentiment'] = df['text'].apply(get_sentiment)

# Adding sentiment as a feature in the model
X_with_sentiment = df[['text', 'sentiment']]  # Add both text and sentiment as features

# Transform text data into TF-IDF features
X_sentiment = vectorizer.fit_transform(df['text'])

# Split the data into training and testing sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_sentiment, y, test_size=0.3, random_state=42)

# Train Naive Bayes with sentiment data
nb.fit(X_train_s, y_train_s)

# Predict on test data
y_pred_s = nb.predict(X_test_s)

# Evaluate the performance
print("Naive Bayes with Sentiment Analysis - Classification Report:")
print(classification_report(y_test_s, y_pred_s))
print("Accuracy:", accuracy_score(y_test_s, y_pred_s))


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m92.2/126.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Naive Bayes with Sentiment Analysis - Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5198
           1       1.00      0.96      0.98      6549

    accuracy                           0.98     11747
   macro avg       0.98      0.98      0.98     11747
weighted avg       0.98      0.98  

# Compare Results with SVM
Now that we have a baseline with Naive Bayes, we now build an SVM model and see if it performs better.

In [8]:
from sklearn.svm import SVC

# Initialize SVM model
svm = SVC(kernel='linear')

# Train the SVM model
svm.fit(X_train_s, y_train_s)

# Predict using SVM on test data
y_pred_svm = svm.predict(X_test_s)

# Evaluate the SVM model
print("SVM Classification Report:")
print(classification_report(y_test_s, y_pred_svm))
print("Accuracy:", accuracy_score(y_test_s, y_pred_svm))


SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      5198
           1       0.99      1.00      1.00      6549

    accuracy                           1.00     11747
   macro avg       1.00      1.00      1.00     11747
weighted avg       1.00      1.00      1.00     11747

Accuracy: 0.9955733378734997


SVM has a higher accuracy than Naive Baiyes Classification therefore we to go SVM