In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saefurukawa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the phishing e-mail dataset; CSV column 0 is used as the DataFrame index.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for this path.
# The leading '.src' may be a typo for './src' — TODO confirm where the dataset lives
# relative to the notebook's working directory.
data = pd.read_csv('.src/models/datasets/Phishing_Email.csv',index_col=[0])

FileNotFoundError: [Errno 2] No such file or directory: '.src/models/datasets/Phishing_Email.csv'

In [None]:
# Size of the loaded frame as (rows, columns).
data.shape

In [None]:
# Preview the first few rows to check that the columns were parsed as expected.
data.head()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Distinct label values found in the 'Email Type' column.
data['Email Type'].unique()

In [None]:
# Bar chart of how many e-mails carry each 'Email Type' label.
data['Email Type'].value_counts().plot.bar(legend=True)

In [None]:
# Build the cleaned training corpus: one normalised, stemmed string per e-mail.
#
# The stemmer is created once, before the loop (it used to be rebuilt for every
# e-mail), and `stop_words` is tested directly because it is already a set (it used
# to be copied into a fresh set for every single word). The resulting corpus is the
# same; only the redundant work is gone. `stemmer` stays a module-level name.
stemmer = PorterStemmer()
corpus = []
for text in data['Email Text']:
    # str() guards against non-string cells; note a NaN cell becomes the text 'nan'.
    email = re.sub('[^a-zA-Z]', ' ', str(text))  # every non-ASCII-letter becomes a space
    email = email.lower()
    email = email.split()
    # Drop stop words, then reduce the remaining words to their stems.
    email = [stemmer.stem(word) for word in email if word not in stop_words]
    email = ' '.join(email)
    corpus.append(email)

In [None]:
# Bag-of-words features: vocabulary capped at 10,000 terms, materialised as a dense array.
cv = CountVectorizer(max_features=10000)
X = cv.fit_transform(corpus)
X = X.toarray()

# Targets are taken from the last column of the frame.
y = data.iloc[:, -1].values

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Fit every Naive Bayes variant on the training split, then print its confusion
# matrix and accuracy on the test split so the variants can be compared.
init_models = {
    'gaussian': GaussianNB(),
    'multinomial': MultinomialNB(),
    'compliment': ComplementNB(),
    'bernaulli': BernoulliNB(),
}

divider = '-' * 20
for key, clf in init_models.items():
    print(divider)
    print(f'Using : {key}')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Confusion matrix for : {key}\n{cm}")
    print(f"Accuracy score for : {key}\n{accuracy}")
    print(divider)

In [None]:
# ComplementNB seems to perform the best, so that is the model being tuned here.

from sklearn.model_selection import GridSearchCV

param_grid = {
    # Smoothing parameter candidates: 0.1, 0.2, ..., 1.0
    'alpha': [step / 10 for step in range(1, 11)],
    # Whether to learn class prior probabilities or not
    'fit_prior': [True, False],
}

model = ComplementNB()

# Exhaustive search over the grid, scored by accuracy with 10-fold cross-validation.
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=10)
grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validated score: ", grid_search.best_score_)

params = grid_search.best_params_

In [None]:
# MultinomialNB: same alpha / fit_prior search, scored by 10-fold cross-validated accuracy.
param_grid = dict(
    alpha=[tenth / 10 for tenth in range(1, 11)],  # smoothing parameter: 0.1 ... 1.0
    fit_prior=[True, False],                       # learn class priors, or not
)

model = MultinomialNB()

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validated score: ", grid_search.best_score_)

params = grid_search.best_params_

In [None]:
# BernoulliNB: tune the smoothing parameter and the class-prior switch.
alpha_candidates = [k / 10 for k in range(1, 11)]  # 0.1, 0.2, ..., 1.0
param_grid = {
    'alpha': alpha_candidates,       # smoothing parameter
    'fit_prior': [True, False],      # whether class priors are learned
}

model = BernoulliNB()

# Accuracy-scored grid search using 10 cross-validation folds.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validated score: ", grid_search.best_score_)

params = grid_search.best_params_

In [None]:
# GaussianNB
# GaussianNB has neither an `alpha` nor a `fit_prior` parameter, so a grid over those
# names makes GridSearchCV fail with an invalid-parameter error before any model is
# fitted. Its tunable smoothing setting is `var_smoothing`, so search over that instead.
param_grid = {
    # Portion of the largest feature variance added to all variances for stability
    # (library default is 1e-9); searched on a log scale.
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}

model = GaussianNB()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validated score: ", grid_search.best_score_)

params = grid_search.best_params_

In [None]:
# Final classifier: ComplementNB with fixed hyperparameters.
# Accuracy score noted by the author: 95.20 %
final_nb_settings = {'alpha': 0.1, 'fit_prior': True}
complementNB_model = ComplementNB(**final_nb_settings)
complementNB_model.fit(X_train, y_train)

In [None]:
def preprocess(email_content):
    """Normalise one raw e-mail the same way the training corpus was normalised.

    Steps: replace every character that is not an ASCII letter with a space,
    lowercase, split on whitespace, drop stop words, stem the remaining words
    and join them back with single spaces.

    Args:
        email_content: Raw e-mail text. Non-string values are converted with
            ``str()``, as the training loop does.

    Returns:
        A one-element list holding the processed string, so the result can be
        passed straight to ``cv.transform``.
    """
    # Strip non-letters BEFORE lowercasing, as the training loop does. Lowercasing
    # first can turn some non-ASCII characters into ASCII letters and so produce
    # tokens the training corpus never contained.
    email = re.sub('[^a-zA-Z]', ' ', str(email_content)).lower()

    # Tokenize the text into words
    words = email.split()

    # Use a stemmer owned by this function rather than relying on a `stemmer`
    # variable left behind by an earlier cell's loop.
    word_stemmer = PorterStemmer()

    # Remove stop words, then stem what is left
    stemmed_words = [word_stemmer.stem(word) for word in words if word not in stop_words]

    # Combine the processed words back into a single string, wrapped in a list
    return [' '.join(stemmed_words)]

In [None]:
# Example usage: classify one hand-written e-mail with the fitted model.
new_email_content = "This is a sample email. Please check this out."

# Normalise the text; preprocess returns a one-element list of cleaned text
processed_new_email = preprocess(new_email_content)
print(processed_new_email)

# Reuse the already-fitted CountVectorizer (transform, not fit_transform) to get the feature array
numerical_features = cv.transform(processed_new_email).toarray()

# Predict the label with the fitted ComplementNB model
prediction = complementNB_model.predict(numerical_features)

# Show the predicted label(s) — one entry per input document
print(prediction)
