In [None]:
# https://www.kaggle.com/code/abdmental01/nlp-email-spam-detection-a-beginner-s-guide#The-End-%7C-Upvote-If-you-found-Notebook-Useful

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Punctuations
import string
# Pandas
import pandas as pd
# Remove Stopwords
from nltk.corpus import stopwords 
# Regular Expressions
import re
# Import PorterStemmer from NLTK Library
from nltk.stem.porter import PorterStemmer
# Models 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB , MultinomialNB , BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
# Metrics, train/test split and text vectorizers
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the mail dataset from a path relative to the notebook's working directory.
# Later cells read the 'Message' and 'Category' columns of this frame.
df_train = pd.read_csv('data/mail_data.csv')
# Preview the first rows to confirm the file parsed as expected.
df_train.head()

In [None]:
# Total number of missing cells across all columns (per-column counts summed again).
print(f'Null Values in Train Dataset is : {df_train.isnull().sum().sum()}') # null check

In [None]:
# Number of rows that are exact repeats of an earlier row (first occurrence is not counted).
print(f"Duplicates values in Train Dataset is : {df_train.duplicated().sum()}") # duplicated check

In [None]:
# Keep only the first occurrence of each fully identical row.
df_train = df_train.drop_duplicates()

In [None]:
# Per-column count of missing values, displayed as the cell output.
df_train.isnull().sum()


## Text Preprocessing

In [None]:
# Lower-case everything so later token comparisons are case-insensitive.
df_train['Message'] = df_train['Message'].str.lower()
# Remove the hashtag / mention markers themselves; the word that follows is kept.
df_train['Message'] = df_train['Message'].str.replace('#', '', regex=False)
df_train['Message'] = df_train['Message'].str.replace('@', '', regex=False)
# The URL pattern is a regular expression, so regex=True must be passed explicitly:
# from pandas 2.0 on, Series.str.replace treats the pattern as a literal string by
# default, which made this line a silent no-op.
# NOTE(review): the leading '^' means only text that *starts* with a URL is removed.
df_train['Message'] = df_train['Message'].str.replace(r'^https?:\/\/.*[\r\n]*', '', regex=True)

import string

import nltk
from nltk.tokenize import sent_tokenize

# Download the NLTK resources BEFORE anything reads them. Previously the downloads
# sat at the bottom of this cell, so on a fresh environment stopwords.words() and
# sent_tokenize() raised LookupError before the data had been fetched.
nltk.download('punkt')
# Newer NLTK releases load the sentence tokenizer from 'punkt_tab' instead of 'punkt';
# on older releases this request is simply reported as unknown and skipped.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Strip every ASCII punctuation character from the messages.
df_train['Message'] = df_train['Message'].str.translate(str.maketrans('', '', string.punctuation))

# `stop_words` stays a list (other code may rely on that); the set is only used for
# fast membership tests inside the per-message filter below.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
df_train['Message'] = df_train['Message'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_word_set])
)

# NOTE(review): punctuation has already been removed at this point, so sentence
# boundaries are mostly gone and each message will usually come back as one sentence.
df_train['text_sent_token'] = df_train['Message'].apply(sent_tokenize)

# One shared Porter stemmer instance, reused for every message.
stemmer = PorterStemmer()


def stem_words(text):
    """Return ``text`` with each whitespace-separated token replaced by its Porter stem.

    Tokens are obtained with ``str.split()`` and re-joined with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)


# Store the stemmed text in its own column so the cleaned 'Message' column is kept as is.
df_train['stem_msg'] = df_train['Message'].apply(stem_words)
df_train.head()

## Vectorizing Text and Encoding Labels

In [None]:
# Bag-of-words features: learn the vocabulary from the stemmed messages and count
# token occurrences per message; .toarray() converts the result to a dense array.
cv = CountVectorizer()
X = cv.fit_transform(df_train['stem_msg']).toarray()

# Target column, still holding the raw category values at this point.
y = df_train['Category']

print("X shape -> ", X.shape)
print("y shape -> ", y.shape)

# Replace the category values with integer codes. LabelEncoder numbers the classes in
# sorted order — presumably ham=0 / spam=1 here; verify with le.classes_, because the
# precision computed later treats label 1 as the positive class.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Hold out 20% of the samples for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Each classifier is created exactly once (this block used to build all nine twice).
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# random_state added: the tree permutes features when choosing between equally good
# splits and liblinear shuffles the data, so without a seed repeated runs can differ.
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)


def fit_and_predict(model):
    """Fit ``model`` on the training split and return its predictions for the test split.

    Reads ``X_train``, ``y_train`` and ``X_test`` from the notebook's global scope.
    The model is fitted in place, so the fitted estimator stays available afterwards.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test)


# Prediction variable names are unchanged so the evaluation cell keeps working.
svc_pred = fit_and_predict(svc)
knn_pred = fit_and_predict(knc)
mnb_pred = fit_and_predict(mnb)
dtc_pred = fit_and_predict(dtc)
lrc_pred = fit_and_predict(lrc)
rfc_pred = fit_and_predict(rfc)
abc_pred = fit_and_predict(abc)
etc_pred = fit_and_predict(etc)
xgb_pred = fit_and_predict(xgb)

In [None]:
# All nine models were already fitted, and their test-set predictions stored, in the
# previous cell. This cell used to repeat the identical fit/predict sequence, which
# only doubled the training time. It now just checks that every prediction vector
# lines up with the held-out labels before the metrics are computed.
for _model_name, _model_pred in [
    ("svc", svc_pred),
    ("knc", knn_pred),
    ("mnb", mnb_pred),
    ("dtc", dtc_pred),
    ("lrc", lrc_pred),
    ("rfc", rfc_pred),
    ("abc", abc_pred),
    ("etc", etc_pred),
    ("xgb", xgb_pred),
]:
    assert len(_model_pred) == len(y_test), (
        f"{_model_name}: got {len(_model_pred)} predictions for {len(y_test)} test labels"
    )

In [None]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

def evaluate(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels, in the same order as ``y_test``.

    Returns:
        A 3-tuple ``(accuracy, precision, confusion)`` holding the results of
        ``accuracy_score``, ``precision_score`` and ``confusion_matrix``.
    """
    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        confusion_matrix(y_test, y_pred),
    )


def _print_report(model_label, accuracy, precision, confusion):
    """Print one model's accuracy, precision and confusion matrix in the shared layout."""
    print(f"The Accuracy Score Of {model_label} is {accuracy}, Precision Is {precision},\nConfusion Matrix is \n{confusion} ")


# Each model is scored and reported in turn; the per-model result variables keep
# their names so they remain available to any later cell.
accuracy_SVC, precision_SVC, confusion_SVC = evaluate(y_test, svc_pred)
_print_report("SVC", accuracy_SVC, precision_SVC, confusion_SVC)

accuracy_KNN, precision_KNN, confusion_KNN = evaluate(y_test, knn_pred)
_print_report("KNN", accuracy_KNN, precision_KNN, confusion_KNN)

accuracy_MNB, precision_MNB, confusion_MNB = evaluate(y_test, mnb_pred)
_print_report("MultinomialNB", accuracy_MNB, precision_MNB, confusion_MNB)

accuracy_DTC, precision_DTC, confusion_DTC = evaluate(y_test, dtc_pred)
_print_report("Decision Tree", accuracy_DTC, precision_DTC, confusion_DTC)

accuracy_LR, precision_LR, confusion_LR = evaluate(y_test, lrc_pred)
_print_report("Logistic Regression", accuracy_LR, precision_LR, confusion_LR)

accuracy_RF, precision_RF, confusion_RF = evaluate(y_test, rfc_pred)
_print_report("Random Forest Classifier", accuracy_RF, precision_RF, confusion_RF)

accuracy_ADC, precision_ADC, confusion_ADC = evaluate(y_test, abc_pred)
_print_report("AdaBoost Classifier", accuracy_ADC, precision_ADC, confusion_ADC)

accuracy_ETC, precision_ETC, confusion_ETC = evaluate(y_test, etc_pred)
_print_report("Extra Tree Classifier", accuracy_ETC, precision_ETC, confusion_ETC)

accuracy_XGB, precision_XGB, confusion_XGB = evaluate(y_test, xgb_pred)
_print_report("XGBoost Classifier", accuracy_XGB, precision_XGB, confusion_XGB)