In [45]:
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [46]:
data = pd.read_csv('email spam detection.csv', encoding='latin-1')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
data.drop_duplicates(inplace=True)
data['v2'] = data['v2'].apply(lambda x: x.lower())

# Tokenization
def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

data['tokenized_text'] = data['v2'].apply(preprocess)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['tokenized_text'])


In [47]:
#data splitting 

In [48]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [49]:
X_train, X_test, Y_train, Y_test = train_test_split(X, data['v1'], test_size=0.2, random_state=42)

In [50]:
multiNB = MultinomialNB()
multiNB.fit(X_train, Y_train)

BerNB = BernoulliNB()
BerNB.fit(X_train, Y_train)

gaussNB=GaussianNB()
gaussNB.fit(X_train.toarray(), Y_train)

In [51]:
multiNBPre=multiNB.predict(X_test)
BerNBPre=BerNB.predict(X_test)
gaussNBpre = gaussNB.predict(X_test.toarray())


In [52]:
from sklearn.metrics import accuracy_score
print("From Multinomial model",accuracy_score(Y_test,multiNBPre))
print("From Bernoulli model",accuracy_score(Y_test,BerNBPre))
print("From GaussianNB model",accuracy_score(Y_test,gaussNBpre))

From Multinomial model 0.9690522243713733
From Bernoulli model 0.9709864603481625
From GaussianNB model 0.8733075435203095


In [29]:
#the best model for email spam detection is BernoulliNB

In [53]:
import joblib
joblib.dump(BerNB, 'model_joblib.pkl')

['model_joblib.pkl']

In [54]:
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)