<a href="https://colab.research.google.com/github/Shubham-mohapatra/naivebayes-email-spam-detection/blob/main/Naive_bayes_Spam_Email_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pandas numpy scikit-learn nltk google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client flask joblib




In [6]:
import pandas as pd

df = pd.read_csv('spam.csv',encoding='latin-1')
df = df[['v1','v2']]
df.rename(columns={'v1':'label','v2':'text'},inplace=True)



print(df.head())

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
#Clean the text

import re

def clean_text(text):
  text = re.sub(r'\W',' ',text) # for special chars
  text = re.sub(r'\d',' ',text) # remove the extra digits
  text = re.sub(r'\s+',' ',text) # removes the extra spaces
  text = text.strip().lower()
  return text


df['text']  = df['text'].apply(clean_text)
print(df.head())

  label                                               text
0   ham  go until jurong point crazy available only in ...
1   ham                            ok lar joking wif u oni
2  spam  free entry in a wkly comp to win fa cup final ...
3   ham        u dun say so early hor u c already then say
4   ham  nah i don t think he goes to usf he lives arou...


In [9]:
# remove the stopwords like "is","and","the"

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  words = text.split()
  filtered_words = [word for word in words if word not in stop_words]
  return ' '.join(filtered_words)

  df['text'] = df['text'].apply(remove_stopwords)
  print(df.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
#Tokenization og text into individual texts


def tokenize_text(text):
  return text.split()

df['tokens'] = df['text'].apply(tokenize_text)
print(df.head())

  label                                               text  \
0   ham  go until jurong point crazy available only in ...   
1   ham                            ok lar joking wif u oni   
2  spam  free entry in a wkly comp to win fa cup final ...   
3   ham        u dun say so early hor u c already then say   
4   ham  nah i don t think he goes to usf he lives arou...   

                                              tokens  
0  [go, until, jurong, point, crazy, available, o...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, in, a, wkly, comp, to, win, fa, ...  
3  [u, dun, say, so, early, hor, u, c, already, t...  
4  [nah, i, don, t, think, he, goes, to, usf, he,...  


In [12]:
#using stemming and lemmatization to further reduce the words to their base form

# eg - "running" -> "run"

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

#Stemming
stemmer = PorterStemmer()
def stem_text(text):
  words = text.split()
  words = [stemmer.stem(word) for word in words]
  return ' '.join(words)


print(df.head())

  label                                               text  \
0   ham  go until jurong point crazy available only in ...   
1   ham                            ok lar joking wif u oni   
2  spam  free entry in a wkly comp to win fa cup final ...   
3   ham        u dun say so early hor u c already then say   
4   ham  nah i don t think he goes to usf he lives arou...   

                                              tokens  
0  [go, until, jurong, point, crazy, available, o...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, in, a, wkly, comp, to, win, fa, ...  
3  [u, dun, say, so, early, hor, u, c, already, t...  
4  [nah, i, don, t, think, he, goes, to, usf, he,...  


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
df['label'] = df['label'].map({'ham':0,'spam':1})
print(df.head())

   label                                               text  \
0      0  go until jurong point crazy available only in ...   
1      0                            ok lar joking wif u oni   
2      1  free entry in a wkly comp to win fa cup final ...   
3      0        u dun say so early hor u c already then say   
4      0  nah i don t think he goes to usf he lives arou...   

                                              tokens  
0  [go, until, jurong, point, crazy, available, o...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, in, a, wkly, comp, to, win, fa, ...  
3  [u, dun, say, so, early, hor, u, c, already, t...  
4  [nah, i, don, t, think, he, goes, to, usf, he,...  


In [14]:
#spliting the dataset

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(df['text'],df['label'],test_size=0.2,random_state=42)

In [17]:
#TF-IDF Vetorization

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF vectorized training data shape : ",X_train_tfidf.shape)
print("TF-IDF vectorized testing data shape : ",X_test_tfidf.shape)

TF-IDF vectorized training data shape :  (4457, 6910)
TF-IDF vectorized testing data shape :  (1115, 6910)


In [25]:
#train naive bayes model

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib

model = MultinomialNB()
model.fit(X_train_tfidf,y_train)

y_pred = model.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

joblib.dump(model, 'spam_classifier_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


Accuracy: 0.9560538116591928


['tfidf_vectorizer.pkl']