In [53]:
# Importing Libraries

import pandas as pd 
import re
import nltk
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# NOTE(review): `ps` is not referenced by any cell visible in this notebook
# (the preprocessing loop uses WordNetLemmatizer instead) — confirm it is
# unused before removing.
ps = PorterStemmer()
from nltk.stem import WordNetLemmatizer


In [54]:
# Load the raw dataset from the working directory, decoding it as latin-1,
# then preview the first rows.
df = pd.read_csv(
    'spam.csv',
    encoding="latin-1",
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [55]:
# List the column labels to see which ones the CSV actually produced.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [56]:
# Remove the three 'Unnamed' columns in place, keeping only v1 and v2.
# Re-running this cell on the already-trimmed frame raises KeyError.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [57]:
# Give the two remaining columns descriptive names (renamed in place).
readable_names = {'v1': 'class', 'v2': 'message'}
df.rename(columns=readable_names, inplace=True)
df.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [58]:
# Encode the textual class as an integer target: ham -> 0, spam -> 1.
# Any class value outside this mapping becomes NaN in 'label'.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['class'].map(label_codes)
df.head()

Unnamed: 0,class,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [59]:
# Single lemmatizer instance, reused for every token in the preprocessing loop.
# NOTE(review): presumably needs the NLTK 'wordnet' data to be installed
# already — no download step is visible in this notebook; confirm.
lemmatizer = WordNetLemmatizer()

In [60]:
# Build the cleaned corpus: one normalised, space-joined token string per message.
#
# The English stop-word list is loaded once and kept as a set. Calling
# stopwords.words('english') inside the comprehension would rebuild the list
# for every single token and test membership with a linear scan.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly rather than df['message'][i], which
# is a label lookup and only works while the index is the default 0..n-1 range.
for message in df['message']:
    # Replace every non-letter with a space, lower-case, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words, then lemmatize what is left.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(tokens))

In [61]:
# Show the first cleaned message (as a one-element list) as a sanity check.
corpus[:1]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat']

In [62]:
# Bag-of-words features: learn the vocabulary from the cleaned corpus and
# count token occurrences per message.
# NOTE(review): the vocabulary is fitted on all messages, including the ones
# later held out as the test split.
cv = CountVectorizer()
word_counts = cv.fit_transform(corpus)

# Dense feature matrix and the integer target column.
X = word_counts.toarray()
y = df['label']

In [63]:
import pickle

# Persist the fitted vectorizer so the same vocabulary can be reused later.
# The context manager guarantees the file handle is flushed and closed, even
# if pickle.dump raises; a bare open() leaves that to the garbage collector.
# The "cv.pickel" spelling is kept as-is: anything that loads this artifact
# must use the same file name.
with open("cv.pickel", "wb") as cv_file:
    pickle.dump(cv, cv_file)

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=123,
)

In [65]:
# Multinomial Naive Bayes classifier, created with its default hyperparameters.
# (MultinomialNB is already imported in the first cell; this import repeats it.)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

In [66]:
# Fit the classifier on the training split; the fitted estimator is the
# cell's displayed value.
clf.fit(X_train, y_train)

MultinomialNB()

In [67]:
# Score the fitted model on the held-out test split
# (for scikit-learn classifiers, score() is the mean accuracy).
clf.score(X_test,y_test)

0.9796650717703349

In [83]:
# Peek at the held-out feature matrix (the display is abbreviated).
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [84]:
# Predict a label (0 = ham, 1 = spam, per the mapping used for df['label'])
# for every row of the test split, then display the predictions.
y_pred = clf.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 0, 1])

In [75]:
# Per-class precision / recall / F1 on the test split. print() is used so the
# report's line breaks and column alignment are rendered.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1457
           1       0.90      0.94      0.92       215

    accuracy                           0.98      1672
   macro avg       0.95      0.96      0.96      1672
weighted avg       0.98      0.98      0.98      1672



### Saving the model for inference

In [76]:
# Persist the trained classifier. The context manager guarantees the file
# handle is flushed and closed, even if pickle.dump raises; a bare open()
# leaves that to the garbage collector.
# The "NB_spam_model.pickel" spelling is kept as-is: anything that loads this
# artifact must use the same file name.
with open("NB_spam_model.pickel", "wb") as model_file:
    pickle.dump(clf, model_file)

In [70]:
# import joblib
# joblib.dump(clf, 'NB_spam_model.pkl')

['NB_spam_model.pkl']