In [1]:
import numpy as np
import pandas as pd

In [8]:
df = pd.read_csv("D:\Jupyter\jupyter notebook\datasets\mail_data.csv")

In [9]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#creating a separate column to represent spam and non-spam in binary format
df['spam'] = df['Category'].apply(lambda x: 0 if x == "ham" else 1)

In [11]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [12]:
# now creating our Target and feature variables
message = df['Message']
spam = df['spam']

In [13]:
#splitting the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(message,spam,test_size = 0.2)

In [14]:
X_train.shape # now we have 4457 training data

(4457,)

In [18]:
# now for changing the dataset into bag-of-words we use count_vectorizer
# it converts the text document into matrix of token counts

from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)  # passing the values of X_train not as dataframe



In [24]:
X_train_cv.toarray() 

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
X_train_cv.shape  # now each message is converted into array of 7734 features

(4457, 7734)

In [27]:
# using multinomial to train the model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [28]:
X_test_cv = v.transform(X_test.values)

In [29]:
pred = model.predict(X_test_cv)

In [31]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       954
           1       0.98      0.95      0.97       161

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [35]:
# now creating a pipeline for that 
from sklearn.pipeline import Pipeline

Clf = Pipeline([('vectorizer',CountVectorizer()),
                ('model',MultinomialNB())])

In [36]:
Clf.fit(X_train,y_train)

In [37]:
pred = Clf.predict(X_test)

In [38]:
# now printing the report of pipeline
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       959
           1       0.95      0.98      0.97       156

    accuracy                           0.99      1115
   macro avg       0.97      0.99      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [42]:
# testing custom mail to verify it's spam or not
mail = ['Hey Mohan, I think we should work together on this project',
        'upto 20% discount on parking, exclusive offer just for you']
Clf.predict(mail)

# 0 for non-spam,
# 1 for spam

array([0, 1], dtype=int64)