## Spam mail detector using Multinomial Naive Bayes Classifier

In [1]:
import pandas as pd

In [2]:
# Location of the spam dataset CSV (it provides the Category and Message columns).
# NOTE: this is a user-specific path; change it here if the file lives elsewhere.
SPAM_CSV_PATH = '~/my_code/applied_ai_notebooks/datasets/spam.csv'
df = pd.read_csv(SPAM_CSV_PATH)

In [3]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Summary statistics of the other columns, computed separately per Category value.
by_category = df.groupby('Category')
by_category.describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [5]:
# Binary target column: 1 when Category is 'spam', 0 for everything else.
# A vectorized comparison replaces the row-by-row apply/lambda; the resulting
# values and integer dtype are the same, without a Python call per row.
df['spam'] = (df['Category'] == 'spam').astype('int64')

In [6]:
# Display the DataFrame again to confirm the `spam` column was added.
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [7]:
# Split the messages and labels into train and test sets (20% held out for testing).
# random_state is fixed so the split, and every score computed from it, is
# reproducible after Restart Kernel -> Run All; without it each run shuffles differently.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.2, random_state=42
)

In [8]:
# Inspect the training messages produced by the split.
X_train

2985    He said that he had a right giggle when he saw...
4633                   I'm in a meeting, call me later at
4133    Hi baby ive just got back from work and i was ...
5474    Where's mummy's boy ? Is he being good or bad ...
1725              There bold 2  &lt;#&gt; . Is that yours
                              ...                        
3313    I know you are serving. I mean what are you do...
796      it's really getting me down just hanging around.
3765    Would me smoking you out help us work through ...
1997                     Lol boo I was hoping for a laugh
2514    U have won a nokia 6230 plus a free digital ca...
Name: Message, Length: 4457, dtype: object

In [9]:
# Models work on numbers, so the message text must first be converted into numeric features

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer with default settings; used in the following cells to turn
# the message text into numeric count features.
vect = CountVectorizer()

In [11]:
# Fit the vectorizer on the training messages and transform them in the same step.
x_train_count = vect.fit_transform(X_train.values)

In [12]:
from sklearn.naive_bayes import MultinomialNB
# Train a Multinomial Naive Bayes classifier (default settings) on the
# vectorized training messages and their spam labels.
model = MultinomialNB()
model.fit(x_train_count, y_train)

MultinomialNB()

In [13]:
# Reuse the already-fitted vectorizer: transform only, so the test messages
# are not used to fit it.
x_test_count = vect.transform(X_test)

In [14]:
# Evaluate the classifier on the held-out test set (score is mean accuracy).
model.score(x_test_count,y_test)

0.9865470852017937

## Let's do the same thing using a pipeline

We can avoid the separate vectorizer fit/transform steps by chaining the vectorizer and the classifier with `sklearn.pipeline.Pipeline`.

In [15]:
from sklearn.pipeline import Pipeline

# Two named stages, applied in order: text vectorization, then the classifier.
pipeline_steps = [
    ('countv', CountVectorizer()),
    ('nbmodel', MultinomialNB()),
]
pipe = Pipeline(pipeline_steps)

In [16]:
# Fitting the pipeline sends the raw training messages through the
# CountVectorizer step first, then fits the MultinomialNB step on the result.
pipe.fit(X_train, y_train)

Pipeline(steps=[('countv', CountVectorizer()), ('nbmodel', MultinomialNB())])

In [17]:
# Score the pipeline directly on the raw test messages; vectorization happens
# inside the pipeline.
pipe.score(X_test, y_test)

0.9865470852017937

In [24]:
# Predicted labels for the raw test messages, produced by the fitted pipeline.
y_pred = pipe.predict(X_test)

In [25]:
# First ten true labels from the test split.
y_test.head(10)

5565    0
4384    0
299     0
5117    0
5089    0
1350    1
3596    0
4091    1
3818    0
910     1
Name: spam, dtype: int64

In [26]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the pipeline's predictions on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       965
           1       0.98      0.92      0.95       150

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [27]:
# Sample message text. No use of `message` is visible in this part of the
# notebook; presumably it is meant for a manual prediction (TODO confirm).
message = "Congratulations to our Master of the Year Tournament winners! Help us celebrate:  @wilmanaro, @jcervay, @ orient_arts, @annaradchenko1 and @universalsouls_art  for winning our Master of the Year Tournament. You can check out their winning images below!"

In [30]:
# Display the held-out test messages.
X_test

5565                                         Huh y lei...
4384    No need lar i go engin? Cos my sis at arts tod...
299     I cant pick the phone right now. Pls send a me...
5117    Aslamalaikkum....insha allah tohar beeen muht ...
5089                      What type of stuff do you sing?
                              ...                        
2143                      You see the requirements please
5302    About  &lt;#&gt; bucks. The banks fees are fix...
1073    Dear U've been invited to XCHAT. This is our f...
912                               I am sorry it hurt you.
1814                             Yes we are chatting too.
Name: Message, Length: 1115, dtype: object