In [4]:
import numpy as np 
import pandas as pd 

In [46]:
df = pd.read_csv('./sample_data/spam_ham_emails.csv' , encoding='latin-1')

In [47]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [48]:
df.isnull().sum()

label      0
message    0
dtype: int64

In [49]:
len(df)

5572

In [51]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# Text Preprocessing

In [52]:
import string
mess = 'sample message!...'
nopunc=[char for char in mess if char not in string.punctuation]
nopunc=''.join(nopunc)
print(nopunc)

sample message


In [61]:
from nltk.corpus import stopwords
stopwords.words('english')[0:10]


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [62]:
nopunc.split()

['sample', 'message']

In [63]:
clean_mess=[word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [64]:
clean_mess

['sample', 'message']

In [65]:
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [67]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


let's "tokenize" these messages. 
--> Converting the normal text strings in to a list of tokens (words that we actually want).


In [68]:
df['message'].apply(text_process)

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, Ã¥Â£750, ...
5568                  [Ã, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: message, Length: 5572, dtype: object

In [69]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Vectorization + Test Train Split
We will first use SciKit Learn's 

1) TfidfVectorizer  - Transforms text to feature vectors that can be used as input to estimator.

2) we split to test and train.

3) CountVectorizer -  This model will convert a collection of text documents to a matrix of token counts.

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer   #1 
    
vect = TfidfVectorizer()

dtm = vect.fit_transform(df['message'])


In [75]:
from sklearn.model_selection import train_test_split  #2
X = df['message']
y = df['label']
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size = 0.3 , random_state = 42)

In [76]:
from sklearn.feature_extraction.text import CountVectorizer  #3

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Transformation
TfidfTransformer. Transform a count matrix to a normalized tf or tf-idf representation.

In [77]:
from sklearn.feature_extraction.text import  TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3900, 7210)

In [79]:
X_train_tfidf = vect.fit_transform(X_train)

# Model Training - Naive Bayes
With messages represented as vectors, we can finally train our spam/ham classifier.

We will use, the Naive Bayes classifier algorithm as mentioned in our presentation.


In [93]:
X_train.columns = ["label", "message"]

In [94]:
X_train.head()

708     To review and KEEP the fantastic Nokia N-Gage ...
4338                   Just got outta class gonna go gym.
5029    Is there coming friday is leave for pongal?do ...
4921    Hi Dear Call me its urgnt. I don't know whats ...
2592    My friend just got here and says he's upping h...
Name: message, dtype: object

In [97]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train_tfidf, y_train)

In [100]:
from sklearn.pipeline import Pipeline

In [104]:
pipeline= Pipeline([
   ( 'bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB()),
])

In [105]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x7f081b108dd0>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

# Predictions & Evaluation

In [108]:
from sklearn.metrics import confusion_matrix,classification_report

In [109]:
predictions = pipeline.predict(X_test)

In [110]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1518
        spam       0.70      1.00      0.83       154

    accuracy                           0.96      1672
   macro avg       0.85      0.98      0.90      1672
weighted avg       0.97      0.96      0.96      1672



In [111]:
from sklearn import metrics 
metrics.accuracy_score(y_test, predictions)

0.9611244019138756

In [113]:
import pickle
with open('model.pkl', 'wb') as handle:
    pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)