In [127]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

### 1. Load the dataset

In [387]:
# SMSSpamCollection

df = pd.read_table("SMSSpamCollection", header=None, encoding="utf-8")

In [388]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [389]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [390]:
df_ham = df[df[0]=="ham"][:1000]
df_spam = df[df[0]=="spam"]

In [391]:
# df_ham
df = pd.concat([df_ham, df_spam], axis=0)

In [392]:
# df

In [393]:
## Check the class distribution

classes = df[0]
print(classes.value_counts())

ham     1000
spam     747
Name: 0, dtype: int64


## 2. Pre-process the data

In [394]:
## convert the class to  binary value, 0=spam, 1= ham

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

Y = encoder.fit_transform(classes)
print(Y[:10])
print(classes[:10])

[0 0 0 0 0 0 0 0 0 0]
0     ham
1     ham
3     ham
4     ham
6     ham
7     ham
10    ham
13    ham
14    ham
16    ham
Name: 0, dtype: object


In [395]:
## Store the sms message data

text_messages = df[1]
text_messages[:10]

0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
3     U dun say so early hor... U c already then say...
4     Nah I don't think he goes to usf, he lives aro...
6     Even my brother is not like to speak with me. ...
7     As per your request 'Melle Melle (Oru Minnamin...
10    I'm gonna be home soon and i don't want to tal...
13    I've been searching for the right words to tha...
14                  I HAVE A DATE ON SUNDAY WITH WILL!!
16                           Oh k...i'm watching here:)
Name: 1, dtype: object

In [396]:
## Regular expression to switch text from message, eg: email, phone, currency, urls

## email address with emailaddr

processed = text_messages.str.replace(r"^.+@[^\.].*\.[a-z]{2,}$", "emailaddr")

## replace urls with webaddrs

processed = processed.str.replace(r"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$", "webaddrs")

## replace currency with moneysymb 
processed = processed.str.replace(r"$", "moneysymb")

## replace phone number with phonenumbr 
processed = processed.str.replace(r"/^[0-9]\d{2,4}-\d{6,8}$/", "phonenumbr")

## replace normal numbers with numbr 
processed = processed.str.replace(r"\d+(\.\d+)?", "numbr")



In [397]:
## Remove punctuation
processed = processed.str.replace(r"[^\w\d\s]", " ")

## Remove spacess
processed = processed.str.replace(r"\s+", " ")

# Leadind and trailing whitespaces

processed = processed.str.replace(r"^\s+|\s+?$", "")


In [398]:
## Changing the word to lower case
processed = processed.str.lower()

In [399]:
## Remove Stop word 

# from nltk.corpus import stopwords
# nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))

from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
# from nltk.tokenize import word_tokenize 

  
# example_sent = "This is a sample sentence, showing off the stop words filtration."
  
stop_words = set(stopwords.words('english'))


# stopwords

In [400]:
tags = Y
texts_ = processed

In [401]:
texts_, tags

(0       go until jurong point crazy available only in ...
 1                                 ok lar joking wif u oni
 3             u dun say so early hor u c already then say
 4       nah i don t think he goes to usf he lives arou...
 6       even my brother is not like to speak with me t...
                               ...                        
 5537    want explicit sex in numbr secs ring numbr now...
 5540    asked numbrmobile if numbr chatlines inclu in ...
 5547    had your contract mobile numbr mnths latest mo...
 5566    reminder from onumbr to get numbr pounds free ...
 5567    this is the numbrnd time we have tried numbr c...
 Name: 1, Length: 1747, dtype: object, array([0, 0, 0, ..., 1, 1, 1]))

In [402]:
texts = []
for x in texts_:
    try:
        review = re.sub('[^a-zA-Z]',  x)  

        # convert all cases to lower cases 
        review = review.lower()  

        # split to array(default delimiter is " ") 
        review = review.split()  

        # creating PorterStemmer object to 
        # take main stem of each word 
        ps = PorterStemmer()  

        # loop for stemming each word 
        # in string array at ith row     
        review = [ps.stem(word) for word in review 
                    if not word in set(stopwords.words('english'))]  

        # rejoin all string array elements 
        # to create back into a string 
        review = ' '.join(review)
        texts.append(review)
    except:
        texts.append(x)

In [403]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from keras import metrics

In [404]:
texts = [str(x) for x in texts]

In [405]:
# texts

In [406]:
num_max = 10
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)

In [407]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(texts,tags, test_size = 0.1,random_state=101)
mat_texts_tr = tok.texts_to_matrix(x_train,mode='count')
mat_texts_tst = tok.texts_to_matrix(x_test,mode='count')

In [408]:
x_train[:3], y_train[:]

(['congrats nokia numbr video camera phone is your call numbr calls cost numbrppm ave call numbrmins vary from mobiles numbr close numbr post bcmnumbr ldn wcnumbrnnumbrxx',
  'on ma way to school can you pls send me ashley s number',
  'money i have won wining number numbr wot do i do next'],
 array([1, 0, 1, ..., 1, 1, 0]))

In [409]:
tags[:]

array([0, 0, 0, ..., 1, 1, 1])

In [410]:
# max_len = 500
# x_train = tok.texts_to_sequences(x_train)
# x_test = tok.texts_to_sequences(x_test)
# cnn_texts_mat = sequence.pad_sequences(x_train,maxlen=max_len)
# max_len = 500
# cnn_texts_mat_tst = sequence.pad_sequences(x_test,maxlen=max_len)

In [411]:
model = Sequential()
model.add(Dense(10, activation='relu', input_shape=(num_max,)))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(5, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(3, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
model.summary()
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['acc',metrics.binary_accuracy])



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_51 (Dense)             (None, 10)                110       
_________________________________________________________________
dropout_38 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_52 (Dense)             (None, 10)                110       
_________________________________________________________________
dropout_39 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_53 (Dense)             (None, 5)                 55        
_________________________________________________________________
dropout_40 (Dropout)         (None, 5)                 0         
_________________________________________________________________
dense_54 (Dense)             (None, 3)                 18        
__________

In [437]:
model.fit(mat_texts_tr,y_train,batch_size=10,epochs=100,verbose=1,validation_split=0.3)

Train on 1100 samples, validate on 472 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


<keras.callbacks.History at 0x7f4a6c365e80>

In [438]:
model.evaluate(mat_texts_tst,y_test)



[0.25716722611870085, 0.9142857146263123, 0.5]

In [439]:
checktext = text_messages[2]

In [440]:
text_messages

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
6       Even my brother is not like to speak with me. ...
                              ...                        
5537    Want explicit SEX in 30 secs? Ring 02073162414...
5540    ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547    Had your contract mobile 11 Mnths? Latest Moto...
5566    REMINDER FROM O2: To get 2.50 pounds free call...
5567    This is the 2nd time we have tried 2 contact u...
Name: 1, Length: 1747, dtype: object

In [445]:
processed = pd.Series(text_messages[-5:]).str.replace(r"^.+@[^\.].*\.[a-z]{2,}$", "emailaddr")
processed = processed.str.replace(r"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$", "webaddrs")
processed = processed.str.replace(r"$", "moneysymb")
processed = processed.str.replace(r"/^[0-9]\d{2,4}-\d{6,8}$/", "phonenumbr")
processed = processed.str.replace(r"\d+(\.\d+)?", "numbr")
processed = processed.str.replace(r"[^\w\d\s]", " ")
processed = processed.str.replace(r"\s+", " ")

processed = processed.str.replace(r"^\s+|\s+?$", "")
processed = processed.str.lower()


texts = []
for x in processed:
    try:
        review = re.sub('[^a-zA-Z]',  x)  
        review = review.lower() 
        review = review.split()  
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review 
                    if not word in set(stopwords.words('english'))]  

        review = ' '.join(review)
        texts.append(review)
    except:
        texts.append(x)
        
texts = [str(x) for x in texts]

In [446]:
num_max = 10
le = LabelEncoder()
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)

In [451]:

check_text_tr = tok.texts_to_matrix(texts,mode='count')

pred = model.predict(check_text_tr, verbose=1)
classes = ["ham", "spam"]
all_result = []
for i,p in enumerate(pred):
    print("Text: ",texts[i])
    print("Predict: ",classes[np.argmax(p)])
    print("------------------")

Text:  want explicit sex in numbr secs ring numbr now costs numbrp min gsex pobox numbr wcnumbrn numbrxx
Predict:  spam
------------------
Text:  asked numbrmobile if numbr chatlines inclu in free mins india cust servs sed yes lnumbrer got mega bill numbr dont giv a shit bailiff due in days i o numbr numbr want numbr
Predict:  spam
------------------
Text:  had your contract mobile numbr mnths latest motorola nokia etc all free double mins text on orange tariffs text yes for callback no to remove from records
Predict:  ham
------------------
Text:  reminder from onumbr to get numbr pounds free call credit and details of great offers pls reply numbr this text with your valid name house no and postcode
Predict:  spam
------------------
Text:  this is the numbrnd time we have tried numbr contact u u have won the numbr pound prize numbr claim is easy call numbr nownumbr only numbrp per minute bt national rate
Predict:  spam
------------------


In [448]:
# check_text_tr

In [435]:
print(text_messages[-2:-1])

5566    REMINDER FROM O2: To get 2.50 pounds free call...
Name: 1, dtype: object


In [420]:
tags

array([0, 0, 0, ..., 1, 1, 1])