In [75]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.
#https://nlpforhackers.io/keras-intro/

/kaggle/input/sms-spam-collection-dataset/spam.csv


In [76]:
from keras.utils.vis_utils import plot_model

In [77]:
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from keras.models import Sequential
from keras.layers import Dense,LSTM, Embedding

In [78]:
def clean_review(text):
    """Return ``text`` with HTML-like tags and double quotes removed.

    Every ``<...>`` tag is replaced by a single space. Backslash-escaped
    double quotes are deleted next (backslash included), and finally any
    remaining plain double quotes are deleted.
    """
    without_tags = re.sub('<[^<]+?>', ' ', text)
    # Remove the escaped form first so the backslash disappears with its quote.
    return without_tags.replace('\\"', '').replace('"', '')

In [79]:
# Load the SMS spam CSV; the file is decoded as latin1 instead of the pandas default.
data = pd.read_csv(
    "../input/sms-spam-collection-dataset/spam.csv",
    encoding="latin1",
)

In [80]:
# Report the frame's (rows, columns), then preview the first five rows.
print(data.shape, end="\n")
data.head(n=5)

(5572, 5)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [81]:
# Discard the three "Unnamed" columns; only v1 (label) and v2 (message) remain.
data = data.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [82]:
# Class balance of the raw string labels.
data["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [83]:
# Replace the string labels in v1 with integer codes. LabelEncoder numbers the
# classes in sorted order, so "ham" becomes 0 and "spam" becomes 1.
data["v1"] = LabelEncoder().fit_transform(data["v1"])
# Only the last expression of a cell is displayed, so the former mid-cell
# `data.head()` had no effect and was removed.
data["v1"].value_counts()


0    4825
1     747
Name: v1, dtype: int64

In [84]:
# Strip markup and quotes from every message before vectorising.
data['v2'] = data['v2'].apply(clean_review)

# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so the split (and every metric computed
#   from it) is identical on each Restart & Run All.
# - stratify keeps the ham/spam ratio the same in both splits, which matters
#   because the classes are heavily imbalanced.
# NOTE: `x_text` holds the *test* messages; the name is kept as-is because
# later cells refer to it.
x_train, x_text, y_train, y_test = train_test_split(
    data['v2'],
    data['v1'],
    test_size=0.2,
    random_state=42,
    stratify=data['v1'],
)

### Count vectorizer

In [85]:
# Binary bag-of-words features (a term is either present or absent in a message).
# Text is lower-cased and English stop words are dropped; terms appearing in
# fewer than 3 messages or in more than 90% of them are discarded, and the
# vocabulary is capped at 5000 terms.
cntvec = CountVectorizer(
    lowercase=True,
    stop_words=stopwords.words('english'),
    binary=True,
    min_df=3,
    max_df=0.9,
    max_features=5000,
)
# Learn the vocabulary from the training messages only and vectorise them.
x_train_onehot = cntvec.fit_transform(x_train)

In [86]:
# Number of terms kept by the vectorizer. `vocabulary_` (term -> column index)
# is used because `get_feature_names()` has been removed from recent
# scikit-learn releases, while `vocabulary_` exists in every version.
len(cntvec.vocabulary_)

2291

### Keras Simple
Here’s how to create a simple two-layer network. The first layer (which actually comes after an input layer) is called the hidden layer, and the second one is called the output layer.

In [87]:
# Feed-forward classifier over the bag-of-words vectors: one hidden ReLU layer
# of 500 units followed by a single sigmoid output unit.
model = Sequential()
# The input width equals the vocabulary size. `vocabulary_` is used instead of
# `get_feature_names()`, which has been removed from recent scikit-learn.
model.add(Dense(units=500,
                activation='relu',
                input_dim=len(cntvec.vocabulary_)))

model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
#plot_model(model, show_shapes=True, show_layer_names=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 500)               1146000   
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 501       
Total params: 1,146,501
Trainable params: 1,146,501
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Train on all but the last 100 training rows; those last 100 rows serve as
# the validation set reported after each epoch.
model.fit(
    x=x_train_onehot[:-100],
    y=y_train[:-100],
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(x_train_onehot[-100:], y_train[-100:]),
)

Train on 4357 samples, validate on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f717b191160>

In [89]:
# Vectorise the held-out messages with the vocabulary fitted on the training
# set, then score the model. evaluate() returns the loss followed by the
# compiled metrics, so index 1 is the accuracy.
scores = model.evaluate(x=cntvec.transform(x_text), y=y_test, verbose=1)
print("Accuracy:", scores[1])

Accuracy: 0.9901345291479821


### LSTM

In [90]:
# Display the tokenizer callable used by the fitted vectorizer. It is built
# here directly because the name `tokenize` is only assigned in a later cell,
# so referencing it here fails with NameError on a fresh kernel.
cntvec.build_tokenizer()

<function sklearn.feature_extraction.text.VectorizerMixin.build_tokenizer.<locals>.<lambda>(doc)>

In [91]:
# Term -> column-index lookup. The fitted `vocabulary_` already holds this
# mapping (the same one enumerate(get_feature_names()) produced) and, unlike
# `get_feature_names()`, is available in every scikit-learn version. int()
# normalises NumPy integer values to plain Python ints.
word2idx = {word: int(idx) for word, idx in cntvec.vocabulary_.items()}
# Reuse the vectorizer's own preprocessing and tokenisation so sequences are
# built from exactly the same tokens as the bag-of-words features.
tokenize = cntvec.build_tokenizer()
preprocess = cntvec.build_preprocessor()
 
def to_sequence(tokenizer, preprocessor, index, text):
    """Convert ``text`` into the list of indices of its known tokens.

    ``preprocessor`` is applied to the raw text and ``tokenizer`` splits the
    result into tokens. Tokens present in ``index`` are replaced by their
    mapped value, in order of appearance; tokens missing from ``index`` are
    skipped.
    """
    indexes = []
    for token in tokenizer(preprocessor(text)):
        if token in index:
            indexes.append(index[token])
    return indexes
 
# Smoke test on a sample sentence. The indices printed depend on the fitted
# vocabulary; words outside it are dropped.
print(to_sequence(tokenize, preprocess, word2idx, "This is an important test!"))
# Encode every training message as a list of vocabulary indices.
X_train_sequences = [
    to_sequence(tokenize, preprocess, word2idx, message) for message in x_train
]
print(X_train_sequences[0])

[1014, 1970]
[1417, 924, 305, 1156]


In [92]:
# Compute the max lenght of a text
MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGHT=", MAX_SEQ_LENGHT)
 
from keras.preprocessing.sequence import pad_sequences
N_FEATURES = len(cntvec.get_feature_names())
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)
print(X_train_sequences[0])
print(X_train_sequences[1000])
 

MAX_SEQ_LENGHT= 59
[2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291
 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291
 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291
 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 1417
  924  305 1156]
[2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291
 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291
 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291 2291
 2291 2291  814 2032 1530 2028  394 1358 1600  860 1743  711 2183 1971
 1475 2096   25]


In [93]:
# Sequence classifier: embedding -> LSTM -> single sigmoid output unit.
model = Sequential()
# The embedding table needs one row per vocabulary index plus one extra row
# for the padding index (N_FEATURES) used when the sequences were padded.
model.add(Embedding(input_dim=N_FEATURES + 1,
                    output_dim=64,  # Embedding size
                    input_length=MAX_SEQ_LENGHT))
model.add(LSTM(64))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 59, 64)            146688    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 65        
Total params: 179,777
Trainable params: 179,777
Non-trainable params: 0
_________________________________________________________________
None


In [97]:
# Train on all but the last 100 padded training sequences; those last 100
# sequences serve as the validation set reported after each epoch.
model.fit(
    x=X_train_sequences[:-100],
    y=y_train[:-100],
    batch_size=128,
    epochs=5,
    verbose=1,
    validation_data=(X_train_sequences[-100:], y_train[-100:]),
)

Train on 4357 samples, validate on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f717ae182b0>

In [98]:
# Encode the held-out messages with the same vocabulary lookup, then pad them
# to the training length with the same padding index.
X_test_sequences = pad_sequences(
    [to_sequence(tokenize, preprocess, word2idx, message) for message in x_text],
    maxlen=MAX_SEQ_LENGHT,
    value=N_FEATURES,
)
 

In [99]:
# Score the LSTM on the held-out sequences. evaluate() returns the loss
# followed by the compiled metrics, so index 1 is the accuracy
# (about 0.989 on the recorded run).
scores = model.evaluate(x=X_test_sequences, y=y_test, verbose=1)
print("Accuracy:", scores[1])

Accuracy: 0.989237668161435
