**Import Libraries**

In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout,GRU , Flatten,Input,Activation
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np 
import keras.utils as ku
from keras.models import Sequential
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [0]:
#pip install -U keras
#pip install --upgrade gensim

**Import Dataframe**

In [0]:
# Load the tab-separated, header-less SMS file; the columns are named in a later cell.
# NOTE(review): error_bad_lines=False asks pandas to drop malformed rows instead of raising,
# so rows can vanish silently. The argument was deprecated in pandas 1.3 and removed in 2.0
# (replacement: on_bad_lines="skip") - confirm the pinned pandas version before upgrading.
df= pd.read_csv("SMSSpam.csv",sep='\t',header=None, error_bad_lines=False)

In [4]:
 # Rename Header

# Map the positional column labels 0 and 1 to descriptive names, then display the frame.
df = df.rename(  columns={0: "label", 1: "msg"})   
df

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Inspect column dtypes and non-null counts before encoding the labels.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   msg     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [0]:
# Convert the string labels to a pandas categorical so they can be integer-coded next.
df["label"] = df["label"].astype('category')

In [0]:
# Replace each category with its integer code. In the displayed rows, "ham" becomes 0
# and "spam" becomes 1.
df["label"] = df["label"].cat.codes

In [8]:
# Confirm that the label column now holds integer codes.
df.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


**Text Cleaning Function**

In [0]:
import string, os
def clean_text(txt):
    """Normalise one message string before tokenisation.

    Removes every character found in ``string.punctuation``, lower-cases the
    remainder, then discards every non-ASCII character.

    Punctuation is deleted rather than replaced by a space, so words that were
    separated only by punctuation end up joined together.

    Args:
        txt: The raw message text.

    Returns:
        The cleaned, lower-cased, ASCII-only text.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    # Encode to UTF-8 bytes and decode as ASCII with errors ignored: any
    # character outside the ASCII range is dropped.
    return lowered.encode("utf8").decode("ascii", "ignore")

In [0]:
# Pull the raw messages out of the frame and clean each one.
msg_list = df["msg"].tolist()
corpus = list(map(clean_text, msg_list))

In [11]:
# Preview the first five cleaned messages.
corpus[:5]

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s',
 'u dun say so early hor u c already then say',
 'nah i dont think he goes to usf he lives around here though']

In [12]:
# Tokenization: split every cleaned message on single spaces.
# Splitting on " " (not on arbitrary whitespace) yields an empty-string token
# wherever two spaces are adjacent.

token = [sentence.split(" ") for sentence in corpus]
len(token)

5572

In [13]:
# Number of tokens in the longest tokenized message.

max2 = max(len(tokens) for tokens in token)
print(max2)

171


In [0]:
# Right-pad every token list in place with the literal token "Null" so that
# all rows end up with exactly `max2` tokens.
# A row is fully padded in one step, so no per-token inner loop is needed;
# rows that already have `max2` tokens are left untouched.

for row in token:
    shortfall = max2 - len(row)
    if shortfall > 0:
        row.extend(["Null"] * shortfall)


**Train a Word2Vec Model**

In [0]:
import gensim 
from gensim.models import Word2Vec 

In [0]:
# Train Word2Vec on the padded token lists, producing 150-dimensional word vectors.
# The padded lists include the "Null" filler token, so it is part of the training data.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size` -
# confirm the installed gensim version before upgrading.
# NOTE(review): no seed is passed, so the vectors presumably differ between runs - verify.
model_vec = Word2Vec(token , size=150)

In [17]:
# Collect the vocabulary learned by Word2Vec.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 exposes the vocabulary
# as `wv.key_to_index` - confirm the installed gensim version before upgrading.
words = list(model_vec.wv.vocab)
print(len(words))   # vocabulary size

1831


In [0]:
# Zero-initialised float32 tensor of shape (number of messages, max2, 150):
# one 150-dimensional vector per token position of every message.
embed = np.zeros((len(token),max2,150) ,dtype = np.float32)   # Embedded Word Matrix

In [0]:
# Fill the embedding tensor: position (row, col) receives the Word2Vec vector
# of the col-th token of message `row`. Tokens missing from the Word2Vec
# vocabulary fall back to the vector of the 'Null' token.
# Only the message's own tokens are visited (the un-padded `corpus` is split
# again here), so positions past the end of a message keep their zero vectors.
for row, sentence in enumerate(corpus):
    for col, word in enumerate(sentence.split(" ")):
        try:
            vector = model_vec.wv[word]
        except KeyError:
            vector = model_vec.wv['Null']
        embed[row, col] = vector
    

In [20]:
# Display the learned vector of the 'Null' token, which is also the fallback
# used for out-of-vocabulary words when filling `embed`.
model_vec.wv['Null']

array([ 0.19803025, -0.04943343, -0.11577769, -0.49531892, -0.74189734,
        1.1401322 ,  0.78172415, -1.0760581 , -0.74337864, -0.18599454,
        0.21727067, -0.83854115,  0.18502688,  0.04029225,  0.25940868,
        0.15489292, -0.21173953, -0.4675542 , -0.49507388, -0.89451367,
        0.39394048,  0.57299936, -0.15609244,  0.49786124,  0.7967522 ,
        0.2826063 , -0.07513314,  0.8214669 , -0.37990174,  0.13561407,
        0.39489883, -0.8111563 , -1.3008857 , -1.3321834 ,  1.9850266 ,
        0.34333277, -0.12024029,  0.01970369, -0.8838773 ,  0.67437106,
        0.8209392 ,  0.6365548 , -1.9486494 ,  0.35969758,  0.50224406,
        0.8282766 , -0.2811917 ,  0.30898067,  0.97236764, -0.36320263,
       -1.8477103 , -0.51760256,  0.0196229 ,  2.082526  , -1.4711035 ,
       -0.68388146,  0.59958476, -0.79380107,  0.12357409, -1.0096859 ,
       -0.136771  ,  1.7882782 , -1.1662852 ,  1.1101652 , -2.4116282 ,
        0.9214607 ,  1.4276282 , -2.296773  , -1.1139022 ,  0.62

In [0]:
# Labels as a column vector with one row per message.
y = df["label"].values.reshape(-1, 1)

# Hold out 30% of the messages for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    embed,
    y,
    test_size=0.30,
    random_state=42,
)

In [22]:
# Shape of a single training sample: (token positions, embedding dimensions).
x_train[0].shape

(171, 150)

In [23]:
# Shape of the full embedding tensor: (messages, token positions, embedding dimensions).
embed.shape

(5572, 171, 150)

**Build and Train the LSTM Classifier**

In [0]:
# Stacked-LSTM binary classifier over the embedded token sequences.
# NOTE(review): the two intermediate Dense layers are given no activation, and
# the first one is as wide as the Word2Vec vocabulary - confirm this is intended.
model = Sequential([
    # First recurrent layer keeps the full sequence so a second LSTM can follow.
    LSTM(150, return_sequences=True, dropout=0.3),
    # Per-timestep projection with one unit per vocabulary word.
    Dense(units=len(words)),
    # Second recurrent layer returns only its final output.
    LSTM(64, dropout=0.2),
    Dense(32),
    # Single sigmoid unit for the binary label.
    Dense(1, activation='sigmoid'),
])

In [0]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
# Train for 15 epochs.
# NOTE(review): the held-out test split is passed as validation data here and is also
# the set scored by model.evaluate later, so no separate validation set exists.
model.fit(x_train, y_train,epochs=15,validation_data=(x_test, y_test) ,verbose=1)

Train on 3900 samples, validate on 1672 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7f64fb0bd358>

In [28]:
# Print layer output shapes and parameter counts (available once the model has been fitted).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 171, 150)          180600    
_________________________________________________________________
dense_1 (Dense)              (None, 171, 1831)         276481    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                485376    
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 944,570
Trainable params: 944,570
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Score the trained model on the held-out split: `score` is the loss, `acc` the accuracy.
# NOTE(review): compare this accuracy with the share of the majority class in y_test -
# on an imbalanced dataset a model that always predicts one class can score similarly. Verify.
score, acc = model.evaluate(x_test, y_test ,verbose=1)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3941223629924099
Test accuracy: 0.8660287261009216


 **Thank You**