In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Flatten


import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("/content/drive/MyDrive/NLP(Classes)/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,not spam,"Go until jurong point, crazy.. Available only ..."
1,not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,not spam,U dun say so early hor... U c already then say...
4,not spam,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Feature and target
X = df["Message"]
y = df["Category"]

**DIVIDING INTO TRAINING AND TESTING DATA**

In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

**TRAIN DATA PRE-PROCESSING**

In [5]:
# Tokenization

tok = Tokenizer()
tok.fit_on_texts(X_train)

In [6]:
# vocabulary
tok.index_word

{1: 'i',
 2: 'to',
 3: 'you',
 4: 'a',
 5: 'the',
 6: 'u',
 7: 'and',
 8: 'is',
 9: 'in',
 10: 'me',
 11: 'my',
 12: 'for',
 13: 'your',
 14: 'it',
 15: 'of',
 16: 'call',
 17: 'have',
 18: 'that',
 19: 'on',
 20: '2',
 21: 'are',
 22: 'now',
 23: 'so',
 24: 'but',
 25: 'not',
 26: 'can',
 27: 'if',
 28: 'or',
 29: 'ur',
 30: 'at',
 31: 'with',
 32: 'get',
 33: 'do',
 34: 'will',
 35: 'be',
 36: "i'm",
 37: 'no',
 38: 'just',
 39: 'this',
 40: 'we',
 41: '4',
 42: 'when',
 43: 'up',
 44: 'ok',
 45: 'go',
 46: 'from',
 47: 'gt',
 48: 'lt',
 49: 'how',
 50: 'out',
 51: 'free',
 52: 'all',
 53: 'what',
 54: 'then',
 55: 'got',
 56: 'good',
 57: 'like',
 58: 'know',
 59: 'come',
 60: 'time',
 61: 'its',
 62: 'am',
 63: 'was',
 64: 'only',
 65: 'day',
 66: 'love',
 67: 'want',
 68: 'text',
 69: 'he',
 70: 'there',
 71: 'send',
 72: 'by',
 73: 'going',
 74: 'as',
 75: 'ü',
 76: 'about',
 77: 'today',
 78: "i'll",
 79: 'txt',
 80: 'one',
 81: 'need',
 82: 'stop',
 83: 'back',
 84: 'lor',
 85:

In [7]:
vocab_len = len(tok.index_word)
vocab_len

7382

In [8]:
# text to sequence
train_sequence = tok.texts_to_sequences(X_train)

In [9]:
# document length
doc_len=[]
for doc in train_sequence:
    doc_len.append(len(doc))    

In [10]:
max(doc_len)

189

In [11]:
np.quantile(doc_len,0.99)

51.00999999999976

In [12]:
max_len = 51

In [13]:
# padding
train_matrix = sequence.pad_sequences(train_sequence,maxlen=max_len)

In [14]:
train_matrix

array([[   0,    0,    0, ...,  111,  500, 1013],
       [   0,    0,    0, ...,   74,   13, 3494],
       [   0,    0,    0, ...,   52,   39,  850],
       ...,
       [   0,    0,    0, ...,  121,  741, 7381],
       [   0,    0,    0, ..., 1790, 7382, 1919],
       [   0,    0,    0, ...,  267,   31,   10]], dtype=int32)

**TEST DATA PRE-PROCESSING**

In [15]:
# Test data preprocessing
test_sequence = tok.texts_to_sequences(X_test)
test_matrix = sequence.pad_sequences(test_sequence,maxlen=max_len)

In [16]:
test_matrix

array([[   0,    0,    0, ...,   72,    5,  719],
       [   0,    0,    0, ...,  142,   10, 1592],
       [   0,    0,    0, ..., 5282, 2962,   69],
       ...,
       [   0,    0,    0, ...,    0,  205, 1753],
       [   0,    0,    0, ...,  171,   12,    5],
       [   0,    0,    0, ...,   78,   16,   90]], dtype=int32)

**Neural Network**

In [17]:
model = Sequential()
model.add(Embedding(vocab_len+1,50,input_length=max_len,mask_zero=True))
model.add(Flatten())
model.add(Dense(64,activation="tanh"))
model.add(Dense(64,activation="tanh"))
model.add(Dense(1,activation="sigmoid"))

In [18]:
model.compile(optimizer="adam",loss="binary_crossentropy")

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [20]:
model.fit(train_matrix,y_train,epochs=5,batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe762a91250>

In [21]:
y_pred = model.predict(test_matrix)

In [22]:
y_pred = np.where(y_pred >= 0.5,1,0)

In [23]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1442
           1       0.99      0.92      0.95       230

    accuracy                           0.99      1672
   macro avg       0.99      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672

