In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
import tensorflow
import keras

In [2]:
msgs = pd.read_csv('spam1.csv', encoding='cp1252')

In [3]:
msgs.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [4]:
msgs.isnull().sum()

v1               0
v2               0
Unnamed: 2    6720
Unnamed: 3    6760
Unnamed: 4    6768
dtype: int64

In [5]:
msgs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6776 entries, 0 to 6775
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          6776 non-null   object
 1   v2          6776 non-null   object
 2   Unnamed: 2  56 non-null     object
 3   Unnamed: 3  16 non-null     object
 4   Unnamed: 4  8 non-null      object
dtypes: object(5)
memory usage: 264.8+ KB


In [6]:
msgs.shape

(6776, 5)

In [7]:
msgs = msgs.iloc[:,[0,1]]

In [8]:
msgs.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
msgs.rename(columns={'v1':'Label', 'v2':'Message'}, inplace=True)

In [10]:
msgs.replace({'Label':{'ham':0,'spam':1}}, inplace=True)

In [11]:
msgs.Message = msgs.Message.str.lower()

In [12]:
msgs.head()

Unnamed: 0,Label,Message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
msgs_train, msgs_test = train_test_split(msgs, test_size = 0.2)

In [15]:
msgs_train.shape

(5420, 2)

In [16]:
msgs_train_y = msgs_train.iloc[:,0]
msgs_test_y = msgs_test.iloc[:,0]

In [17]:
msgs_train_x = msgs_train.iloc[:,1]
msgs_test_x = msgs_test.iloc[:,1]

In [18]:
msgs_test_x.shape

(1356,)

In [19]:
msgs_train_y

1935    0
2126    0
6666    1
6194    0
1157    0
6241    0
1581    0
1472    0
1318    0
4894    0
4005    0
1115    0
713     0
874     0
4447    0
1748    1
4823    0
3507    0
4833    1
2760    0
1873    0
5396    0
1668    0
6160    0
854     0
1196    1
6111    0
2869    0
3638    0
4224    0
5542    0
1008    0
3177    0
4800    1
3484    0
2058    0
5417    0
22      0
3191    0
3675    0
6224    0
4029    0
5823    0
3773    0
3050    0
1752    0
2448    0
1620    0
658     1
4475    0
4795    0
5768    0
5515    0
3867    1
5510    0
2841    1
466     0
617     0
2357    1
555     0
2383    0
355     0
38      0
4393    0
1627    1
1279    0
1995    0
5483    1
3710    0
3869    0
6460    0
3373    1
4283    0
5607    0
4581    0
1782    0
4818    0
5447    0
3496    0
2336    0
1459    0
662     0
4478    0
684     0
4183    0
2731    0
582     1
5411    0
3658    0
4196    0
3012    0
3250    0
622     0
2817    0
6092    0
2159    0
5021    1
3804    0
4706    0
4692    0


In [20]:
from tensorflow.keras.utils import to_categorical    # make data 2D from 1D

In [21]:
msgs_train_y = to_categorical(msgs_train_y)     
msgs_train_y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [22]:
#One hot encoding : add columns using each record to convert data into numeric type (offers an alternative to label encoder)

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, LSTM, Embedding
from tensorflow.keras.optimizers import Adam

In [24]:
max_num_words = 10000
# how many unique words you need to consider from the entire corpus/ document

seq_len = 75
# words which will be passed in batches

embedding_size = 100
# embedding is to find similar type of 100 words

# ALL 3 ARE HYPERPARAMETER

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer  # is used to assign a number or a token for each word
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
tokenizer = Tokenizer(num_words=max_num_words)

In [27]:
tokenizer.fit_on_texts(msgs.Message)
msgs_train_x = tokenizer.texts_to_sequences(msgs_train_x)
msgs_train_x = pad_sequences(msgs_train_x, maxlen = seq_len)

In [28]:
msgs_train_x[475]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0, 1884,   17,    4,  405,   30,    5, 1302, 1885,
          8,  721, 2172, 1115,   25,    4,  508, 2575,   36, 2173, 3216,
          4, 1302, 1885, 3217, 2174,    3,   34,    4,  508])

In [29]:
msgs_test_x = tokenizer.texts_to_sequences(msgs_test_x)
msgs_test_x = pad_sequences(msgs_test_x, maxlen = seq_len)

In [30]:
msgs_test_x[5]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,  57, 177, 484,  15,  11, 350])

In [31]:
model = Sequential()                             # to initialize the model
model.add(Embedding(input_dim = max_num_words, 
                    input_shape = (seq_len,),
                    output_dim = embedding_size))
                

  super().__init__(**kwargs)


In [32]:
model.add(LSTM(3))
model.add(Dense(2, activation='softmax'))

adam = Adam(learning_rate = 0.001)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [33]:
model.fit(msgs_train_x, msgs_train_y, epochs=7, validation_split=.2)

Epoch 1/7
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 40ms/step - accuracy: 0.8756 - loss: 0.5425 - val_accuracy: 0.9659 - val_loss: 0.2940
Epoch 2/7
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.9786 - loss: 0.2422 - val_accuracy: 0.9834 - val_loss: 0.1492
Epoch 3/7
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.9936 - loss: 0.1127 - val_accuracy: 0.9908 - val_loss: 0.0908
Epoch 4/7
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.9946 - loss: 0.0617 - val_accuracy: 0.9899 - val_loss: 0.0686
Epoch 5/7
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.9986 - loss: 0.0373 - val_accuracy: 0.9899 - val_loss: 0.0580
Epoch 6/7
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.9989 - loss: 0.0261 - val_accuracy: 0.9889 - val_loss: 0.0567
Epoch 7/7
[1m136/136[0m [

<keras.src.callbacks.history.History at 0x2287e104130>

In [34]:
pred = model.predict(msgs_test_x)
pred

[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step


array([[0.9456103 , 0.0543897 ],
       [0.9897782 , 0.01022178],
       [0.98910517, 0.0108948 ],
       ...,
       [0.989319  , 0.01068098],
       [0.02641934, 0.9735807 ],
       [0.98739713, 0.01260286]], dtype=float32)

In [35]:
len(pred)

1356

In [36]:
len(msgs_test_x)

1356

In [37]:
pred_val = np.argmax(pred, axis=1)
pred_val

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [39]:
cm = confusion_matrix(msgs_test_y, pred_val)
cm

array([[1179,    0],
       [   6,  171]], dtype=int64)

In [40]:
accuracy_score(msgs_test_y, pred_val)*100

99.5575221238938

In [41]:
model.summary()