#### Importing the required library

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
import sklearn.metrics as m

#### Downloading required packages for nltk

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

#### Loading the data

In [3]:
data=pd.read_csv('spam.csv',encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Dropping unnecessary columns

In [4]:
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

In [5]:
data.shape

(5572, 2)

#### Printing SMS column and labels column

In [6]:
sms_body = data.iloc[:,[1]]['v2']
sms_body

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [7]:
labels = data.iloc[:,[0]]['v1']
labels

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [8]:
from sklearn.preprocessing import LabelEncoder

#### Applying `LabelEncoder` to converting the labels into a numeric form so as to convert them into the machine-readable form. Machine learning algorithms can then decide in a better way how those labels must be operated. 

In [9]:
le=LabelEncoder()
labels=le.fit_transform(labels)

#### Counting the number of classes in the Label column

In [10]:
le.classes_

array(['ham', 'spam'], dtype=object)

In [11]:
labels

array([0, 0, 1, ..., 0, 0, 0])

In [12]:
from keras.utils.np_utils import to_categorical
labels=to_categorical(labels)

In [13]:
labels

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [14]:
import re

#### Using `WordNetLemmatizer` to establish structured semantic relationships between words

In [15]:
lemma=WordNetLemmatizer()

#### Convering text in lowercase

In [16]:
sentences=[]
for j in range(0,len(sms_body)):
  s=re.sub('[^A-Za-z]',' ',sms_body[j])
  s=s.lower()
  words=word_tokenize(s)
  words=[lemma.lemmatize(i) for i in words if i not in stopwords.words('english')]
  s=' '.join(words)
  sentences.append(s)

In [17]:
sentences

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gon na home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw

#### Importing `one_hot` encoder to ensures that machine learning does not assume that higher numbers are more important. For example, the value '8' is bigger than the value '1', but that does not make '8' more important than '1'.

In [18]:
from keras.preprocessing.text import one_hot

In [19]:
one_hot_sent=[one_hot(i,10000) for i in sentences]

In [20]:
maxi=[len(i) for i in one_hot_sent]

In [21]:
max(maxi)

77

#### Importing `pad_sequences` to ensure that all sequences in a list have the same length.

In [22]:
from keras.utils import pad_sequences

In [23]:
padsequences=pad_sequences(one_hot_sent,maxlen=80)

In [24]:
padsequences

array([[   0,    0,    0, ..., 5779, 9644,  999],
       [   0,    0,    0, ..., 5314, 9953, 8739],
       [   0,    0,    0, ..., 4523, 7501, 9330],
       ...,
       [   0,    0,    0, ..., 9655,    3, 6589],
       [   0,    0,    0, ..., 8060, 9953, 2029],
       [   0,    0,    0, ..., 8818, 4403, 7783]], dtype=int32)

In [25]:
feature_train,feature_test,label_train,label_test=train_test_split(padsequences,labels,test_size=0.2,random_state=7)

#### Since LSTMs can learn long-term connections between data time steps, they are frequently used to learn, analyse, and categorise sequential data. This is why I chose the LSTM model in this situation. Additionally, unlike traditional feed-forward neural networks, LSTM incorporates feedback connections.

In [26]:
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,Dropout,Flatten

In [27]:
model=Sequential()
model.add(Embedding(10000,64,input_length=80))
model.add(LSTM(100))
model.add(Dense(2,activation='sigmoid'))
model.compile(loss='binary_crossentropy',metrics=['accuracy'],optimizer='adam')

In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 64)            640000    
                                                                 
 lstm (LSTM)                 (None, 100)               66000     
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 706,202
Trainable params: 706,202
Non-trainable params: 0
_________________________________________________________________


#### Fitting the LSTM model with 20 epochs

In [29]:
model.fit(feature_train,label_train,epochs=20,validation_data=(feature_test,label_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2a2cd6e0a0>

In [30]:
label_pred=model.predict(feature_test)



In [31]:
label_pred

array([[9.9999636e-01, 3.9036404e-06],
       [9.7993845e-01, 1.4308366e-02],
       [9.9999803e-01, 1.9963663e-06],
       ...,
       [9.9999619e-01, 4.2937531e-06],
       [9.9999535e-01, 4.6874529e-06],
       [9.9999791e-01, 2.1533333e-06]], dtype=float32)

In [32]:
label_pred_=[np.argmax(i,axis=0) for i in label_pred]

In [33]:
label_pred_[0:5]

[0, 0, 0, 0, 0]

In [34]:
label_test_=[np.argmax(i,axis=0) for i in label_test]

In [35]:
label_test_[0:5]

[0, 0, 0, 0, 0]

#### using test data and expected data to forecast the accuracy score

In [36]:
m.accuracy_score(label_test_,label_pred_)

0.9847533632286996

#### Printing the `classification_report` where `precision` measures how near the calculated results are to one another. `recall` is  the ratio between the numbers of Positive samples correctly classified as Positive to the total number of Positive samples. `f1-score` measure performance of binary classification, but extensions to multi-class classifications exist. And `support` is the total no of element.

In [37]:
print(m.classification_report(label_test_,label_pred_))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       970
           1       0.96      0.92      0.94       145

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



#### Calculating the `confusion_matrix` for test and prediction data

In [38]:
print(m.confusion_matrix(label_test_,label_pred_))

[[964   6]
 [ 11 134]]
