# Test Classification with LSTM

###Text classification is a machine learning technique that assigns a set of predefined categories to open-ended text.

###With their ability to capture long-term dependencies and handle sequential data, LSTM models offer improved accuracy in classifying text.



In [1]:
# importing the libraries (dependencies)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import tensorflow as tf
%matplotlib inlineLSTM stands for long short-term memory networks, used in the field of Deep Learning. It is a variety of recurrent neural networks (RNNs) that are capable of learning long-term dependencies, especially in sequence prediction problems.

##Dataset

####Kaggle - SMS Spam Collection Dataset
####https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

In [2]:
df = pd.read_csv('spam.csv', delimiter = ',', encoding = 'latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


###dropping off the unnecessary columns which contain NaN values
###The info() function provides a concise summary of a dataframe. It provides information about the datatype of the index, the datatype of the column, non-null values and memory usage

In [3]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


###column: 'v1' contains classification of the text present in the second column
###column: 'v2' contains the text which has been classified as spam or ham

In [4]:
df.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


#Label Encoding
###Label Encoder: It encodes the target labels with value between 0 and n_classes-1.
###This transformer is used to encode the target values, not the input

In [13]:
X = df.v2  #X contains the text messages (input)
Y = df.v1  #Y contains the label corresponding to the text messages (target values)
le = LabelEncoder()
Y = le.fit_transform(Y)
print(le.classes_) #prints the two classes present, i.e. 'ham' and 'spam'
print(Y)  #prints a list of the label corresponding to each of the input input instances (or input text messages)
print(Y.shape)  #prints the total number of instances

['ham' 'spam']
[0 0 1 ... 0 0 0]
(5572,)


##Train and Test splits

###test_size = 0.15 means that 15% of the instances are put in the testing set, and 85% of the instances are put in the training set

In [14]:
# Split the dataset into training and testing sets

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.15)

#Process the data

1. Tokenize the data and convert the text to sequences.
2. Add padding to ensure that all sequences have the same length.
3. Many ways to choose max_len. Here, 150 is chosen as the max_len.



#Tokenization

###Tokenization is used in natural language processing to split paragraphs and sentences into smaller units that can be more easily assigned meaning.

In [24]:
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
print(tok)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = max_len)
print(sequences_matrix)
for i in range(len(sequences_matrix)):
    print(len(sequences_matrix[i]))

print(len(sequences_matrix))

<keras.preprocessing.text.Tokenizer object at 0x7ba57708bfa0>
[[  0   0   0 ...  18  13 791]
 [  0   0   0 ...  61   8 792]
 [  0   0   0 ...  16   3  91]
 ...
 [  0   0   0 ...  47  64  73]
 [  0   0   0 ...   4  16 370]
 [  0   0   0 ... 289  72  13]]
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
150
15

#LSTM

###LSTM stands for long short-term memory networks, used in the field of Deep Learning. It is a variety of recurrent neural networks (RNNs) that are capable of learning long-term dependencies, especially in sequence prediction problems.

#Word Embedding

###Word Embeddings in NLP is a technique where individual words are represented as real-valued vectors in a lower-dimensional space and captures inter-word semantics. Each word is represented by a real-valued vector with tens or hundreds of dimensions.

In [28]:
def lstm():
    inputs = Input(name = 'Inputs', shape = [max_len]) #Input() used to instantiate keras tensor
    layer = Embedding(max_words, 50, input_length = max_len)(inputs) #a word embedding of size 50 has been created 50 here is the output dimension of the embedding
    layer = LSTM(64)(layer)
    layer = Dense(256, name = 'Fully_Connected_layer')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    #now, the output layer contains just one node as we are performing a binary classification
    #sigmoid activation chosen for the output layer
    layer = Dense(1, name = 'Output_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs = inputs, outputs = layer)
    return model




In [29]:
model = lstm()
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer = RMSprop(), metrics = ['accuracy'])


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 150, 50)           50000     
                                                                 
 lstm_2 (LSTM)               (None, 64)                29440     
                                                                 
 Fully_Connected_layer (Dens  (None, 256)              16640     
 e)                                                              
                                                                 
 activation (Activation)     (None, 256)               0         
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                             

#Training the LSTM model

###Early Stopping has been used, and the validation loss has been monitored for the same.

In [30]:
model.fit(sequences_matrix, Y_train, batch_size = 128, epochs = 10, validation_split = 0.2, callbacks = [EarlyStopping(monitor = 'val_loss')])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x7ba5747afc10>

#Processing the testing set

###Texts are converted to sequences, and the sequence matrix is created with the padding added onto each of the sequences in the matrix

In [31]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen = max_len)

In [32]:
accr = model.evaluate(test_sequences_matrix, Y_test)



In [34]:
print('Evaluation on the Test set:\n Loss = {:0.3f}\n Accuracy = {:0.3f}'.format(accr[0], accr[1]))

Evaluation on the Test set:
 Loss = 0.060
 Accuracy = 0.987


#Results
###An accuracy of 98.7% has been achieved!

### LSTMs perform incredibly well on such tasks, as they are able to capture the long-term dependencies present in sequential data, and are therefore able to intelligently classify the textual data as spam or otherwise