# LSTM and Bi-LSTM Networks for Text Classification 


*   Import the necessary packages
*   Load the data
*   Preprocess the data
*   Perform train, validation and test splits
*   Import pre-trained GloVe embeddings and construct the embedding matrix
*   Define the model
*   Initialize callbacks like early stopping, checkpoint
*   Fit the model on the train set
*   Evaluate the model accuracy on the test set
*   Plot training and validation accuracy and loss

### Appendix

*   Code snippet used for extracting different raw data into csv formats















In [None]:
# Google Colab only: mount the user's Google Drive under /content/drive so the
# dataset and model paths used later in this notebook can be read and written.
# force_remount=True requests a fresh mount even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

### Import the necessary packages

In [None]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical
from keras import backend as K
from keras import initializers, regularizers, constraints

#nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import string
from string import punctuation
from nltk.corpus import stopwords
#from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

#sklearn
from sklearn.model_selection import StratifiedShuffleSplit

#others
import gzip
import numpy as np
import pandas as pd
import regex as re


### Data Preprocessing Function

In [None]:
# Ordered (pattern, replacement) pairs applied by clean_text(). Order matters:
# contractions are expanded before the bare apostrophe is blanked out, and the
# whitespace collapse runs before the final slang replacements.
_CLEAN_TEXT_RULES = (
    # Blank out everything outside the allowed set. NOTE: "+-=" inside the
    # class is a character *range* (0x2B-0x3D), so ':', ';' and '<' survive too.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Contractions.
    (r"what's", "what is "),
    (r"\'s", " is "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Punctuation: drop separators, pad operators so they become own tokens.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000".
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): "\0" is a NUL character, which the first rule has already
    # replaced, so this rule cannot match. Presumably "0s" -> "0" was intended;
    # left unchanged pending confirmation.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
    # Was r"\omg": "\o" is not a valid escape for the standard `re` module
    # (it raises re.error on Python 3.7+). The literal word is what is meant.
    (r"omg", " oh my god "),
    # NOTE(review): both rules below require a leading apostrophe, but every
    # apostrophe has been replaced by a space above, so they never match.
    (r"\'lol", " laughing out loud "),
    (r"\'ok", " okay "),
)


def clean_text(text, remove_stopwords=True, stem_words=False, expand_abbreviations=True):
    """Normalise one raw document before it is tokenised for the model.

    Steps, in order:
      1. Lowercase the text and split it on whitespace.
      2. Optionally drop English stop words and tokens shorter than 3 chars.
      3. Replace disallowed characters with spaces, expand common
         contractions, and pad or strip punctuation (see _CLEAN_TEXT_RULES).
      4. Optionally stem every token with the English Snowball stemmer.

    Args:
        text: The raw document as a ``str``.
        remove_stopwords: Drop NLTK English stop words and every token with
            fewer than 3 characters.
        stem_words: Apply ``SnowballStemmer('english')`` to each token.
        expand_abbreviations: Accepted for interface compatibility; the
            function does not currently consult it.

    Returns:
        The cleaned text. Because the slang replacements run after the
        whitespace collapse, the result may still contain leading, trailing
        or doubled spaces.
    """
    # The original called ``text.translate(string.punctuation)`` here.
    # str.translate() expects a table indexed by code point; given the
    # 32-character punctuation string it left punctuation untouched and
    # instead rewrote control characters by index (tab -> '*', newline -> '+',
    # carriage return -> '.'). Punctuation is handled by the regex rules
    # below, so the call is dropped rather than "fixed".

    # Lowercase and tokenise on any whitespace.
    tokens = text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stops and len(word) >= 3]

    text = " ".join(tokens)

    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text


### Indicate dataset type

In [None]:
# Select which corpus the rest of the notebook works on. Exactly one of the
# three assignments should be active; the loading cells below are each guarded
# by `if dataset_type == ...`.
dataset_type = "imdb"
#dataset_type = "agnews"
#dataset_type = "amazon"

# Folder (inside the mounted Google Drive) that holds the Data/ directory and
# receives the saved models, history and plot. It is a relative path, so it is
# resolved against the notebook's current working directory.
data_path = 'drive/My Drive/'

### Loading Dataset (IMDB)

In [None]:
# IMDB: the train and test splits are already stored as separate CSV files.
if dataset_type == "imdb":
    train_path = data_path + "Data/IMDB/train.csv"
    test_path = data_path + "Data/IMDB/test.csv"

    train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))


### Loading Dataset (AG-NEWS)

In [None]:
# AG-News: the train and test splits are already stored as separate CSV files.
if dataset_type == "agnews":
    train_path = data_path + "Data/AG_News/train.csv"
    test_path = data_path + "Data/AG_News/test.csv"
    train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

    # Labels in the files start at 1; shift them to start at 0 so that
    # to_categorical() later does not create an unused class-0 column.
    train['label'] -= 1
    test['label'] -= 1

### Loading Dataset (Amazon)

In [None]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path to the ``.gz`` file; each line must be a Python literal
            expression (the callers in this file feed it Amazon review dumps).

    Yields:
        The object produced by evaluating each line.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted, closed or garbage-collected; the original never closed it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only use this on dumps from a trusted source.
            yield eval(l)

def getDF(path):
    """Load every record yielded by parse(path) into a DataFrame.

    Records are numbered 0, 1, 2, ... in file order and that number becomes
    the row index; the keys of each record become the columns.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

def load_amazon(path):
    """Return the 'Toys and Games' 5-core review dump under *path* as a DataFrame.

    *path* is concatenated directly with the file name, so it must end with a
    path separator.
    """
    return getDF(path + 'reviews_Toys_and_Games_5.json.gz')

# Amazon: a single dump, so the train/test split is made here.
if dataset_type == "amazon":
    data = load_amazon(data_path + "Data/Amazon/")
    # Columns are renamed positionally so the review body and rating end up
    # as 'text' and 'label', the names the rest of the notebook uses.
    # NOTE(review): this assumes the dump's columns arrive in exactly this
    # order - confirm against the raw file.
    data.columns = ['reviewerID', 'asin', 'reviewerName', 'helpful', 'text', 'label',
           'summary', 'unixReviewTime', 'reviewTime']

    # One stratified 80/20 split. Only the first split was ever used, so a
    # single one is requested; with the same random_state it is the same split.
    stratsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(stratsplit.split(data, data['label']))

    # split() yields positional indices, so select with .iloc; .copy() makes
    # the label assignments below act on independent frames, not on slices
    # of `data`.
    train = data.iloc[train_index].copy()
    test = data.iloc[test_index].copy()

    # All labels start from 1 hence subtracting it by 1 so as to perform one-hot encoding later
    train['label'] = train['label'] - 1
    test['label'] = test['label'] - 1

### Perform data preprocessing

In [None]:
# Drop training rows whose text is the empty string. .copy() makes the result
# an independent frame so the column assignment below does not write into a
# slice of the original (pandas SettingWithCopyWarning).
train = train[train['text'] != ""].copy()

# Call the preprocessing function defined above on all the reviews.
train['text'] = train['text'].map(clean_text)
test['text'] = test['text'].map(clean_text)

### Prepare train, validation and test sets

In [None]:
# One-hot encode the integer labels of both splits.
y_train, y_test = (
    to_categorical(np.asarray(split['label'])) for split in (train, test)
)

# Width of the one-hot test matrix; used as the size of the output layer.
num_classes = y_test.shape[1]

### Configure parameters

In [None]:
# Tokenizer vocabulary cap. The Amazon reviews are larger and richer, so they
# get 20000 words; the other datasets use 15000.
VOCABULARY_SIZE = 20000 if dataset_type == "amazon" else 15000

MAX_REVIEW_LENGTH = 100    # length every sequence is padded/truncated to
EMBEDDING_DIM = 100        # width of the embedding vectors (GloVe 100d file)
VALIDATION_SPLIT = 0.2     # fraction of the training data held out by fit()
BATCH_SIZE = 64
EPOCHS = 20


### Tokenize and Padding Sequences

In [None]:
# Fit the vocabulary on the training texts only, keeping at most
# VOCABULARY_SIZE words when texts are converted to sequences.
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts(train['text'])

# Convert both splits to lists of integer word ids.
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(split['text']) for split in (train, test)
)

# Pad/truncate so every sequence has exactly MAX_REVIEW_LENGTH entries.
data_train = pad_sequences(sequences_train, maxlen=MAX_REVIEW_LENGTH)
data_test = pad_sequences(sequences_test, maxlen=MAX_REVIEW_LENGTH)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))


### Import GloVe pre-trained vectors

In [None]:
# Map each GloVe token to its 100-dimensional float32 vector.
embeddings_index = dict()

# Each line is "<token> <v1> <v2> ... <v100>". The context manager closes the
# file even if a line fails to parse, and the explicit encoding keeps the
# non-ASCII tokens readable regardless of the platform's default encoding.
with open(data_path + 'Data/GloVe/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

### Create the embedding vectors

In [None]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((VOCABULARY_SIZE, EMBEDDING_DIM))
for word, index in tokenizer.word_index.items():
    if index > VOCABULARY_SIZE - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector                       

### Define the Model

In [None]:
# Frozen embedding layer initialised with the GloVe-based matrix.
embedding_layer = Embedding(VOCABULARY_SIZE, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_REVIEW_LENGTH, trainable=False)

# Two stacked LSTMs with dropout, followed by a softmax classifier.
model = Sequential()
# Reuse the layer built above; the original created a second identical layer
# and left `embedding_layer` unused.
model.add(embedding_layer)
model.add(Dropout(0.2))
# return_sequences=True so the second LSTM receives the whole sequence.
model.add(LSTM(100, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(units = 100))
model.add(Dropout(rate = 0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Used for IMDB dataset which is a binary classification usecase
# model.add(Dense(num_classes, activation='sigmoid'))
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Bi-Directional LSTM Model (uncomment if you need to run this)

In [None]:
# embedding_layer = Embedding(VOCABULARY_SIZE, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_REVIEW_LENGTH, trainable=False)

# model = Sequential()
# model.add(Embedding(VOCABULARY_SIZE, EMBEDDING_DIM, input_length=MAX_REVIEW_LENGTH, weights=[embedding_matrix], trainable=False))
# model.add(Dropout(0.2))
# model.add(Bidirectional(LSTM(100, return_sequences = True)))
# model.add(Dropout(0.2))
# model.add(Bidirectional(LSTM(units = 100)))
# model.add(Dropout(rate = 0.2))
# model.add(Dense(num_classes, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Used for IMDB dataset which is a binary classification usecase
# model.add(Dense(num_classes, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Define callbacks: early stopping and checkpoints

In [None]:
# Checkpoint callback: the file name template contains {epoch:02d}, so each
# epoch gets its own file under data_path (model.01.hdf5, model.02.hdf5, ...).
checkpointer = ModelCheckpoint(data_path + 'model.{epoch:02d}.hdf5', verbose=1)
# Early-stopping callback watching the validation loss with a patience of 10
# epochs; training may therefore end before EPOCHS epochs have run.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')

### Fit the compiled model

In [None]:
# Train on the padded training sequences, holding out VALIDATION_SPLIT of them
# for validation. NOTE: despite its name, `callbacks` stores the value
# returned by fit(); later cells read its `.history` attribute.
callbacks = model.fit(data_train, y_train, validation_split=VALIDATION_SPLIT, epochs=EPOCHS, batch_size = BATCH_SIZE, callbacks=[checkpointer, early_stopping])

### Save the final model and the history for plotting graphs later on

In [None]:
import pickle
# Persist the trained model next to the data.
model.save(data_path + "final_model.hdf5")
# Pickle the history recorded by fit() (read from `callbacks.history`) so the
# curves can be plotted later without retraining.
with open(data_path + 'trainValHistoryDict', 'wb') as file_pi:
    pickle.dump(callbacks.history, file_pi)

### Run this cell if you want to load an already trained model

In [None]:
# Replace `model` with the one previously written by model.save() above.
# This does not define `callbacks`, so the plotting cell needs the training
# cell to have been run in the same session.
model_path = data_path + "final_model.hdf5"
model = load_model(model_path)

### Evaluate the model on the test set

In [None]:
# Score the model on the held-out test set. As the last expression of the
# cell, the returned value is displayed by the notebook.
model.evaluate(data_test, y_test)

### Plot the accuracy and losses

In [None]:
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

num_epochs = 20
fig = plt.figure()
ax = plt.subplot(111)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.xticks(range(1, num_epochs + 1, 1))
y = range(1, num_epochs + 1)
c1 = plt.plot(y, np.squeeze(callbacks.history['loss']), color="teal", label="Training")
c2 = plt.plot(y, np.squeeze(callbacks.history['val_loss']), color="orange", label="Validation")
ax.legend()
plt.title("Train vs Validation Loss")
plt.savefig(data_path + "image.png")
plt.show()

## APPENDIX

### Code snippet that was used to extract raw IMDB data into csv

In [None]:
def load_data(path, output_path):
    """Collect the raw IMDB review files under *path* into one shuffled CSV.

    Every file in ``<path>/pos`` is read as a review with label ``"1"`` and
    every file in ``<path>/neg`` as a review with label ``"0"``.

    Args:
        path: Directory that contains the ``pos`` and ``neg`` sub-directories.
        output_path: Prefix of the output file; the CSV is written to
            ``output_path + "_Data"``.

    Returns:
        The shuffled DataFrame with columns ``id``, ``text`` and ``label``.
    """
    # The import cell shown at the top of this file does not import `os`,
    # so bring it in locally.
    import os

    indices = []
    text = []
    label = []

    for subdir, sentiment in (("pos", "1"), ("neg", "0")):
        class_dir = os.path.join(path, subdir)
        # Ids restart at 0 for each class, exactly as in the original
        # extraction, so `id` alone is not unique across the two classes.
        for i, filename in enumerate(os.listdir(class_dir)):
            # `with` closes each handle; the original leaked one per review
            # and also printed every positive review (leftover debug output).
            with open(os.path.join(class_dir, filename), 'r') as review_file:
                data = review_file.read()
            indices.append(i)
            text.append(data)
            label.append(sentiment)

    Dataset = list(zip(indices, text, label))

    # Shuffle in place: the list is currently all positives followed by all
    # negatives.
    np.random.shuffle(Dataset)

    df = pd.DataFrame(data=Dataset, columns=['id', 'text', 'label'])
    df.to_csv(output_path + "_Data", index=False, header=True)

    return df

### Code snippet that was used to extract raw Amazon review data into csv

In [None]:
import gzip

def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path to the ``.gz`` file; each line must be a Python literal
            expression (the callers in this file feed it Amazon review dumps).

    Yields:
        The object produced by evaluating each line.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted, closed or garbage-collected; the original never closed it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only use this on dumps from a trusted source.
            yield eval(l)

def getDF(path):
    """Load every record yielded by parse(path) into a DataFrame.

    Records are numbered 0, 1, 2, ... in file order and that number becomes
    the row index; the keys of each record become the columns.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

def load_amazon(path):
    """Return the 'Toys and Games' 5-core review dump under *path* as a DataFrame.

    *path* is concatenated directly with the file name, so it must end with a
    path separator.
    """
    return getDF(path + 'reviews_Toys_and_Games_5.json.gz')