# HAN for Text Classification


*   Load Libraries
*   Connect to Google Drive
*   Code to clean the text
*   Load Data
*   Set Parameters
*   Clean and prepare train and test data
*   Load GloVe embedding model
*   Attention Layer Implementation
*   Model Architecture
*   Fit Model
*   Store Model
*   Evaluate Model




# Load all Libraries and download nltk packages

In [None]:
# All Required imports
import string
import pandas as pd
import gzip
import pandas as pd
import numpy as np
import string
import re
import pickle

import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D, MaxPooling1D, TimeDistributed
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, SpatialDropout1D, Layer, Embedding, Bidirectional, GRU
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Sequential
from keras.metrics import top_k_categorical_accuracy
from keras.callbacks import EarlyStopping
from keras import backend as K
from keras import initializers
from keras import regularizers, constraints
from keras.callbacks import ModelCheckpoint

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

import tensorflow as tf

from sklearn.model_selection import StratifiedShuffleSplit

from google.colab import drive

# Download the NLTK resources this notebook relies on: the English stop-word
# list (used by clean_text), the Punkt models (used by sent_tokenize) and
# WordNet (its lemmatizer is imported above; no use is visible in this file).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Connect to Google Drive

Use the URL to connect to Google Drive and give permission for Colab to access Google Drive.

In [None]:
# Mount Google Drive at /content/drive and query the GPU device name
# (the last expression is displayed as the cell output).
drive.mount('/content/drive')
tf.test.gpu_device_name()

# Code to Clean the raw text

In [None]:
# Text cleaning code
# Cache for the stop-word set: reading the NLTK corpus and building the set
# once per cleaned document is loop-invariant work.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    '''Return the NLTK English stop words as a frozenset, loading them once.'''
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE['english']


# Ordered (pattern, replacement) pairs applied by clean_text. Order matters:
# contractions must be expanded before the bare apostrophe is removed.
#
# NOTE(review): in the first pattern "+-=" is a character *range* ('+' to
# '='), so ':', ';' and '<' also survive it. Left unchanged because the ':'
# rule further down depends on that.
# NOTE(review): r"\0s" matches a NUL character followed by 's', which cannot
# occur after the first substitution; it possibly meant "0s". Left unchanged
# to keep the produced tokens identical.
_CLEAN_SUBSTITUTIONS = [(re.compile(pattern), replacement) for pattern, replacement in (
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)]


def clean_text(text):
    '''
    Normalise a raw review before it is tokenized for the model.

    Steps, in order:
    1. Lowercase the text and split it on whitespace.
    2. Drop English stop words and tokens shorter than three characters.
    3. Replace characters outside the allowed set with a space.
    4. Expand common contractions ("'ve" -> " have", "n't" -> " not", ...).
    5. Remove commas, periods, slashes and apostrophes; put spaces around
       '!', '^', '+', '-', '=' and ':'.
    6. Rewrite a digit run followed by 'k' as thousands and apply a few
       fixed abbreviation rewrites.
    7. Collapse runs of whitespace into a single space.

    :param text: raw review text (str)
    :return: cleaned text (str)
    '''
    stops = _english_stopwords()
    kept_words = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(kept_words)
    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text

# Load Data

Uncomment the dataset that you want to run.

In [None]:
# Loading Dataset
# Leave exactly one of the assignments below uncommented.
#dataset_name = 'IMDB'
#dataset_name = 'AG_News'
dataset_name = 'Amazon'

dataset_dir = "drive/My Drive/Data/" + dataset_name

if dataset_name in ('IMDB', 'AG_News'):
    # These datasets ship with a ready-made train/test split.
    train_df = pd.read_csv(dataset_dir + "/train.csv")
    test_df = pd.read_csv(dataset_dir + "/test.csv")
elif dataset_name == 'Amazon':
    # Single file: carve out a stratified 80/20 split with a fixed seed.
    df = pd.read_csv(dataset_dir + "/Amazon_Data.csv")
    stratsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(stratsplit.split(df, df['label']))
    # split() yields positional indices, hence iloc rather than loc. Rows
    # without text are dropped after the split, and new frames are built
    # instead of mutating slices of df in place.
    train_df = df.iloc[train_index].dropna(subset=['text']).reset_index(drop=True)
    test_df = df.iloc[test_index].dropna(subset=['text']).reset_index(drop=True)
else:
    # Fail here with a clear message instead of a NameError on train_df later.
    raise ValueError("Unknown dataset_name: %r (expected 'IMDB', 'AG_News' or 'Amazon')"
                     % (dataset_name,))

if dataset_name in ('AG_News', 'Amazon'):
    # Shift the labels of these datasets down by one.
    train_df['label'] = train_df['label'] - 1
    test_df['label'] = test_df['label'] - 1

# Setting up the parameters for embedding and HAN

In [None]:
# Hyper-parameters for the HAN model and its input tensors.

# Size limits that no code visible in this notebook refers to.
MAX_SENTENCE_NUM, MAX_WORD_NUM = 100, 100
MAX_FEATURES = 200000

# Statistics taken from the training frame.
n_rows = len(train_df)
n_classes = train_df['label'].nunique()

MAX_SENT_LENGTH = 100      # word slots kept per sentence
MAX_SENTS = 15             # sentence slots kept per review
MAX_NB_WORDS = 20000       # tokenizer vocabulary cap
EMBEDDING_DIM = 100        # width of the embedding vectors
VALIDATION_SPLIT = 0.2     # fraction used to size the validation set

# Creating train and test set

In [None]:
'''
Split each review into sentences, clean every sentence, fit the Keras
tokenizer on the cleaned training text, then encode the train and test
reviews as (reviews, MAX_SENTS, MAX_SENT_LENGTH) tensors of word ids.
'''


def _clean_sentences(raw_text):
    '''
    Split raw_text into sentences and return the cleaned, non-empty ones.

    Sentences are detected on the raw text: clean_text replaces '.' with a
    space, so tokenizing sentences after cleaning leaves almost nothing to
    split on and the sentence axis of the tensors stays nearly empty.
    Non-string values (e.g. NaN from an empty CSV cell) yield an empty list.
    '''
    if not isinstance(raw_text, str):
        return []
    cleaned = []
    for raw_sentence in sent_tokenize(raw_text):
        words = (word.strip(string.punctuation) for word in raw_sentence.split())
        # Truthiness test instead of the identity comparison `is not ""`.
        sentence = clean_text(' '.join(word for word in words if word)).strip()
        if sentence:
            cleaned.append(sentence)
    return cleaned


def _load_reviews(frame):
    '''Return (reviews, texts, labels) for a frame with 'text' and 'label' columns.'''
    reviews = [_clean_sentences(raw) for raw in frame['text']]
    texts = [' '.join(sentences) for sentences in reviews]
    labels = frame['label'].tolist()
    return reviews, texts, labels


def _encode_reviews(reviews, word_index):
    '''
    Encode reviews (lists of sentences) as an int32 tensor of word ids.

    Words missing from word_index or with an id >= MAX_NB_WORDS are skipped
    without using up a slot; extra sentences and words are truncated; unused
    slots stay 0. The same rule is applied to train and test data.
    '''
    encoded = np.zeros((len(reviews), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences[:MAX_SENTS]):
            ids = (word_index.get(word) for word in text_to_word_sequence(sent))
            kept = [idx for idx in ids if idx is not None and idx < MAX_NB_WORDS]
            kept = kept[:MAX_SENT_LENGTH]
            encoded[i, j, :len(kept)] = kept
    return encoded


# preparing Train data
reviews, texts, labels = _load_reviews(train_df)

tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS,lower=True, oov_token=None)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
print('Number of tokens: ' + str(len(word_index)))

data = _encode_reviews(reviews, word_index)

# Fix the one-hot width to n_classes so train and test labels always have the
# same number of columns as the model's output layer.
labels = to_categorical(np.asarray(labels), num_classes=n_classes)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle the training samples; model.fit takes its validation split from the
# end of the arrays.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data
y_train = labels

# preparing Test data
test_reviews, test_texts, test_labels = _load_reviews(test_df)

# The test set is encoded with the vocabulary fitted on the training text.
test_data = _encode_reviews(test_reviews, word_index)

test_labels = to_categorical(np.asarray(test_labels), num_classes=n_classes)
print('Shape of data tensor:', test_data.shape)
print('Shape of label tensor:', test_labels.shape)

indices = np.arange(test_data.shape[0])
np.random.shuffle(indices)
test_data = test_data[indices]
test_labels = test_labels[indices]

# Loading and processing GloVe embedding

In [None]:
'''
Build the word-embedding matrix from a pre-trained GloVe model
(6 billion tokens, 100 dimensions).
'''

embeddings_index = dict()
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default.
with open('drive/My Drive/Data/GloVe/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))


# Must match the width of the vectors in the GloVe file and the output
# dimension given to the Embedding layer that receives this matrix.
EMBED_SIZE = 100

min_wordCount = 2
absent_words = 0   # frequent enough, but no GloVe vector available
small_words = 0    # training-set count not above min_wordCount
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
word_counts = tokenizer.word_counts
for word, i in word_index.items():
    if word_counts[word] <= min_wordCount:
        small_words += 1
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all-zeros.
        absent_words += 1
        continue
    embedding_matrix[i] = embedding_vector
print('Words left as zero vectors: %d rare, %d missing from GloVe.' % (small_words, absent_words))

# Attention Layer implementation

In [None]:
# Attention Layer
def dot_product(x, kernel):
    '''
    Multiply x by kernel through the Keras backend.

    On the TensorFlow backend the kernel is given a trailing axis before
    K.dot and that axis is squeezed out of the result again; every other
    backend calls K.dot directly.
    '''
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)
    

class AttentionLayer(Layer):
    '''
    Attention pooling over axis 1 of a 3-D input.

    For an input of shape (batch, steps, features) the layer computes
    uit = tanh(x . W + b), scores every step with ait = uit . u, normalises
    exp(ait) over the steps (masked steps get weight 0) and returns the
    weighted sum of the input steps, shape (batch, features).

    :param attention_dim: size of the hidden attention projection
    :param bias: whether to add the bias vector b before the tanh
    '''

    def __init__(self, attention_dim=100,
                 bias=True, **kwargs):
        # Call the base initialiser first: it sets up the layer's own state
        # and, depending on the Keras version, assigns supports_masking
        # itself, which would overwrite a value set beforehand.
        super(AttentionLayer, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.bias = bias
        self.attention_dim = attention_dim

    def build(self, input_shape):
        '''
        Create the layer weights W, b (optional) and u.
        '''

        assert len(input_shape) == 3

        # Keyword arguments are used because the positional order of
        # add_weight differs between Keras versions.
        self.W = self.add_weight(shape=(self.attention_dim, input_shape[-1]),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name))
        if self.bias:
            self.b = self.add_weight(shape=(self.attention_dim,),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name))

        self.u = self.add_weight(shape=(self.attention_dim,),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name))

        super(AttentionLayer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # The step axis is summed away in call(), so no mask is passed on.
        return None

    def call(self, x, mask=None):
        '''
        Return the attention-weighted sum of x over axis 1.
        '''
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)

        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # Zero the weight of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero when every step is masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        '''
        Return the constructor arguments so a saved model can rebuild the layer.
        '''
        config = super(AttentionLayer, self).get_config()
        config.update({'attention_dim': self.attention_dim,
                       'bias': self.bias})
        return config

# Hierarchical Attention Network(HAN) implementation

In [None]:
# HAN model
'''
Hierarchical attention network built from two stacked encoders.

Word level (sentEncoder): embedding initialised from embedding_matrix ->
bidirectional GRU -> dropout -> word-level attention, giving one vector per
sentence.

Sentence level (model): the sentence encoder applied to every sentence through
TimeDistributed -> bidirectional GRU -> dropout -> sentence-level attention ->
softmax dense layer over n_classes.
'''

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True)

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dropout = Dropout(0.5)(l_lstm)
# Attention reads the dropout output; feeding it l_lstm would leave the
# dropout layer out of the graph.
l_att = AttentionLayer(100)(l_dropout)
sentEncoder = Model(sentence_input, l_att)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_dropout_1 = Dropout(0.5)(l_lstm_sent)
# Same wiring as the word level: attention on top of the dropout output.
l_att_sent = AttentionLayer(100)(l_dropout_1)
preds = Dense(n_classes, activation='softmax')(l_att_sent)
model = Model(review_input, preds)
optimizer = keras.optimizers.Adam(lr=0.001)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

data_path="drive/My Drive/Data/" + dataset_name
# The '/' keeps the checkpoints inside the dataset folder; without it the
# dataset name is glued onto the file name.
checkpointer = ModelCheckpoint(filepath=data_path + '/han-model-{epoch:02d}.hdf5', verbose=1)

# Model Fitting

In [None]:
# Model Fitting
# `epochs` replaces the legacy `nb_epoch` keyword, and the checkpoint callback
# created next to the model is passed in so it actually runs each epoch.
history = model.fit(x_train, y_train,
                    validation_split=VALIDATION_SPLIT,
                    epochs=10,
                    batch_size=64,
                    callbacks=[checkpointer])

# Save Model

In [None]:
# Persist the trained model and its training history in the dataset folder.
data_path = 'drive/My Drive/Data/' + dataset_name
model.save(data_path + '/HAN.h5')
history_file = data_path + '/trainHistoryDict_HAN.pkl'
with open(history_file, 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

# Evaluate Test

In [None]:
# Evaluate model on test data
# The model is compiled with a loss and metrics=['accuracy'], so the displayed
# result should hold the test loss followed by the test accuracy.
model.evaluate(test_data, test_labels)