# HAN for Text Classification


*   Load Libraries
*   Connect to Google Drive
*   Code to clean the text
*   Load Data
*   Set Parameters
*   Clean and prepare train and test data
*   Load GloVe embedding model
*   Model Architecture
*   Fit Model
*   Store Model
*   Evaluate Model


# Load all Libraries and download nltk packages

In [0]:
# Load libraries
import os
import sys
import numpy as np
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D, MaxPooling1D
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, SpatialDropout1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import re
from keras.models import Sequential
from keras.metrics import top_k_categorical_accuracy
import nltk
nltk.download('stopwords')
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedShuffleSplit

# Connect to Google Drive

Use the URL to connect to google drive and give premission for colab to access Goole drive.

In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive so the data
# files read by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Check GPU: being the last expression of the cell, the value returned by
# gpu_device_name() is displayed by the notebook.
# NOTE(review): TensorFlow documents an empty string as "no GPU found" --
# confirm against the installed TensorFlow version.
import tensorflow as tf
tf.test.gpu_device_name()

# Code to Clean the raw text

In [0]:
# Ordered (pattern, replacement) pairs applied by clean_text. The order is
# significant: contractions are expanded before the bare apostrophe is
# dropped, and runs of whitespace are collapsed last.
_CLEAN_TEXT_SUBSTITUTIONS = (
    # NOTE(review): "+-=" inside the class is a character range ('+' .. '='),
    # so ':', ';' and '<' also survive this filter -- confirm that is intended.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): in a regex "\0" is the NUL character, so this matches
    # NUL + "s", not the text "0s". NUL was already replaced by a space in the
    # first substitution, so this rule never fires. Kept unchanged because the
    # intended behaviour is unclear.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)

# Lazily built resources shared by every clean_text call.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES["stops"] = set(stopwords.words("english"))
        _CLEAN_TEXT_RESOURCES["stemmer"] = nltk.SnowballStemmer('english')
    return _CLEAN_TEXT_RESOURCES["stops"], _CLEAN_TEXT_RESOURCES["stemmer"]


def clean_text(text):
    """Normalise one raw document into a space-separated string of stems.

    The text is lower-cased and split on whitespace, English stop words and
    tokens shorter than three characters are dropped, the substitutions in
    ``_CLEAN_TEXT_SUBSTITUTIONS`` are applied in order, and every remaining
    token is stemmed with the English Snowball stemmer.

    Args:
        text: Raw document as a ``str``.

    Returns:
        The cleaned document as a single string (empty if nothing survives).
    """
    stops, stemmer = _clean_text_resources()

    # The previous ``text.translate(string.punctuation)`` call was dropped.
    # str.translate treats a plain string as a lookup table indexed by code
    # point, so it removed no punctuation and instead rewrote control
    # characters (tab -> '*', newline -> '+', carriage return -> '.'), gluing
    # together words separated by line breaks. Punctuation is handled by the
    # substitutions below, which rely on apostrophes still being present.

    ## Convert words to lower case, split them and remove stop words
    tokens = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(tokens)

    ## Clean the text
    for pattern, replacement in _CLEAN_TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    ## Stemming
    return " ".join(stemmer.stem(word) for word in text.split())

# Load Data

Uncomment the dataset that you want to run.

In [0]:
# Loading Dataset
# Select the corpus by leaving exactly one of the assignments uncommented.
dataset_name = 'IMDB'
#dataset_name = 'AG_News'
#dataset_name = 'Amazon'


if dataset_name in ('IMDB', 'AG_News'):
    # Both corpora are read from separate train/test CSV files.
    train_df = pd.read_csv("drive/My Drive/Data/" + dataset_name + "/train.csv")
    test_df = pd.read_csv("drive/My Drive/Data/" + dataset_name + "/test.csv")
    if dataset_name == 'AG_News':
        # Shift labels down by one (presumably 1-based in the CSV -- verify)
        # so they can be one-hot encoded from class 0.
        train_df['label'] = train_df['label'] - 1
        test_df['label'] = test_df['label'] - 1
elif dataset_name == 'Amazon':
    # Single CSV: carve out a stratified 80/20 train/test split.
    df = pd.read_csv("drive/My Drive/Data/" + dataset_name + "/Amazon_Data.csv")
    stratsplit = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
    # Only the first generated split is used.
    train_index, test_index = next(stratsplit.split(df, df['label']))
    # split() yields positional indices, hence iloc rather than loc. The
    # non-inplace chain returns fresh frames, so the label assignment below
    # never writes into a slice of df.
    train_df = df.iloc[train_index].dropna(subset=['text']).reset_index(drop=True)
    test_df = df.iloc[test_index].dropna(subset=['text']).reset_index(drop=True)
    # Shift labels down by one (presumably 1-based in the CSV -- verify).
    train_df['label'] = train_df['label'] - 1
    test_df['label'] = test_df['label'] - 1
else:
    # Fail here instead of with a NameError on train_df in a later cell.
    raise ValueError(
        "Unknown dataset_name %r; expected 'IMDB', 'AG_News' or 'Amazon'" % (dataset_name,)
    )

# Clean Data

In [0]:
# Drop rows whose text is the empty string. .copy() gives each frame its own
# data, so the column assignments below do not write into a slice of the
# loaded frame (avoids pandas' SettingWithCopyWarning).
train_df = train_df[train_df['text'] != ""].copy()
test_df = test_df[test_df['text'] != ""].copy()

# Fill missing text BEFORE cleaning: clean_text calls str methods, so a NaN
# reaching it would raise AttributeError.
train_df['text'] = train_df['text'].fillna("_na_").map(clean_text)
test_df['text'] = test_df['text'].fillna("_na_").map(clean_text)

# Count classes over both splits so a class missing from the test set cannot
# make the one-hot encoding of the training labels too narrow.
num_classes = pd.concat([train_df['label'], test_df['label']]).nunique()

list_sentences_train = train_df["text"].values
train_y = to_categorical(train_df['label'].values, num_classes=num_classes)
list_sentences_test = test_df["text"].values
test_y = to_categorical(test_df['label'].values, num_classes=num_classes)

# Loading and processing GloVe embedding

In [0]:
# Loading word embedding
# Each line of the GloVe file is "<token> <v1> ... <vN>"; build a
# token -> float32 vector lookup from it.
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open('drive/My Drive/Data/GloVe/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Fit the tokenizer on the training text only, with num_words capped at
# vocabulary_size.
vocabulary_size = 20000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(train_df['text'])

# Turn every document into a sequence of word ids, padded/truncated to
# 50 tokens.
sequences = tokenizer.texts_to_sequences(train_df['text'])
sequences_test = tokenizer.texts_to_sequences(test_df['text'])
data_test = pad_sequences(sequences_test, maxlen=50)
data = pad_sequences(sequences, maxlen=50)

# Create Embedding metrix

In [0]:
# Row i of the matrix holds the 100-d GloVe vector of the word whose tokenizer
# index is i; rows for words without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    # Stop at the first index that does not fit in the matrix.
    # NOTE(review): stopping (rather than skipping) assumes word_index is
    # iterated in ascending index order -- confirm for the Keras version used.
    if index >= vocabulary_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: leave its row as zeros.
        continue
    embedding_matrix[index] = embedding_vector

# Convolutional Neural Network (CNN) implementation

In [0]:
# Model architecture
# Frozen pretrained embedding, two Conv1D/MaxPooling1D stages each followed by
# spatial dropout, then a dense classifier head with a softmax output.
model = Sequential([
    # Embedding weights come from embedding_matrix and are not trained.
    # (set trainable to true and check)
    Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False),
    SpatialDropout1D(0.7),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    SpatialDropout1D(0.7),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    SpatialDropout1D(0.7),
    Flatten(),
    Dense(128, activation='relu'),
    # One output unit per class, matching the one-hot encoded labels.
    Dense(num_classes, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Model Fitting

In [0]:
# Early stopping watches val_loss and tolerates 7 epochs without an
# improvement of at least 0.01. With epochs=1 it can never trigger; it only
# matters once the epoch count is raised.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, min_delta=0.01)

# NOTE(review): Keras documents validation_split as taking the last fraction
# of the arrays without shuffling -- confirm the training CSV is not sorted
# by label.
history = model.fit(
    data,
    train_y,
    epochs=1,
    validation_split=0.2,
    batch_size=32,
    callbacks=[early_stopping],
)

# Save Model

In [0]:
# Saving the model and its history
# pickle is not imported in the notebook's import cell, so it is imported here;
# without it the dump below raises NameError.
import pickle

# Directory on the mounted Drive that holds everything for this dataset.
data_path = 'drive/My Drive/Data/' + dataset_name

# Full model (architecture + weights) in HDF5 format.
model.save(data_path + '/CNN.h5')

# Per-epoch metrics recorded by fit(), stored as a plain dict.
with open(data_path + '/trainHistoryDict_CNN.pkl', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

# Evaluate Model

In [0]:
# Score the trained model on the padded test sequences and their one-hot
# labels. The model was compiled with categorical cross-entropy loss and the
# 'accuracy' metric; as the last expression of the cell the returned values
# are displayed by the notebook.
model.evaluate(data_test, test_y)