<a href="https://colab.research.google.com/github/QuickLearner171998/Weighted-RNN-for-News-Text-Classification/blob/master/WRNN_News_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab runtime so the project files under
# /content/drive/ are reachable from the cells below.
from google.colab import drive
# force_remount=True is passed so an already-mounted drive is mounted again.
drive.mount('/content/drive/', force_remount=True)

In [0]:
% cd "/content/drive/My Drive/nnfl_project/"

# Imports


In [0]:
%tensorflow_version 1.x

from sklearn.metrics import classification_report
from sklearn.datasets import fetch_20newsgroups
from keras.utils import to_categorical
import os
import glob
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm

from keras.layers import *
#Input,Activation, Dense, Embedding, Flatten, LSTM, Conv1D,MaxPooling1D,Dropout,Bidirectional,Reshape,MaxPooling1D,GlobalMaxPooling1D, Dropout,Permute,TimeDistributed,BatchNormalization
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.regularizers import l1_l2
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.metrics import categorical_accuracy
from keras.models import load_model
from keras import regularizers
from keras.utils.vis_utils import plot_model

from gensim.models import Word2Vec,KeyedVectors
from gensim.test.utils import datapath, get_tmpfile

PATH = '/content/drive/My Drive/nnfl_project/'
DATA_PATH = '/content/drive/My Drive/nnfl_project/data'
GLOVE_PATH = '/content/drive/My Drive/glove.840B.300d.txt'

# Helper Functions

In [0]:

def evaluate(model,y_texts, y_labels, batch_size):
  """Print the accuracy and a per-class classification report for `model`.

  :param model: compiled model exposing `evaluate` and `predict`
  :param y_texts: tokenized/padded input sequences to score
  :param y_labels: one-hot label matrix aligned with `y_texts`
  :param batch_size: batch size used for both evaluation and prediction
  :return: None; results are printed
  """
  # model.evaluate yields (loss, accuracy); only the accuracy is reported.
  _, accuracy = model.evaluate(y_texts, y_labels, batch_size=batch_size, verbose=1)

  # Collapse predicted probabilities and one-hot labels to class indices.
  probabilities = model.predict(y_texts, batch_size=batch_size)
  predicted_classes = np.argmax(probabilities, axis=1)
  true_classes = np.argmax(y_labels, axis=1)
  report = classification_report(true_classes, predicted_classes)

  print("\n Model Accuracy : ", accuracy)
  print("\nClassification Report\n ", report)



# model_op_folder = 'new_conv1d_GMP_split'
def get_callbacks():
  """Build the Keras callbacks used during training.

  :return: list ``[checkpoint, earlyStop]`` where
           * ``checkpoint`` saves weights only, into the working directory,
             as ``"<epoch>_<val_categorical_accuracy>.h5"``, keeping only
             improvements of ``val_loss`` and checking every 2 epochs;
           * ``earlyStop`` stops training after 16 epochs without
             ``val_loss`` improvement.
  """
  # The file name needs `val_categorical_accuracy` in the logs, so the model
  # must be compiled with that metric and trained with validation data.
  chk_pth = "{epoch:02d}_{val_categorical_accuracy:.2f}.h5"

  earlyStop = EarlyStopping(monitor='val_loss', mode='auto', verbose=1, patience=16)

  # save_best_only must be a real boolean; the former string 'True' only
  # worked because any non-empty string is truthy.
  checkpoint = ModelCheckpoint(chk_pth, monitor='val_loss', verbose=1,
                               save_weights_only=True, save_best_only=True,
                               mode='auto', period=2)

  return [checkpoint, earlyStop]



def run_model(model,train_tokenized_seq,train_labels,epochs,batch_size,vsplit,init_epoch=0):
  """Compile `model`, resume from the newest checkpoint if one exists, and train.

  :param model: Keras model to compile and fit
  :param train_tokenized_seq: padded training sequences
  :param train_labels: one-hot training labels
  :param epochs: number of additional epochs to train for
  :param batch_size: mini-batch size
  :param vsplit: fraction of the training data held out via validation_split
  :param init_epoch: epoch to start counting from when no checkpoint is found
  :return: the History object returned by `model.fit`
  """
  model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01),metrics=['categorical_accuracy'])

  # Scan the working directory once so the existence check and the resume
  # path are decided from the same listing.
  # NOTE(review): any '*.h5' file in the working directory is considered,
  # including checkpoints of a different architecture - confirm the
  # directory only holds checkpoints for this model.
  saved_checkpoints = glob.glob('*.h5')
  if saved_checkpoints:
    print("loading model")
    load_path = return_last_saved_model(saved_checkpoints)
    model.load_weights(load_path)
    # Continue epoch numbering from the epoch encoded in the file name.
    init_epoch = get_epoch_frm_model_file(load_path)

  print("START TRAINING")

  # `epochs` in Keras is the final epoch index, hence init_epoch + epochs.
  history = model.fit(train_tokenized_seq, train_labels,
                      epochs=init_epoch + epochs,
                      batch_size=batch_size,
                      validation_split=vsplit,
                      callbacks=get_callbacks(),
                      initial_epoch=init_epoch)

  print("Training fininshed")
  return history



# plot model

def training_plots(history, model_name,model):
  """Show accuracy and loss curves for a training run and save a model diagram.

  :param history: History object returned by `model.fit`
  :param model_name: output file path for the architecture diagram
  :param model: the trained Keras model to draw
  """
  curves = history.history

  # (train key, validation key, title, y-axis label, x-axis label) per figure
  figures = (
    ("categorical_accuracy", "val_categorical_accuracy", 'model acc', 'accuracy', 'epoch'),
    ("loss", "val_loss", 'model loss', 'loss', 'epochs'),
  )

  for train_key, val_key, title, y_label, x_label in figures:
    plt.plot(curves[train_key])
    plt.plot(curves[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

  plot_model(model, to_file=model_name, show_shapes=True)


def tokenize_sequences(texts, sl,tokenizer):
  """Convert raw texts to integer sequences padded/truncated to length `sl`.

  :param texts: iterable of raw text documents
  :param sl: target sequence length passed to `pad_sequences` as maxlen
  :param tokenizer: fitted tokenizer providing `texts_to_sequences`
  :return: the padded sequences produced by `pad_sequences`
  """
  return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=sl)

def get_data(data_path):
  """Load the 20 Newsgroups train and test splits.

  :param data_path: directory used by scikit-learn as download/cache folder
  :return: tuple (train_texts, train_labels, test_texts, test_labels)
  """
  # Pass the cache folder by keyword: fetch_20newsgroups takes keyword-only
  # arguments in current scikit-learn releases, and older releases accept
  # the keyword form as well.
  train_split = fetch_20newsgroups(data_home=data_path, subset='train')
  test_split = fetch_20newsgroups(data_home=data_path, subset='test')

  return train_split.data, train_split.target, test_split.data, test_split.target



def get_epoch_frm_model_file(save_path):
  """Return the epoch number encoded at the start of a checkpoint file name.

  Checkpoints are named ``"<epoch>_<metric>.h5"``; the digits before the
  first underscore of the *file name* are the epoch.

  :param save_path: checkpoint file name, optionally prefixed by directories
  :return: the epoch as an int
  :raises ValueError: if the file name has no underscore or the prefix is
                      not an integer
  """
  # Look only at the file name so underscores in parent directories
  # (e.g. ".../nnfl_project/02_0.85.h5") do not break the parsing.
  file_name = os.path.basename(save_path)
  return int(file_name[:file_name.index('_')])

def return_last_saved_model(paths):
  """Pick the checkpoint path with the highest epoch number.

  :param paths: iterable of checkpoint paths understood by
                `get_epoch_frm_model_file`
  :return: the path with the largest epoch (the later entry wins on ties),
           or "" when `paths` is empty or no epoch is >= -1
  """
  best_path, best_epoch = "", -1
  for candidate in paths:
    candidate_epoch = get_epoch_frm_model_file(candidate)
    if candidate_epoch < best_epoch:
      continue
    best_path, best_epoch = candidate, candidate_epoch
  return best_path


def get_embedding_matrix(tokenizer, glove_path, embedding_dim):
    """
    Build an embedding matrix for the tokenizer's vocabulary from a GloVe text file.

    :param tokenizer: tokenizer fitted on the documents; only its
                      ``word_index`` mapping (word -> integer id) is read
    :param glove_path: path to the GloVe embeddings text file
    :param embedding_dim: number of components expected per vector
    :return: numpy array of shape (len(tokenizer.word_index), embedding_dim);
             row ``i`` holds the vector of the word whose id is ``i``, rows
             without a pre-trained vector are drawn from a normal
             distribution matching the loaded vectors
    :raises ValueError: if no vocabulary word has a vector in the file
    """
    word_index = tokenizer.word_index
    glove_vectors = {}

    # Context manager closes the file even if parsing fails part-way.
    with open(glove_path, 'r', encoding='utf-8') as glove_file:
        for line in tqdm(glove_file):
            split_line = line.rstrip().split()
            # Skip blank or malformed lines (e.g. tokens containing spaces)
            # before touching split_line[0].
            if len(split_line) != embedding_dim + 1:
                continue
            word = split_line[0]
            if word not in word_index:
                continue
            glove_vectors[word] = np.asarray(split_line[1:], dtype="float32")

    print("Number of pre-trained word vectors loaded: ", len(glove_vectors))

    if not glove_vectors:
        # Without any vector the mean/std below would be NaN and the whole
        # matrix would silently become NaN.
        raise ValueError(
            "No pre-trained vectors matched the tokenizer vocabulary in " + str(glove_path))

    # Calculate mean and stdev of embeddings
    all_embeddings = np.array(list(glove_vectors.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_stdev = float(np.std(all_embeddings))

    vocab_size = len(word_index)
    # Randomly initialize an embedding matrix of (vocab_size, embedding_dim) shape
    # with a similar distribution as the pretrained embeddings for words in vocab.
    embedding_matrix = np.random.normal(embeddings_mean, embeddings_stdev, (vocab_size, embedding_dim))

    # Place every vector at the row equal to the word's tokenizer id, because
    # that id is what an Embedding layer looks up. Using the enumerate()
    # position instead shifts every vector by one row whenever ids start at 1.
    # NOTE(review): ids >= vocab_size do not fit in this matrix and are
    # skipped; if ids run from 1 to len(word_index) the caller needs
    # len(word_index) + 1 rows to cover the last id - confirm.
    for word, row in word_index.items():
        if row < vocab_size and word in glove_vectors:
            embedding_matrix[row] = glove_vectors[word]

    return embedding_matrix


# load embedding as a dict
def load_embedding(filename):
    """Load a text embedding file into a dict of word -> float32 vector.

    The first line of the file is skipped; every following non-blank line is
    expected to be ``word v1 v2 ... vN`` separated by whitespace.

    :param filename: path to the embedding text file
    :return: dict mapping each word to a numpy float32 array
    """
    embedding = {}
    # assumes the file is UTF-8 encoded - TODO confirm against the writer
    with open(filename, 'r', encoding='utf-8') as embedding_file:
        # Skip the first line; the default keeps an empty file from raising.
        next(embedding_file, None)
        # Stream line by line instead of holding the whole file in memory.
        for line in embedding_file:
            parts = line.split()
            if not parts:
                # blank line: nothing to parse
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab,embed_dim_w):
    """Create the weight matrix for an Embedding layer from a loaded embedding.

    :param embedding: dict mapping word -> vector of length `embed_dim_w`
    :param vocab: dict mapping word -> integer row index (e.g. a
                  tokenizer's word_index)
    :param embed_dim_w: number of components per vector
    :return: numpy array of shape (len(vocab) + 1, embed_dim_w); rows of
             words missing from `embedding` stay all-zero
    """
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, embed_dim_w))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        # Words without a vector keep their zero row; assigning the None
        # returned by .get() would put NaN (or raise) instead.
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix



def get_w2v_matrix(vocab, embed_dim_w):
  """Build the Embedding weight matrix from the saved word2vec text file.

  :param vocab: dict mapping word -> integer row index
  :param embed_dim_w: number of components per word2vec vector
  :return: weight matrix produced by `get_weight_matrix`
  """
  # Read 'word2vec_200.txt' from the working directory, then order its
  # vectors by the vocabulary's integer ids.
  return get_weight_matrix(load_embedding('word2vec_200.txt'), vocab, embed_dim_w)


def prepare_for_word_to_vec(texts):
  """Tokenize each text and keep only purely alphabetic tokens.

  NOTE(review): `word_tokenize` is not imported in the active code of this
  file (only inside a commented-out cell, from nltk.tokenize) - it must be
  in scope before this function is called.

  :param texts: iterable of raw text documents
  :return: list with one list of alphabetic tokens per input text
  """
  return [
    [token for token in word_tokenize(document) if token.isalpha()]
    for document in texts
  ]


# Configs

In [0]:
# Hyper-parameters; the original note says "in paper" - TODO confirm which
# values come from the paper.
vocab_size = None  # placeholder; reassigned from the fitted tokenizer in the data-processing cell
embed_dim = 300   # GloVe vector size passed to get_embedding_matrix and the Embedding layers
embed_dim_w = 200  # word2vec vector size (only referenced by the commented-out w2v cells)
hidden_dim  = 128  # LSTM units and width of the hidden Dense layer
batch_size  = 128  # used for training and evaluation
epochs = 500  # upper bound; the EarlyStopping callback can end training sooner
init_epoch = 0
sl = 300  # sequence length every document is padded/truncated to
vsplit = 0.05  # fraction of the training data used as validation split

# Process Data

In [0]:
# save_embedding_matrix_path = PATH + 'embedding_matrix.pkl'
# Load raw texts and integer class labels for the train and test splits.
train_texts,train_labels, test_texts, test_labels = get_data(DATA_PATH)

# Whether to fit the tokenizer on all texts or on the training texts only is discussed at:
# https://stackoverflow.com/questions/54891464/is-it-better-to-keras-fit-to-text-on-the-entire-x-data-or-just-the-train-data

# Words not seen during fitting are mapped to the '<UNK>' token.
tokenizer = Tokenizer(oov_token='<UNK>')
# fit only on train
tokenizer.fit_on_texts(train_texts)


# Convert both splits to integer sequences padded/truncated to length sl.
train_tokenized_seq = tokenize_sequences(train_texts, sl,tokenizer)
test_tokenized_seq = tokenize_sequences(test_texts, sl,tokenizer)
# Number of entries in word_index (the '<UNK>' token is one of them).
# NOTE(review): Keras Tokenizer ids presumably start at 1 with 0 left for
# padding, so the largest id equals this value - confirm whether the
# Embedding layers need vocab_size + 1 rows.
vocab_size = len(tokenizer.word_index)

# One-hot encode the integer labels; the column count is the number of classes.
train_labels = to_categorical(np.asarray(train_labels))
test_labels = to_categorical(np.asarray(test_labels))
output_dim = train_labels.shape[1]


In [0]:
# Display the vocabulary size as the notebook cell output.
vocab_size

# W2V model

In [0]:
# # word2vec model
# path = get_tmpfile(PATH+'word2vec_200.txt')
# if(glob.glob('word2vec_200.txt')):
#   print("loading word2vec model")
#   model_w = KeyedVectors.load_word2vec_format(path)
# else:
#   import nltk
#   nltk.download('punkt')
#   from nltk.tokenize import word_tokenize
#   w_train = prepare_for_word_to_vec(train_texts)
#   w_test = prepare_for_word_to_vec(test_texts)
#   model_w = Word2Vec(w_train+w_test,size=embed_dim_w,sg=1,iter = 15)


# # load weights
# if(glob.glob('w2v_weights_200.pkl')):
#   print("loading w2v weights")
#   w2v_weights = np.load('w2v_weights_200.pkl', allow_pickle=True)
#   # vocab_size_w=vocab_size+1

# else:
#   # get weights
#   w2v_weights = get_w2v_matrix(tokenizer.word_index,embed_dim_w)
#   # save weights
#   pth = PATH+'w2v_weights_200.pkl'
#   with open(pth, 'wb+') as f:
#       pickle.dump(w2v_weights, f)

# vocab_size_w=vocab_size+1


# w2v_weights.shape
# print(vocab_size_w)


# Glove

In [0]:
# Build the GloVe embedding matrix once and cache it as a pickle on Drive;
# later runs load the cached copy instead of re-reading the GloVe file.
save_embedding_matrix_path = PATH + 'embedding_matrix_with_split.pkl'
cached_matrix_found = glob.glob(save_embedding_matrix_path)

if not cached_matrix_found:
  # No cache yet: compute the matrix and store it for the next run.
  embedding_matrix = get_embedding_matrix(tokenizer, GLOVE_PATH, embed_dim)
  with open(save_embedding_matrix_path, 'wb+') as cache_file:
    pickle.dump(embedding_matrix, cache_file)
else:
  print('Embedding matrix found. Loading ...')
  with open(save_embedding_matrix_path, 'rb') as cache_file:
    embedding_matrix = pickle.load(cache_file)
  print('Done.')


In [0]:
# Sanity check: print the shapes of the training inputs and labels, then the
# embedding matrix shape next to the vocabulary size used by the models.
print(train_tokenized_seq.shape)
print(train_labels.shape)

print(embedding_matrix.shape)
print(vocab_size)

# Models

## 1) WRNN

In [0]:
# architecture: Embedding -> LSTM (all timesteps) -> Flatten/Reshape ->
# Conv1D (kernel size 1) -> global max pooling -> Dense -> softmax
inp = Input(shape=(sl,))
embed = Embedding(input_dim=vocab_size, output_dim=embed_dim, weights=[embedding_matrix], input_length=sl, trainable=True)(inp)
lstm = LSTM(hidden_dim, return_sequences=True, recurrent_dropout=0.5, dropout=0.5)(embed)  # o/p per sample: (sl, hidden_dim) = 300x128
# Conv1D input is fed as (hidden_dim, sl) = (128, 300), hence the reshape.
# NOTE(review): Flatten + Reshape reinterprets the (sl, hidden_dim) values in
# row-major order rather than transposing them - confirm this is intended
# (a Permute((2, 1)) layer would transpose instead).
f1 = Flatten()(lstm)
r2 = Reshape((hidden_dim, sl))(f1)
conv1d = Conv1D(128, 1, activation='relu')(r2)
gp = GlobalMaxPooling1D()(conv1d)
d = Dense(hidden_dim, activation='relu')(gp)
out = Dense(output_dim, activation='softmax', kernel_initializer='he_normal', activity_regularizer=l1_l2(0.01, 0.01))(d)
# `inputs=` is the Keras 2 keyword; `input=` is the legacy Keras 1 spelling.
model_wrnn = Model(inputs=inp, outputs=out)
model_wrnn.summary()

# training
history = run_model(model_wrnn, train_tokenized_seq, train_labels, epochs, batch_size, vsplit)
training_plots(history, 'wrnn.png', model_wrnn)

# evaluation
print("Evaluating...")
evaluate(model_wrnn, test_tokenized_seq, test_labels, batch_size)




### w2v

In [0]:
# # W2V 


# inp = Input(shape= (sl,))
# embed = Embedding(input_dim=vocab_size_w, output_dim=embed_dim_w, weights= [w2v_weights], input_length=sl, trainable = True )(inp)
# # embed = Embedding(input_dim=vocab_size, output_dim=embed_dim_w,  input_length=sl, trainable = True )(inp)
# # embed = Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=sl )(inp)
# lstm = LSTM(hidden_dim,return_sequences=True,recurrent_dropout=0.2,dropout=0.2)(embed) # o/p 300x128
# # Note o/p of a rnn cell or lstm cell gives prob of each word - (300,)
# # conv 1d inp shape - (128,300) -- reshape needed
# f1 = Flatten()(lstm)
# r2 = Reshape((hidden_dim,sl))(f1)
# conv1d = Conv1D(128,1, activation='relu' )(r2)
# gp = GlobalMaxPooling1D()(conv1d)
# # dense = Dense(1,activation='relu')(conv1d)
# # f = Flatten()(gp)
# d = Dense(hidden_dim,activation='relu')(gp)
# out = Dense(output_dim, activation='softmax', activity_regularizer=l1_l2(0.01,0.01))(d)
# model = Model(input = inp, outputs = out)
# model.summary()


In [0]:
# # bidirectional 0.88


# inp = Input(shape= (sl,))
# embed = Embedding(input_dim=vocab_size, output_dim=embed_dim, weights= [embedding_matrix], input_length=sl, trainable = True )(inp)
# # embed = Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=sl )(inp)
# lstm = Bidirectional(LSTM(hidden_dim,return_sequences=True,recurrent_dropout=0.2,dropout=0.2))(embed) # o/p 300x256
# # Note o/p of a rnn cell or lstm cell gives prob of each word - (300,)
# # conv 1d inp shape - (128,300) -- reshape needed
# f1 = Flatten()(lstm)
# r2 = Reshape((2*hidden_dim,sl))(f1)
# conv1d = Conv1D(hidden_dim,1, activation='relu' )(r2)
# # gp = MaxPooling1D(32)(conv1d)
# gp = GlobalMaxPooling1D()(conv1d)
# # dense = Dense(1,activation='relu')(conv1d)
# # f = Flatten()(gp)
# d = Dense(hidden_dim,activation='relu')(gp)
# out = Dense(output_dim, activation='softmax', activity_regularizer=l1_l2(0.01,0.01))(d)
# model = Model(input = inp, outputs = out)
# model.summary()


## 2) Simple RNN

In [0]:
# Baseline: GloVe-initialised Embedding -> single LSTM (last state only) ->
# Dense -> softmax, assembled layer by layer.
# NOTE(review): run_model resumes from any '*.h5' checkpoint found in the
# working directory - confirm none from another architecture are present.
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, weights=[embedding_matrix], input_length=sl, trainable=True))
model_rnn.add(LSTM(hidden_dim, recurrent_dropout=0.2, dropout=0.2))
model_rnn.add(Dense(hidden_dim, activation='relu'))
model_rnn.add(Dense(output_dim, activation='softmax'))

# training + learning curves
history_rnn = run_model(model_rnn, train_tokenized_seq, train_labels, epochs, batch_size, vsplit)
training_plots(history_rnn, 'simple_rnn.png', model_rnn)

# evaluation on the held-out test split
print("Evaluating...")
evaluate(model_rnn, test_tokenized_seq, test_labels, batch_size)


## 3) BiLSTM

In [0]:
# Baseline: GloVe-initialised Embedding -> bidirectional LSTM (last state
# only) -> Dense -> softmax, assembled layer by layer.
# NOTE(review): run_model resumes from any '*.h5' checkpoint found in the
# working directory - confirm none from another architecture are present.
model_BiLSTM = Sequential()
model_BiLSTM.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, weights=[embedding_matrix], input_length=sl, trainable=True))
model_BiLSTM.add(Bidirectional(LSTM(hidden_dim, recurrent_dropout=0.2, dropout=0.2)))
model_BiLSTM.add(Dense(hidden_dim, activation='relu'))
model_BiLSTM.add(Dense(output_dim, activation='softmax'))

# training + learning curves
history_bilstm = run_model(model_BiLSTM, train_tokenized_seq, train_labels, epochs, batch_size, vsplit)
training_plots(history_bilstm, 'BiLSTM.png', model_BiLSTM)

# evaluation on the held-out test split
print("Evaluating...")
evaluate(model_BiLSTM, test_tokenized_seq, test_labels, batch_size)

