<a href="https://colab.research.google.com/github/QuickLearner171998/Weighted-RNN-for-News-Text-Classification/blob/master/WRNN_News_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
% cd "/content/drive/My Drive/nnfl_project/"

/content/drive/My Drive/nnfl_project


In [0]:
%tensorflow_version 1.x
from sklearn.datasets import fetch_20newsgroups
from keras.utils import to_categorical
import os
import glob
import numpy as np
import pickle
from tqdm import tqdm
from keras.layers import Activation, Dense, Embedding, Flatten, LSTM, Input, Reshape, Conv1D,Permute
from keras.models import Sequential
from keras.regularizers import l1_l2
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.metrics import categorical_accuracy
from keras.models import load_model, Model
from keras.optimizers import Adam
from keras.utils.vis_utils import plot_model

PATH = '/content/drive/My Drive/nnfl_project/'
DATA_PATH = '/content/drive/My Drive/nnfl_project/data'
GLOVE_PATH = '/content/drive/My Drive/glove.840B.300d.txt'

In [0]:
def clean_data(docs):
  pass

def text_length_selection(docs, theta = 0.85):
  """ 
  Input-
    doc : newsgroups_train.data - list of strings 
    theta - threshold - 0.85(as in the paper)
  Output-
    sl
  """
  # list having number of words in each document.
  DL = [len(docs[i].split()) for i in range(len(docs))]
  DNum =len(DL)
  sort_DL = sorted(DL)
  for SL in set(sort_DL):
    thresh = sort_DL.index(SL)/DNum
    if(thresh>=theta):
      return SL

def tokenize_sequences(texts, sl):

  tokenizer = Tokenizer(oov_token='<UNK>')
  tokenizer.fit_on_texts(texts)

  sequences = tokenizer.texts_to_sequences(texts)
  sequences = pad_sequences(sequences, maxlen=sl)

  return tokenizer, sequences

def get_data(data_path):
  newsgroups_data = fetch_20newsgroups(data_path, subset='all')
  labels = newsgroups_data.target
  texts = newsgroups_data.data
  return labels, texts

def get_epoch_frm_model_file(save_path):
  return int(save_path[:save_path.index('_')])
  # return int(save_path.split('_')[2])

def return_last_saved_model(paths):
  last_epoch_prev = -1
  ret_path = ""
  for path in paths:
    last_epoch = get_epoch_frm_model_file(path)
    if(last_epoch >= last_epoch_prev):
      last_epoch_prev = last_epoch
      ret_path = path
  return ret_path


In [0]:
def get_embedding_matrix(tokenizer, path, embedding_dim):
    """
    :param path: path to the glove embeddings file
    :param tokenizer: tokenizer fitted on the documents
    :param vocab_size: vocabulary size 
    :return: an embedding matrix: a nn.Embeddings
    """
    # Load the GloVe vectors into a dictionary, keeping only words in vocab
    # embedding_dim = 300
    # glove_path = "/content/drive/My Drive/glove.840B.300d.txt"
    glove_path = path
    glove_vectors = {}
    glove_file = open(glove_path, 'r')
    for line in tqdm(glove_file):
        split_line = line.rstrip().split()
        word = split_line[0]
        if len(split_line) != (embedding_dim + 1) or word not in tokenizer.word_index:
            continue
        assert (len(split_line) == embedding_dim + 1)
        vector = np.array([float(x) for x in split_line[1:]], dtype="float32")
        assert len(vector) == embedding_dim
        glove_vectors[word] = vector

    glove_file.close()

    print("Number of pre-trained word vectors loaded: ", len(glove_vectors))

    # Calculate mean and stdev of embeddings
    all_embeddings = np.array(list(glove_vectors.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_stdev = float(np.std(all_embeddings))

    vocab_size = len(tokenizer.word_index)
    # Randomly initialize an embedding matrix of (vocab_size, embedding_dim) shape
    # with a similar distribution as the pretrained embeddings for words in vocab.
    embedding_matrix = np.random.normal(embeddings_mean, embeddings_stdev, (vocab_size, embedding_dim))

    for i, word in enumerate(tokenizer.word_index):
        if word in glove_vectors:
            embedding_matrix[i] = glove_vectors[word]

    return embedding_matrix

In [0]:
# in paper
embed_dim = 300 
hidden_dim  = 128
batch_size  = 128
epochs = 100
init_epoch = 0
sl = 300


In [0]:

save_embedding_matrix_path = PATH + 'embedding_matrix.pkl'

labels, texts = get_data(DATA_PATH)
tokenizer, tokenized_seq = tokenize_sequences(texts, sl)
vocab_size = len(tokenizer.word_index)

In [8]:
if(glob.glob(save_embedding_matrix_path)):
  print('Embedding matrix found. Loading ...')
  with open(save_embedding_matrix_path, 'rb') as f:
    embedding_matrix = pickle.load(f)

  print('Done.')

else:
  embedding_matrix = get_embedding_matrix(tokenizer, GLOVE_PATH, embed_dim)
  with open(save_embedding_matrix_path, 'wb+') as f:
    pickle.dump(embedding_matrix, f)


Embedding matrix found. Loading ...
Done.


In [0]:
# convert labels to one hot
labels = to_categorical(np.asarray(labels))
output_dim = labels.shape[1]

In [10]:
print(tokenized_seq.shape)
print(labels.shape)
vocab_size

(18846, 300)
(18846, 20)


179210

In [24]:
# model1_conv1d
inp = Input(shape= (sl,))
embed = Embedding(input_dim=vocab_size, output_dim=embed_dim, weights= [embedding_matrix], input_length=sl, trainable = True )(inp)
lstm = LSTM(hidden_dim,return_sequences=True,recurrent_dropout=0.2,dropout=0.2)(embed) # o/p 300x128
# f1 = Flatten()(lstm)
# r2 = Reshape((hidden_dim,embed_dim))(f1)
pm = Permute((2,1))(lstm) # interchange dimension --> flatten+reshape
conv1d = Conv1D(1, 1, activation='relu' )(pm)
f = Flatten()(conv1d)
d = Dense(hidden_dim,activation='relu')(f)
out = Dense(output_dim, activation='softmax', activity_regularizer=l1_l2(0.01,0.01))(d)
model = Model(input = inp, outputs = out)
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 300, 300)          53763000  
_________________________________________________________________
lstm_6 (LSTM)                (None, 300, 128)          219648    
_________________________________________________________________
permute_1 (Permute)          (None, 128, 300)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 128, 1)            301       
_________________________________________________________________
flatten_9 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 128)               1651

  # This is added back by InteractiveShellApp.init_path()


In [0]:
# # model2_conv1d
# inp = Input(shape= (sl,)) # 300,
# embed = Embedding(input_dim=vocab_size, output_dim=embed_dim, weights= [embedding_matrix], input_length=sl, trainable = True )(inp) # 300,300
# lstm = LSTM(hidden_dim,return_sequences=True,recurrent_dropout=0.2,dropout=0.2)(embed) # o/p 300x128
# f1 = Flatten()(lstm)
# r2 = Reshape((hidden_dim,embed_dim))(f1) # 128,300
# # conv1d = Conv1D(128, 1, activation='relu' )(r2) # 128,128
# d2 = Dense(1,activation='relu')(r2)
# f = Flatten()(d2)
# d = Dense(hidden_dim,activation='relu')(f)
# out = Dense(output_dim, activation='softmax', activity_regularizer=l1_l2(0.01,0.01))(d)
# model = Model(input = inp, outputs = out)
# model.summary()

In [25]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01),metrics=['categorical_accuracy'])


# check if a saved_model exists
if(glob.glob('*.hdf5')):
  print("loading model")
  load_path = return_last_saved_model(glob.glob('*.hdf5'))
  # model=load_model(load_path)
  model.load_weights(load_path)
  init_epoch = get_epoch_frm_model_file(load_path)

chk_pth = "{epoch:02d}_.hdf5"

# early stopping
earlyStop = EarlyStopping(monitor='val_categorical_accuracy', mode='auto', verbose=1,patience=5)
checkpoint = ModelCheckpoint(chk_pth,monitor='val_categorical_accuracy', verbose=1, save_weights_only=True, save_best_only='False',mode = 'auto', period=2)

print("START TRAINING")

history = model.fit(tokenized_seq,labels,epochs=init_epoch+epochs, batch_size=batch_size, validation_split=0.1,shuffle = True, callbacks=[checkpoint,earlyStop], initial_epoch=init_epoch)
print("Training fininshed....Saving Model")
save_path = "{}_epochs-model_weights.hdf5".format(init_epoch+epochs)
# model.save_weights(save_path)



loading model
START TRAINING
Train on 16961 samples, validate on 1885 samples
Epoch 3/102
Epoch 4/102

Epoch 00004: val_categorical_accuracy improved from -inf to 0.73846, saving model to 04_.hdf5
Epoch 5/102
Epoch 6/102

Epoch 00006: val_categorical_accuracy improved from 0.73846 to 0.80053, saving model to 06_.hdf5
Epoch 7/102
Epoch 8/102

Epoch 00008: val_categorical_accuracy improved from 0.80053 to 0.81698, saving model to 08_.hdf5
Epoch 9/102
Epoch 10/102

Epoch 00010: val_categorical_accuracy improved from 0.81698 to 0.83236, saving model to 10_.hdf5
Epoch 11/102
Epoch 12/102

Epoch 00012: val_categorical_accuracy did not improve from 0.83236
Epoch 13/102
Epoch 00013: early stopping
Training fininshed....Saving Model


In [0]:
# plot acc
import matplotlib.pyplot as plt
plt.plot(history.history["categorical_accuracy"])
plt.plot(history.history["val_categorical_accuracy"])
plt.title('model acc')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [0]:
# plot loss
import matplotlib.pyplot as plt
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [0]:
# save model ploy
plot_model(model,to_file='model1_conv1d.png',show_shapes=True)