# Steps to execute the notebook
<ol>
    <li>Set the global variables according to whether you are running the training phase or the testing phase.</li>
    <li>Set the global variable <code>CREATE_OBJECTS</code> to <code>True</code> if the objects have not been created yet.</li>
    <li>Once the objects are created, copy them into the same folder as this notebook.</li>
    
</ol>

In [None]:
# Stretch the notebook container to the full browser width.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
display(HTML("<style>.container { width:100% !important; }</style>"))

# Install & Import Python libraries

In [None]:
!conda install pandas numpy matplotlib seaborn scikit-learn keras

In [None]:
import numpy as np
import pandas as pd
import utils
import datetime
import os
import matplotlib.pyplot as plt
import seaborn as sns    

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, Bidirectional ,Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow

import warnings
import gc

# Make sure automatic garbage collection is on, then mute every warning
# for the remainder of the session.
gc.enable()
warnings.simplefilter(action="ignore")

# Global Variables

In [None]:
# Run-mode switches; edit these before executing the notebook.
CREATE_OBJECTS = False  # True -> the next code cell calls utils.create_tde_objects() / utils.create_dataset()
TRAIN = False           # True -> fit the model; False -> only evaluate it on the validation split
VERBOSE = False         # NOTE(review): not read by any cell visible in this notebook - confirm it is still needed
MODEL = "domain_model_lstm.h5"  # file the model is loaded from and checkpointed to

# Create the vocabulary, embedding matrix, and generate the pre-processed dataset from raw text files

In [None]:
# Prerequisites before switching CREATE_OBJECTS on:
#   1. Download the GloVe vectors from http://nlp.stanford.edu/data/glove.840B.300d.zip
#   2. Unzip the downloaded archive.
#   3. Copy the path of the unzipped text file.
#   4. Paste that path at line no. 59 of utils.py.
if CREATE_OBJECTS:
    # Run the two utils builders in their original order.
    for build_step in (utils.create_tde_objects, utils.create_dataset):
        build_step()

In [None]:
# Accumulators that the dataset-loading cell fills (or replaces with arrays).
X, Y, y, sentences = [], [], [], []

# Folder that is scanned for the per-document files.
directory = './dataset/data/'

# Domain name -> integer class id; ids follow the order of the tuple below.
labels = {
    domain: class_id
    for class_id, domain in enumerate((
        'religious',
        'medical',
        'business',
        'political',
        'personal',
        'research',
        'sports',
        'terror',
    ))
}

# Load the Dataset

In [None]:

# Load the cached arrays when all three exist; otherwise rebuild them from the
# per-document files in `directory` and cache the result for the next run.
if os.path.exists('./X.npy') and os.path.exists('./Y.npy') and os.path.exists('./y.npy'):
    X = np.load('X.npy')
    Y = np.load('Y.npy')
    y = np.load('y.npy')
    # The file used to be opened with mode 'a+', which positions the stream at
    # end-of-file: readlines() therefore always returned [] (and an empty file
    # was created when none existed). Read it only when it is actually there.
    if os.path.exists('sentences.txt'):
        with open('sentences.txt', 'r') as fp:
            sentences = fp.readlines()
    else:
        sentences = []
else:
    t = utils.load_pickle_object('tokenizer_small.pickle')

    # Collect into fresh local lists so the cell can be re-run even after
    # X / Y / y have already been turned into arrays by a previous execution.
    docs, one_hot_labels, label_ids = [], [], []
    sentences = []
    doc = None

    # Sorted so the sample order does not depend on the file system's listing order.
    for file in sorted(os.listdir(directory)):
        # File names start with '<domain>_'. A prefix that is not a known domain
        # maps to -1 and is skipped instead of raising KeyError.
        label = labels.get(file.split('_')[0], -1)
        if label not in [-1]:
            doc = np.load(directory + file)
            sentences.append(t.sequences_to_texts([doc]))
            docs.append(doc)
            Y_row = to_categorical(label, len(labels))  # one-hot row, one column per domain
            one_hot_labels.append(Y_row)
            label_ids.append(label)

    X, Y, y = np.array(docs), np.array(one_hot_labels), np.array(label_ids)

    print("Shape of X train:", X.shape, "Shape of Y train(8):", Y.shape)

    # np.save appends the '.npy' suffix, producing the files checked above.
    np.save('X', X)
    np.save('Y', Y)
    np.save('y', y)

    # Drop the large temporaries before collecting garbage.
    del doc
    del t
    del docs, one_hot_labels, label_ids

gc.collect(2)

In [None]:
# Class weights: the reciprocal of each label's share of `y`, so labels that
# occur less often get proportionally larger weights.
label_counts = pd.Series(y).value_counts()
label_share = label_counts / label_counts.sum()
weights = dict(1 / label_share)
print(weights)

In [None]:
# Build a fresh network when no saved model file exists; otherwise reload it.
if not os.path.exists(MODEL):

    # Input: vectors of length 65 (fed to the embedding layer as token ids).
    sequence_input = Input(shape=(65,), dtype='float32')
    # Trainable embedding: 202076 rows (vocabulary size), 65-dimensional vectors.
    embedded_sequences = Embedding(202076, 65, input_length = 65, trainable = True)(sequence_input)

    # Bidirectional LSTM with 32 units per direction.
    l_bilstm = Bidirectional(LSTM(32))(embedded_sequences)

    # Classification head: 64-unit ReLU layer, then softmax over the 8 classes.
    l_dense = Dense(64, activation='relu')(l_bilstm)
    preds = Dense(8, activation='softmax')(l_dense)

    model = Model(sequence_input, preds)

else:
    print("Loading the model...", MODEL)
    model = load_model(MODEL)

# Define the callbacks
# Checkpoint: overwrite MODEL whenever val_loss improves.
chkpt = tensorflow.keras.callbacks.ModelCheckpoint(MODEL, monitor='val_loss', verbose=1, save_best_only=True)
# Early stopping with a patience of 100 epochs on the callback's default monitored quantity.
erlystpng = tensorflow.keras.callbacks.EarlyStopping(patience = 100)
optimizer = tensorflow.keras.optimizers.Adam(0.00001, epsilon=0.000001)
# The model is (re)compiled in both branches, i.e. also after load_model().
# NOTE(review): this replaces whatever optimizer a loaded model carried - confirm that is intended.
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=optimizer)

# summary() prints the table itself and returns None, so wrapping it in
# print() emitted an extra "None" line.
model.summary()

gc.collect(2)

In [None]:
# Hold out 10% of the samples for validation.
# The seed is fixed so every run gets the same split. With an unseeded split, a
# TRAIN = False run evaluated on a new random subset that overlapped the samples
# the saved model had been trained on.
X_train, X_val, Y_train, Y_val, y_train, y_val = train_test_split(
    X, Y, y, test_size=0.1, random_state=42
)
if TRAIN:
    try:
        model.fit(
            X_train,
            Y_train,
            epochs=300,
            verbose=1,
            callbacks=[erlystpng, chkpt],
            batch_size=64,
            validation_data=(X_val, Y_val),
            class_weight=weights,
        )
    finally:
        # Runs whether fit() completes, raises, or is interrupted, so the
        # last state of the model is always written out.
        model.save('final_'+MODEL)
else:
    # Evaluation only: loss/accuracy are reported through the progress output.
    model.evaluate(X_val, Y_val, verbose=1, batch_size=8)