## This is for Categorical Classification between various classes.


LABELS :-

0 - ENTERTAINMENT
1 - POLITICS
2 - SPORTS
3 - BUSINESS
4 - EDUCATION



#### There are 3 cells in total in this notebook.
1) Main Cell for machine learning model training and dataset creation, etc.
2) Creates a classification report of the model using the testing set.
3) This cell contains all the important functions necessary


#### Order of running of cells...
First, run the 3rd cell to define the helper functions. Then run the 1st cell to start the model training. Finally, run the 2nd cell to generate a classification report.



#### NOTE:- You need to change the paths to the various resources given in the first cell for the code to run properly.

In [None]:
import os
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.layers import Dropout
import h5py
#import utility_functions as uf
from keras.models import model_from_json
from keras.models import load_model
from nltk.tokenize import RegexpTokenizer
from sklearn.utils import shuffle

def load_data_all(data_dir, all_data_path, pred_path, gloveFile, first_run, load_all):
    """Load embeddings and the dataset and build the train/test/validation matrices.

    Args:
        data_dir: Directory that receives the split CSV files (via
            ``training_data_split``) and that is searched for
            ``filtered_glove.txt`` when ``load_all`` is false.
        all_data_path: Path handed to ``read_data`` to locate the dataset.
        pred_path: Unused; kept so existing callers keep working.
        gloveFile: Path of the full GloVe embeddings file.
        first_run: Unused; kept so existing callers keep working.
        load_all: When truthy the full GloVe file is loaded, otherwise the
            filtered file is loaded.

    Returns:
        The tuple ``(train_x, train_y, test_x, test_y, val_x, val_y,
        weight_matrix, word_idx)``.
    """
    # Pick the embeddings file. The original code referenced an undefined
    # ``filtered_glove_path`` here; filter_glove() writes its output to
    # <data_dir>/filtered_glove.txt, so that location is used.
    # NOTE(review): assumes filter_glove() was run with this same data_dir.
    if load_all:
        embedding_path = gloveFile
    else:
        embedding_path = os.path.join(data_dir, 'filtered_glove.txt')
    weight_matrix, word_idx = load_embeddings(embedding_path)

    # Read the full dataset and shuffle it before splitting.
    all_data = read_data(all_data_path)

    print('Shuffling Dataset...')
    all_data = shuffle(all_data)
    print('Shuffling Finished...')
    print(all_data)

    # 80% of the rows go to training; the remainder is divided between
    # test and validation by training_data_split().
    train_data, test_data, dev_data = training_data_split(all_data, 0.8, data_dir)

    train_data = train_data.reset_index()
    dev_data = dev_data.reset_index()
    test_data = test_data.reset_index()

    # The padded sequence length is the longest sentence in the whole dataset,
    # so all three index matrices share the same width.
    maxSeqLength, _, _ = maxSeqLen(all_data)

    # Word-index matrices (one row per sentence).
    print('Load training data matrix start...')
    train_x = tf_data_pipeline_nltk(train_data, word_idx, weight_matrix, maxSeqLength)
    test_x = tf_data_pipeline_nltk(test_data, word_idx, weight_matrix, maxSeqLength)
    val_x = tf_data_pipeline_nltk(dev_data, word_idx, weight_matrix, maxSeqLength)
    print('Load training data matrix end...')

    # Label matrices (one row per sentence, one column per class).
    print('Load labels data matrix start...')
    train_y = labels_matrix(train_data)
    val_y = labels_matrix(dev_data)
    test_y = labels_matrix(test_data)
    print('Load labels data matrix end...')

    # Summarize sizes.
    print("Training data: ")
    print(train_x.shape)
    print(train_y.shape)

    # Number of classes = number of label columns.
    print("Classes: ")
    print(train_y.shape[1])

    return train_x, train_y, test_x, test_y, val_x, val_y, weight_matrix, word_idx

def create_model_rnn(weight_matrix, max_words, EMBEDDING_DIM, num_classes=5):
    """Build and compile the bidirectional-LSTM text classifier.

    Args:
        weight_matrix: Pre-trained embedding matrix; its number of rows is used
            as the vocabulary size of the embedding layer.
        max_words: Length of every (padded) input sequence.
        EMBEDDING_DIM: Size of each embedding vector.
        num_classes: Width of the softmax output layer. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # Frozen embedding layer initialised from the pre-trained vectors.
    model.add(Embedding(len(weight_matrix), EMBEDDING_DIM, weights=[weight_matrix],
                        input_length=max_words, trainable=False))
    model.add(Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.20))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself; wrapping it in print() only added a
    # stray "None" line to the output.
    model.summary()

    return model

def train_model(model,train_x, train_y, test_x, test_y, val_x, val_y, batch_size, path, epochs) :
    """Fit ``model`` with checkpointing and early stopping, then evaluate it.

    Args:
        model: Compiled Keras model to train.
        train_x, train_y: Training inputs and labels.
        test_x, test_y: Held-out data used only for the final evaluation.
        val_x, val_y: Validation data passed to ``fit``.
        batch_size: Batch size for both ``fit`` and ``evaluate``.
        path: Base directory; the checkpoint is written below it.
        epochs: Maximum number of training epochs.

    Returns:
        The same ``model`` object after training.
    """
    checkpoint_file = path+'/categorical/model/best_model.hdf5'

    training_callbacks = [
        # Keep only the checkpoint with the best monitored validation accuracy.
        keras.callbacks.ModelCheckpoint(
            checkpoint_file,
            monitor='val_accuracy',
            verbose=0,
            save_best_only=True,
            save_weights_only=False,
            mode='auto',
            period=1,
        ),
        # Stop once validation loss has not improved for 3 epochs.
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=0,
            patience=3,
            verbose=1,
            mode='auto',
        ),
    ]

    model.fit(
        train_x,
        train_y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(val_x, val_y),
        callbacks=training_callbacks,
    )

    # Report loss and accuracy on the test split.
    test_score, test_acc = model.evaluate(test_x, test_y, batch_size=batch_size)
    print('Test score:', test_score)
    print('Test accuracy:', test_acc)

    return model

def live_test(trained_model, data, word_idx, max_seq_len=139):
    """Predict the class index for a single piece of text.

    Args:
        trained_model: Model exposing ``predict(x, batch_size=..., verbose=...)``
            that returns one row of class scores per input row.
        data: The text to classify.
        word_idx: Mapping from lower-case word to embedding index.
        max_seq_len: Length the index sequence is padded or truncated to.
            Defaults to 139, the value that used to be hard-coded; it has to
            match the sequence length the model was trained with.

    Returns:
        A numpy array holding the predicted class index for the text.
    """
    # Local import so the function does not depend on which cell imported ``re``.
    import re

    # Split into runs of word characters, dropping punctuation
    # (same tokens as nltk's RegexpTokenizer(r'\w+')).
    tokens = re.findall(r'\w+', data)

    # Unknown words map to index 0, as before. Input longer than max_seq_len is
    # truncated instead of raising a shape-mismatch error.
    indices = [word_idx.get(token.lower(), 0) for token in tokens][:max_seq_len]

    # One zero-padded row, i.e. a batch of size 1.
    batch = np.zeros((1, max_seq_len), dtype=int)
    batch[0, :len(indices)] = indices

    # predict() + argmax replaces Sequential.predict_classes(), which newer
    # Keras releases no longer provide.
    scores = trained_model.predict(batch, batch_size=1, verbose=0)
    return np.argmax(scores, axis=-1)


# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
max_words = 56 # max no of words in your training data (not read below; the model uses train_x.shape[1])
batch_size = 500 # batch size for training
EMBEDDING_DIM = 100 # size of the word embeddings
epochs = 25
train_flag = False # set True if in training mode else False if in prediction mode
first_time = True # load the embeddings and the saved model in prediction mode
path = 'drive/My Drive/deepsentiment'

if train_flag:
    # Resource locations for training.
    data_dir = path+'/Data'
    all_data_path = path+'/Data/'
    pred_path = path+'/Data/output_model/test_pred.csv'
    gloveFile = path+'/Data/glove/glove_twitter_27B_100d.txt'
    first_run = False
    load_all = True

    # Create the training, validation and test data sets.
    train_x, train_y, test_x, test_y, val_x, val_y, weight_matrix, word_idx = load_data_all(data_dir, all_data_path, pred_path, gloveFile, first_run, load_all)

    # Create the model structure.
    model = create_model_rnn(weight_matrix, train_x.shape[1], EMBEDDING_DIM)

    # Train the model.
    trained_model = train_model(model, train_x, train_y, test_x, test_y, val_x, val_y, batch_size, path, epochs)

    # NOTE(review): this is the same file that the ModelCheckpoint callback in
    # train_model() writes, so the best checkpoint is overwritten here with the
    # final-epoch model. Confirm that this is intended.
    model.save(path+"/categorical/model/best_model.hdf5")
    print("Saved model to disk")

else:
    if first_time:

        ## Path to load GloVe embeddings
        gloveFile = path+'/Data/glove/glove_twitter_27B_100d.txt'
        weight_matrix, word_idx = load_embeddings(gloveFile)

        ## Path to load a saved model for testing
        model_path = path +'/categorical/model/best_model_deep_4L.h5'
        # load_model() is the imported Keras loader; the original line called
        # the not-yet-defined name ``trained_model`` and raised NameError.
        trained_model = load_model(model_path)
        trained_model.summary()

    # Insert your code here for appropriate testing.
    # Sample code:
    #
    #     text = "Hello this is testing statement."
    #     result = live_test(trained_model, text, word_idx)
    #     print(result)





In [None]:
from sklearn.metrics import classification_report 
#print(test_y)

# Convert the one-hot test labels back to class indices. argmax returns the
# first position of the row maximum, as the previous per-row loop did.
actual = np.argmax(test_y, axis=1)

# predict() + argmax replaces Sequential.predict_classes(), which newer Keras
# releases no longer provide.
predicted = np.argmax(trained_model.predict(test_x), axis=-1)

print(classification_report(actual, predicted))

## This cell is for the necessary functions like loading dataset, glove embeddings and preparing training data.

Note:- Head over to the function "read_data()" to change the path for training dataset

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import codecs
import os
import progressbar as pb
from nltk.tokenize import RegexpTokenizer
from collections import Counter
#%%
################################### Paths to Data ########################################################################

# Module-level locations used by the helper functions and by main() below.
# NOTE: `path` and `gloveFile` are also assigned by the training cell, so in a
# shared notebook namespace the value in effect depends on cell run order.

# Passed to read_data() by main().
path = '/Data/'
# GloVe embeddings file read by main().
gloveFile = '/Data/glove/glove_6B_300d.txt'
# CSV written by word_vec_index() and read back by main().
vocab_path = '/Data/glove/vocab_glove.csv'

# Split data paths read by main().
# NOTE(review): training_data_split() writes 'dev.csv', not 'val.csv'.
train_data_path ='/Data/TrainingData/train.csv'
val_data_path ='/Data/TrainingData/val.csv'
test_data_path ='/Data/TrainingData/test.csv'

# Output CSVs written by main(): sentence index matrices, per-sentence lengths
# and the word-vector table.
sent_matrix_path ='/Data/inputs_model/sentence_matrix.csv'
sent_matrix_path_val ='/Data/inputs_model/sentence_matrix_val.csv'
sent_matrix_path_test ='/Data/inputs_model/sentence_matrix_test.csv'
sequence_len_path = '/Data/inputs_model/sequence_length.csv'
sequence_len_val_path = '/Data/inputs_model/sequence_length_val.csv'
sequence_len_test_path = '/Data/inputs_model/sequence_length_test.csv'
wordVectors_path = '/Data/inputs_model/wordVectors.csv'
#%%#

#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Filtered Vocabulary from Glove document >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
def filter_glove(full_glove_path, data_dir):
  """Write the subset of a GloVe file whose words occur in ``SOStr.txt``.

  ``<data_dir>/SOStr.txt`` is read as lines of '|'-separated tokens (with
  backslashes removed) to build the vocabulary. Every non-empty line of
  ``full_glove_path`` whose first space-delimited token is in that vocabulary
  is copied to ``<data_dir>/filtered_glove.txt``. Both counts are printed.

  Args:
    full_glove_path: Path of the full GloVe text file.
    data_dir: Directory holding ``SOStr.txt``; also receives the output file.
  """
  sentence_path = os.path.join(data_dir, 'SOStr.txt')
  filtered_glove_path = os.path.join(data_dir, 'filtered_glove.txt')

  # Collect every token that appears in the sentence file.
  vocab = set()
  with codecs.open(sentence_path, encoding='latin-1') as sentences:
    for raw in sentences:
      tokens = raw.strip().replace('\\', '').split('|')
      vocab.update(tokens)

  # Copy the matching embedding lines, counting lines read and written.
  nread = 0
  nwrote = 0
  with codecs.open(full_glove_path, encoding='latin-1') as source, \
       codecs.open(filtered_glove_path, 'w', encoding='latin-1') as target:
    for nread, raw in enumerate(source, start=1):
      entry = raw.strip()
      if entry and entry.split(u' ', 1)[0] in vocab:
        target.write(entry + '\n')
        nwrote += 1

  print('read %s lines, wrote %s' % (nread, nwrote))
#%%#

#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Filtered Vocabulary from live cases >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>



# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< load embeddings >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

def load_embeddings(embedding_path):
  """Load word embeddings from a GloVe-style text file.

  Each usable line has the form ``word v1 v2 ...``. Lines without a space
  separator or with non-numeric vector components are skipped.

  Args:
    embedding_path: Path of the embeddings text file (read as latin-1).

  Returns:
    ``(weight_matrix, word_idx)``: a float32 matrix with one row per loaded
    word plus one final random row intended for unknown words, and a dict
    mapping each word to its row index.

  Raises:
    ValueError: If no embedding line could be parsed.
  """
  print('loading word embeddings from %s' % embedding_path)
  weight_vectors = []
  word_idx = {}
  with codecs.open(embedding_path, encoding='latin-1') as f:
    for line in f:
      try:
        word, vec = line.split(u' ', 1)
        vector = np.array(vec.split(), dtype=np.float32)
      except ValueError:
        # Malformed line: no separator, or a component that is not a number.
        continue
      # Register the word only after its vector parsed, so a bad line can
      # never leave an index that points at another word's row.
      word_idx[word] = len(weight_vectors)
      weight_vectors.append(vector)

  print('For Loop Complete!!')
  if not weight_vectors:
    raise ValueError('no embeddings could be read from %s' % embedding_path)

  # '(' and ')' are replaced by '-LRB-' and '-RRB-' respectively in the
  # parse-trees. Only rename the entries that are actually present; a filtered
  # embeddings file need not contain them.
  for paren, tag in ((u'(', u'-LRB-'), (u')', u'-RRB-')):
    if paren in word_idx:
      word_idx[tag] = word_idx.pop(paren)

  # Random embedding vector for unknown words (last row of the matrix).
  weight_vectors.append(np.random.uniform(
      -0.05, 0.05, weight_vectors[0].shape).astype(np.float32))
  print('Word Embeddings Loaded!')
  return np.stack(weight_vectors), word_idx


# Combine and split the data into train and test
def read_data(path):
    """Read the categorical dataset CSV into a DataFrame.

    Args:
        path: Directory containing ``glove_processed_data_categorical.csv``
            (a '|'-separated file). A trailing path separator is optional.

    Returns:
        The DataFrame, with its ``polarity`` column converted to strings.
    """
    print('read_data start...')

    # os.path.join works with or without a trailing separator on ``path``;
    # plain string concatenation silently built a wrong file name without one.
    csv_path = os.path.join(path, 'glove_processed_data_categorical.csv')
    df_processed_all = pd.read_csv(csv_path, sep="|")
    df_processed_all['polarity'] = df_processed_all['polarity'].astype(str)

    print('read_data end...')

    return df_processed_all

def training_data_split(all_data, spitPercent, data_dir):
    """Randomly split a DataFrame into train, test and dev parts and save them.

    Each row goes to the training part with probability ``spitPercent``; each
    remaining row goes to the test or dev part with probability 0.5 each. The
    parts are written to ``<data_dir>/TrainingData/{train,test,dev}.csv``.

    Args:
        all_data: DataFrame to split.
        spitPercent: Probability of a row landing in the training part.
        data_dir: Directory under which ``TrainingData`` is created.

    Returns:
        ``(train_only, test_only, dev_only)`` DataFrames.
    """
    print('data_split start...')

    msk = np.random.rand(len(all_data)) < spitPercent
    train_only = all_data[msk]
    test_and_dev = all_data[~msk]

    msk_test = np.random.rand(len(test_and_dev)) < 0.5
    test_only = test_and_dev[msk_test]
    dev_only = test_and_dev[~msk_test]

    # Create the output directory first; to_csv() does not create missing
    # directories and previously failed on a fresh data_dir.
    out_dir = os.path.join(data_dir, 'TrainingData')
    os.makedirs(out_dir, exist_ok=True)

    dev_only.to_csv(os.path.join(out_dir, 'dev.csv'))
    test_only.to_csv(os.path.join(out_dir, 'test.csv'))
    train_only.to_csv(os.path.join(out_dir, 'train.csv'))

    print('data_split_end...')

    return train_only, test_only, dev_only
#%%
################################### Glove Vector  ########################################################################
def loadGloveModel(gloveFile):
    """Load a GloVe text file into a dict mapping word -> list of floats.

    Blank lines are ignored. A line whose vector components cannot be parsed
    as floats is skipped and its word is printed.

    Args:
        gloveFile: Path of the GloVe text file (read as latin-1).

    Returns:
        Dict from word to its embedding as a list of floats.
    """
    print ("Loading Glove Model")
    model = {}
    # The context manager closes the file; it was previously left open.
    with open(gloveFile, 'r', encoding='latin-1') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line: there is no word to report. The old handler
                # printed a stale ``word`` or raised NameError here.
                continue
            word = splitLine[0]
            try:
                model[word] = [float(val) for val in splitLine[1:]]
            except ValueError:
                print (word)

    print ("Done.",len(model)," words loaded!")
    return model
#%%


#%%
################################### Create Vocab subset GLove vectors ########################################################################

def word_vec_index(training_data, glove_model):
    """Build and save the GloVe vectors for the words used in the tweets.

    Args:
        training_data: DataFrame with a ``tweet`` column of strings.
        glove_model: Dict mapping word -> embedding vector.

    Returns:
        DataFrame with one column per word that occurs (lower-cased) in the
        tweets and is a key of ``glove_model``. The same frame is written to
        the module-level ``vocab_path`` as CSV.
    """
    # Join all tweets into one string and pull out the whitespace-delimited tokens.
    corpus = training_data['tweet'].str.cat(sep=' ')
    lowered_tokens = [token.lower() for token in re.findall(r'\S+', corpus)]

    # Keep only the words that have an embedding.
    known_words = glove_model.keys() & lowered_tokens
    vocab_df = pd.DataFrame({word: glove_model[word] for word in known_words})

    vocab_df.to_csv(vocab_path)
    return vocab_df
#%%
################################### Convertdf to list ########################################################################
def word_list(vocab_df):
    """Split a vocabulary DataFrame into its column labels and its vectors.

    Args:
        vocab_df: DataFrame with one column per word.

    Returns:
        ``(wordList, wordVectors_np)``: the column labels as a list, and a numpy
        array holding one row per column of ``vocab_df``.
    """
    words = list(vocab_df.columns.values)
    # Transpose so that each row of the result is one word's vector.
    vectors = np.array(vocab_df.values.T.tolist())
    return words, vectors
 #%%
################################### tensorflow data pipeline ########################################################################


def maxSeqLen(training_data):
    """Compute word-count statistics for the ``tweet`` column.

    Words are the whitespace-separated tokens of each tweet.

    Args:
        training_data: DataFrame with a ``tweet`` column of strings.

    Returns:
        ``(max_seq_len, avg_words, sequence_length_np)``: the largest word
        count, the mean word count per row, and a numpy array with the word
        count of every row. For an empty frame this is ``(0, 0.0, <empty>)``.
    """
    print('maxSeqlen start...')

    # One pass over the column; this is quick enough that no progress bar
    # (previously sized with a hard-coded 1600000 rows) is needed.
    lengths = [len(sentence.split()) for sentence in training_data['tweet']]
    sequence_length_np = np.asarray(lengths, dtype=int)

    if lengths:
        max_seq_len = max(lengths)
        # Divide by the number of rows. The old code divided by the last index
        # label, which is off by one for a default index and arbitrary (even
        # zero) once the frame has been shuffled.
        avg_words = sum(lengths) / len(lengths)
    else:
        max_seq_len = 0
        avg_words = 0.0

    print('maxseqlen end...')

    return max_seq_len, avg_words, sequence_length_np

  #%%
def tf_data_pipeline(data, word_idx, weight_matrix, max_seq_len):
    """Convert the tweets into a zero-padded matrix of word indices.

    Each tweet is split on single spaces. Words are looked up lower-cased;
    words without an index are dropped (they take no position), and words
    beyond ``max_seq_len`` positions are ignored.

    Args:
        data: DataFrame with a ``tweet`` column of strings.
        word_idx: Dict mapping word -> index; keys are lower-cased before use.
        weight_matrix: Unused; kept for backward compatibility.
        max_seq_len: Number of columns of the result.

    Returns:
        int32 array of shape ``(len(data), max_seq_len)``.
    """
    ids = np.zeros((len(data), max_seq_len), dtype='int32')
    # Convert keys in dict to lower case.
    word_idx_lwr = {k.lower(): v for k, v in word_idx.items()}

    for row_no, sentence in enumerate(data['tweet']):
        col = 0
        for word in sentence.split(' '):
            if col >= max_seq_len:
                # Row is full. The old code kept looping and relied on a
                # swallowed IndexError for every remaining word.
                break
            word_id = word_idx_lwr.get(word.lower())
            if word_id is None:
                # Unknown word: skipped without consuming a position.
                continue
            ids[row_no, col] = word_id
            col += 1
    return ids

  #%%
# create labels matrix for the rnn


def tf_data_pipeline_nltk(data, word_idx, weight_matrix, max_seq_len):
    """Convert the tweets into a zero-padded matrix of word indices.

    Each tweet is split on whitespace. Words are looked up lower-cased; words
    without an index are dropped (they take no position), and words beyond
    ``max_seq_len`` positions are ignored. A progress bar tracks the rows.

    Args:
        data: DataFrame with a ``tweet`` column of strings.
        word_idx: Dict mapping word -> index; keys are lower-cased before use.
        weight_matrix: Unused; kept for backward compatibility.
        max_seq_len: Number of columns of the result.

    Returns:
        int32 array of shape ``(len(data), max_seq_len)``.
    """
    no_rows = len(data)
    ids = np.zeros((no_rows, max_seq_len), dtype='int32')
    # Convert keys in dict to lower case.
    word_idx_lwr = {k.lower(): v for k, v in word_idx.items()}

    bar = pb.ProgressBar(maxval=no_rows,
                         widgets=[pb.Bar('=', '[', ']'), ' ', pb.Percentage()])
    bar.start()

    for row_no, sentence in enumerate(data['tweet']):
        col = 0
        for word in sentence.split():
            if col >= max_seq_len:
                # Row is full. The old code kept looping and relied on a
                # swallowed IndexError for every remaining word.
                break
            word_id = word_idx_lwr.get(word.lower())
            if word_id is None:
                # Unknown word: skipped without consuming a position.
                continue
            ids[row_no, col] = word_id
            col += 1
        bar.update(row_no + 1)

    bar.finish()

    return ids


def labels_matrix(data, num_classes=5):
    """Build the one-hot label matrix from the ``polarity`` column.

    Args:
        data: DataFrame whose ``polarity`` column holds integer class labels
            (ints or numeric strings).
        num_classes: Number of classes; columns ``0 .. num_classes - 1`` are
            produced. Defaults to 5, the value that used to be hard-coded.

    Returns:
        float32 array of shape ``(len(data), num_classes)``. A class that does
        not occur in ``data`` gets an all-zero column; a label outside
        ``0 .. num_classes - 1`` yields an all-zero row.
    """
    labels_int = data['polarity'].astype(int)

    # With prefix='' and prefix_sep='' the dummy columns are named by the label
    # text ('0', '1', ...), which is what the reindex below selects on.
    cats = [str(c) for c in range(num_classes)]
    dummies = pd.get_dummies(labels_int, prefix='', prefix_sep='')

    # Add columns for absent classes directly with zeros. The old
    # transpose/reindex/fillna chain could leave an object-dtype matrix when a
    # class was missing and the dummy columns were boolean.
    dummies = dummies.reindex(columns=cats, fill_value=0)

    return dummies.to_numpy(dtype=np.float32)


def labels_matrix_unmod(data):
    """Return the ``polarity`` column scaled by 10 as an integer array.

    Args:
        data: DataFrame whose ``polarity`` column is convertible to float.

    Returns:
        1-D integer numpy array holding ``int(polarity * 10)`` for every row
        (the fractional part is truncated).
    """
    labels_float = data['polarity'].astype(float)
    labels_mult = (labels_float * 10).astype(int)

    # Series.as_matrix() no longer exists in current pandas; to_numpy() is the
    # replacement and is already used by labels_matrix().
    return labels_mult.to_numpy()

#%%
################################### Run Steps ########################################################################
def main():
    """Legacy pipeline that precomputes vocabulary and sentence-index CSV files.

    Reads the train/validation/test CSVs named by the module-level path
    constants and writes vocabulary, word-vector, sequence-length and
    sentence-matrix CSVs to the module-level output paths.

    NOTE(review): nothing in the visible part of this file calls main(), and
    several calls below do not match the signatures of the helpers defined in
    this file (each one is flagged inline), so it raises as written. Confirm
    the intended behaviour before fixing.
    """

    # Load the Trainign data
    all_data = read_data(path)
    #%%
    training_data = pd.read_csv(train_data_path, encoding='iso-8859-1')

    # use the below to split the training, validation and test
    # NOTE(review): training_data_split() is defined with three required
    # parameters (all_data, spitPercent, data_dir); this call passes one.
    train_df = training_data_split(training_data)
    #%%

    # Load glove vector
    # NOTE(review): filter_glove() takes (full_glove_path, data_dir) and has no
    # return statement; the validation/test sections below use
    # loadGloveModel(gloveFile) instead - presumably that was intended here.
    glove_model = filter_glove(gloveFile)

    # Get glove vector subset for training vocab
    vocab_df = word_vec_index(all_data, glove_model)
    # Drop the reference to the (large) model dict.
    glove_model = None

    #Run this after the first iteration of obtaining the vocab df instead of above 2 steps
    # NOTE(review): word_vec_index() saved this CSV with its index column and it
    # is read back without index_col - verify the resulting columns.
    vocab_df = pd.read_csv(vocab_path, encoding='iso-8859-1')

    #Get Wordlist and word vec lists from the df for the training Vocab
    wordList, wordVectors = word_list(vocab_df)
    wordVectors_df = pd.DataFrame(wordVectors)
    wordVectors_df.to_csv(wordVectors_path)

    # get the index of the word vec for each sentences to be input to the tf algo
    max_seq_len, avg_len, sequence_length = maxSeqLen(training_data)
    sequence_length_df = pd.DataFrame(sequence_length)
    sequence_length_df.to_csv(sequence_len_path)

    # training data input matrix
    # NOTE(review): tf_data_pipeline() calls word_idx.items(), but wordList is
    # the list returned by word_list(); the same applies to the two calls below.
    sentence_matrix = tf_data_pipeline(training_data, wordList, wordVectors, max_seq_len)

    # export the sentence matrix to a csv file for easy load for next iterations
    sentence_matrix_df = pd.DataFrame(sentence_matrix)
    sentence_matrix_df.to_csv(sent_matrix_path)

    #################################################################### validation data set ############################################################
    # load validation data
    val_data = pd.read_csv(val_data_path, encoding='iso-8859-1')

    # load glove model and generat vocab for validation data
    glove_model = loadGloveModel(gloveFile)
    vocab_df_val = word_vec_index(val_data, glove_model)
    glove_model = None
    wordList_val, wordVectors_val = word_list(vocab_df_val)

    # get max length for val data
    max_seq_len_val, avg_len_val, sequence_length_val = maxSeqLen(val_data)
    sequence_length_val_df = pd.DataFrame(sequence_length_val)
    sequence_length_val_df.to_csv(sequence_len_val_path)

    # get the id matrix for val data
    # NOTE(review): pads to the training max_seq_len, whereas the test section
    # below pads to max_seq_len_test - confirm which width is intended.
    sentence_matrix_val = tf_data_pipeline(val_data, wordList_val, wordVectors_val, max_seq_len)

    # write the val dat to csv
    sentence_matrix_df_val = pd.DataFrame(sentence_matrix_val)
    sentence_matrix_df_val.to_csv(sent_matrix_path_val)

    #################################################################### Test data set ############################################################
    # load test data
    test_data = pd.read_csv(test_data_path, encoding='iso-8859-1')

    # load glove model and generat vocab for test data
    glove_model = loadGloveModel(gloveFile)
    # NOTE(review): builds the test vocabulary from val_data; this looks like a
    # copy-paste slip for test_data - confirm.
    vocab_df_test = word_vec_index(val_data, glove_model)
    glove_model = None
    wordList_test, wordVectors_test = word_list(vocab_df_test)

    # get max length for test data
    max_seq_len_test, avg_len_test, sequence_length_test = maxSeqLen(test_data)
    sequence_length_test_df = pd.DataFrame(sequence_length_test)
    sequence_length_test_df.to_csv(sequence_len_test_path)

    # get the id matrix for test data
    sentence_matrix_test = tf_data_pipeline(test_data, wordList_test, wordVectors_test, max_seq_len_test)

    # write the test dat to csv
    sentence_matrix_df_test= pd.DataFrame(sentence_matrix_test)
    sentence_matrix_df_test.to_csv(sent_matrix_path_test)
