# NLP Model_Text Multi-class Classification

## Part 2: Build the NLP Model with Pre-trained Glove word embeddings and LSTM

### Import Library

In [1]:
# Keras
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Bidirectional,Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

# Other
import io
import re
import string
import numpy as np
import pandas as pd

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 1. Glove word embeddings 

### Load pre-trained Glove word embeddings

In [506]:
# Location of the pre-trained GloVe vectors file (relative to the notebook).
glove_6B_100d_file_path_name = "glove.6B/glove.6B.100d.txt"

# Maps each word in the file to its float32 coefficient vector.
embeddings_index = dict()

# io.open with an explicit UTF-8 encoding reads the file the same way on every
# platform (the default encoding on Windows cannot decode it), and the `with`
# block guarantees the handle is closed even if a line fails to parse.
with io.open(glove_6B_100d_file_path_name, "r", encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <coef_1> <coef_2> ...", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


In [20]:
## windows system version: need to use io.open
# embeddings_index = dict()

# with io.open(glove_6B_100d_file_path_name, "r", encoding="utf-8") as f:

#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype='float32')
#         embeddings_index[word] = coefs

### List all glove words
When fitting the tokenizer, use all of the GloVe words.

In [670]:
# Collect every word that has a GloVe vector into a plain list
# (iterating a dict yields its keys).
all_glove_words = list(embeddings_index)

### Tokenizer

The tokenizer comes from Keras.
After fitting it on all GloVe words, the tokenizer vocabulary contains about 400k words.

In [671]:
# Fit a Keras tokenizer on the GloVe word list; each word is passed as its own
# "text". No num_words cap is applied (num_words=vocabulary_size was left off).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_glove_words)

# Number of GloVe words; later used as the row count of the embedding matrix
# and as the input dimension of the Embedding layer.
vocabulary_size = len(all_glove_words)

### Pickle the tokenizer
Use the pickled tokenizer when deploying to the cloud web service.

In [None]:
# import pickle

# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Create Embedding Matrix

In [673]:
# Build the weight matrix for the Embedding layer: row `index` holds the GloVe
# vector of the word the tokenizer maps to `index`. Rows for which no vector is
# found stay all-zero. The width 100 matches the 100d GloVe file loaded above.
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    # Skip indices that do not fit in the matrix instead of aborting the loop:
    # the iteration order of word_index is not guaranteed to be ascending, so
    # stopping at the first out-of-range index could drop valid words.
    if index >= vocabulary_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        

In [678]:
# tokenizer.word_index.items()

In [679]:
# Sanity check: the matrix was created with shape (vocabulary_size, 100).
print(embedding_matrix.shape)
# embedding_matrix[78]

(400001, 100)


## 2. NLP Model for Multi-classification
Categorize the given text

### Load training data

In [None]:
# Read the training set into a DataFrame; later cells use its 'label' and
# 'phrase' columns.
path_filename_for_training_data = "train_data.csv"
data_df = pd.read_csv(filepath_or_buffer=path_filename_for_training_data)

### Define the class labels and convert them to categorical (one-hot) values using the Keras utility function

In [None]:
# Number of target categories.
num_classes = 17

# Take the label column and expand it into one indicator column per class.
labels = data_df['label']
labels_cat = keras.utils.to_categorical(np.array(labels), num_classes=num_classes)

### Tokenize the text from training set

In [None]:
# Every phrase is converted to a sequence of tokenizer indices and then
# padded/truncated to exactly `maxlen` entries.
maxlen = 10
sequences = tokenizer.texts_to_sequences(data_df['phrase'])
data = pad_sequences(sequences=sequences, maxlen=maxlen)

### Building Model

In [None]:
# Sequential model: frozen GloVe embedding -> LSTM -> softmax classifier.
model_glove = Sequential()
# The embedding weights come from embedding_matrix and are not trained.
model_glove.add(Embedding(vocabulary_size, output_dim=100, input_length=maxlen, weights=[embedding_matrix], trainable=False))
#model_glove.add(Dropout(0.2))
#model_glove.add(Conv1D(64, 2, activation='relu')) 
model_glove.add(LSTM(64))
# Use num_classes (not a repeated literal) so the output width always matches
# the width of the one-hot labels built with the same constant.
model_glove.add(Dense(num_classes, activation='softmax'))

model_glove.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Train for 2 epochs, holding out 30% of the samples for validation.
model_glove.fit(x=data, y=labels_cat, epochs=2, validation_split=0.3)

### Pickle the model
Use the pickled model when deploying to the cloud web service.

In [None]:
# import pickle

# pickle.dump(model_glove, open("model_600.pkl","wb"))
# print("Successfully pickled.")

### Prediction
Before the test text is fed into the model, several preprocessing steps are needed.

In [None]:
# Sample input text used to demonstrate the prediction steps.
text = "To receive retirement income of $100,000 per annum, and have your money last until life expectancy."

In [None]:
# Convert to lowercase
text = text.lower()
# Remove special characters.
# NOTE(review): newlines and tabs are deleted rather than replaced by a space,
# so words separated only by them are joined - confirm this matches how the
# training phrases were prepared.
text = re.sub(r'[?|!|\'|"|#|,|)|(|\|/$%\n\t.:;""‘’]',r'',text)

# Split on any run of whitespace so repeated spaces do not produce empty
# "words" that would take up slots in the phrase windows.
word_list = text.split()
len_text = len(word_list)

# Combine the words into short phrases of 6~10 words (actually 6, 8, 10).
minlen = 6
maxlen = 10

# sorted() gives the window sizes in ascending order; plain set iteration order
# is an implementation detail and would otherwise decide which windows are
# produced before the loop stops.
len_each_phrase = sorted({minlen, (maxlen + minlen) // 2, maxlen})

phrases = []
i = 0
reached_end = False
while not reached_end:
    for nword in len_each_phrase:
        if i + nword >= len_text:
            # The window reaches the end of the text: keep the remaining words
            # as the final phrase and stop generating.
            phrases.append(' '.join(word_list[i:]))
            reached_end = True
            break
        phrases.append(' '.join(word_list[i:i + nword]))
    i += 1

# Output is a list of phrases:
# all windows of the sizes above cut from the text, moving the start one word
# at a time until a window reaches the end of the text.

### Tokenize the text (list of phrases)

In [None]:
# Map each phrase to tokenizer indices, then pad/truncate to `maxlen` entries.
sequences = tokenizer.texts_to_sequences(phrases)
test_data = pad_sequences(sequences=sequences, maxlen=maxlen)

### Predict with model

In [None]:
# Run the trained model on the padded phrase sequences.
preds = model_glove.predict(x=test_data)

### Convert the predicted probability to category

In [None]:
# A phrase counts towards a class only when its predicted value for that class
# is above this threshold.
prob = 0.5

preds_df = pd.DataFrame(preds)

# For each class (column), count the phrases (rows) whose value is higher than
# the threshold 'prob'; values at or below it are masked out before counting.
count = preds_df.where(preds_df > prob).count()

# When any class other than the first has at least one phrase above the
# threshold, pick the class with the highest count; otherwise fall back to 0.
any_nonzero_class = (count[1:] > 0).any()
result = count.idxmax() if any_nonzero_class else 0

# result is the category
result

### (optional) Define a function to predict the class
Wrapping the steps in a function makes it easy to explore test results.

In [None]:
def _split_into_phrases(word_list, window_sizes):
    """Cut word_list into overlapping phrases using the given window sizes.

    Starting at each word position in turn, one phrase per window size is
    emitted. As soon as a window would reach the end of word_list, the
    remaining words are emitted as the final phrase and generation stops.
    An empty word_list yields a single empty phrase.
    """
    phrases = []
    n_words = len(word_list)
    start = 0
    while True:
        for nword in window_sizes:
            if start + nword >= n_words:
                phrases.append(' '.join(word_list[start:]))
                return phrases
            phrases.append(' '.join(word_list[start:start + nword]))
        start += 1


def test(text, prob=0.5):
    """Predict the category of a free-form text with the trained model.

    The text is lower-cased, stripped of special characters, cut into short
    overlapping phrases (6, 8 and 10 words), tokenized with the module-level
    `tokenizer` and scored with the module-level `model_glove`.

    Parameters
    ----------
    text : str
        Raw input text.
    prob : float, optional
        Threshold a prediction value must exceed for a phrase to count
        towards a class. Defaults to 0.5.

    Returns
    -------
    tuple
        (result, preds_df, count) where `preds_df` is the model output wrapped
        in a DataFrame, `count` is the per-column number of values above
        `prob`, and `result` is the chosen category (0 when no column other
        than the first has a value above `prob`).
    """
    minlen = 6
    maxlen = 10

    text = text.lower()     # Converting to lowercase
    # Remove special characters.
    # NOTE(review): newlines and tabs are deleted rather than replaced by a
    # space, so words separated only by them are joined - confirm this matches
    # how the training phrases were prepared.
    text = re.sub(r'[?|!|\'|"|#|,|)|(|\|/$%\n\t.:;""‘’]',r'',text)

    # Split on any run of whitespace so repeated spaces do not produce empty
    # "words" that would take up slots in the phrase windows.
    word_list = text.split()

    # sorted() gives ascending window sizes; plain set iteration order is an
    # implementation detail and would otherwise decide which windows are
    # produced before generation stops.
    len_each_phrase = sorted({minlen, (maxlen + minlen) // 2, maxlen})
    phrases = _split_into_phrases(word_list, len_each_phrase)

    sequences = tokenizer.texts_to_sequences(phrases)
    test_data = pad_sequences(sequences, maxlen = maxlen)

    # predict with model
    preds = model_glove.predict(test_data)

    preds_df = pd.DataFrame(preds)

    # Per column: number of rows whose value is above the threshold.
    count = preds_df[preds_df > prob].count()

    # Pick the column with the highest count when any column other than the
    # first has at least one value above the threshold; otherwise return 0.
    if sum(count[1:] > 0) > 0:
        result = count.idxmax()
    else:
        result = 0

    return result,preds_df, count

In [None]:
result,preds_df, count = test(text)