In [1]:
from os import listdir
from os.path import isfile, join
import pickle
import re
import random
import numpy as np
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.preprocessing.text import Tokenizer
from keras.layers.core import Dense
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.callbacks import TensorBoard

Using TensorFlow backend.


#### Helper functions to pickle and load objects

In [2]:
def __pickleStuff(filename, stuff):
    """Serialize ``stuff`` to ``filename`` using pickle.

    Args:
        filename: Path of the file to create or overwrite (opened in binary mode).
        stuff: Any picklable Python object.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, "wb") as save_stuff:
        pickle.dump(stuff, save_stuff)
def __loadStuff(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of a file previously written with pickle.

    Returns:
        The unpickled Python object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # files produced by this notebook, never on untrusted input.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, "rb") as saved_stuff:
        return pickle.load(saved_stuff)

# Load the data
**phrases_embed.csv** came from [Babylon blog "How the chatbot understands sentences"](https://blog.babylonhealth.com/how-the-chatbot-understands-sentences-fe6c5deb6e81).

Check out the data visualization [here](http://s3-eu-west-1.amazonaws.com/nils-demo/phrases.html).

In [3]:
import pandas as pd
# Read the phrase/label table and keep only the text column ("Disease")
# and its label column ("class"), in that order.
df = pd.read_csv("./data/phrases_embed.csv").loc[:, ["Disease", "class"]]
df.head(3)

Unnamed: 0,Disease,class
0,stomach ache,abdomen
1,I am having stomach pains with diahorrea and t...,abdomen
2,stomach pain,abdomen


### Convert those two columns to paired NumPy arrays
"Disease" column ==> documents

"class" column ==> body_positions

In [4]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# Building the arrays with np.array works on every pandas version, always
# yields a 1-D object ndarray, and always copies, so the in-place label
# clean-up performed on `body_positions` cannot write back into `df`.
documents = np.array(df[df.columns[0]], dtype=object)
print("documents.shape: {}".format(documents.shape))
body_positions = np.array(df[df.columns[1]], dtype=object)
print("body_positions.shape: {}".format(body_positions.shape))

documents.shape: (1261,)
body_positions.shape: (1261,)


## Clean up the data
Function to clean up data

In [5]:
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
def cleanUpSentence(r, stop_words = None):
    """Normalise a raw sentence for tokenisation.

    The text is lower-cased, every "<br />" is replaced by a space, and
    every run of characters other than ASCII letters, digits and spaces
    is deleted.

    Args:
        r: The raw sentence.
        stop_words: Optional collection of words to drop. When given, the
            cleaned text is split with ``word_tokenize`` and the surviving
            tokens are re-joined with single spaces.

    Returns:
        The cleaned sentence as a string.
    """
    lowered = r.lower().replace("<br />", " ")
    cleaned = strip_special_chars.sub("", lowered)
    if stop_words is None:
        return cleaned
    return " ".join(tok for tok in word_tokenize(cleaned) if tok not in stop_words)

### Here we are cleaning up the data

In [6]:
# Clean every document (dropping English stop words) and normalise its label.
# Note: the labels in `body_positions` are overwritten in place with their
# cleaned form, and the cleaned value is what ends up in totalY.
stop_words = set(stopwords.words("english"))
totalX, totalY = [], []
for i, doc in enumerate(documents):
    totalX.append(cleanUpSentence(doc, stop_words))
    body_positions[i] = strip_special_chars.sub("", body_positions[i].lower())
    totalY.append(body_positions[i])

#### Show input max sequence length
If the max input sequence length is too long, we can put a limit to it in order to reduce the training time.

In [7]:
# Token count of every cleaned document, as measured by word_tokenize.
xLengths = list(map(lambda x: len(word_tokenize(x)), totalX))
h = sorted(xLengths)  # ascending, so the longest document is last
maxLength = h[-1]
print("max input length is: ",maxLength)

max input length is:  18


## Convert input words to ids
**max_vocab_size**: the maximum number of words to keep. We choose 30000 because it is large enough to keep every word in this dataset.

Pad each input sequence to max input length **maxLength** if it is shorter.

In [8]:
max_vocab_size = 30000
# Fit the word -> id mapping on the cleaned documents.
input_tokenizer = Tokenizer(num_words=max_vocab_size)
input_tokenizer.fit_on_texts(totalX)
# +1 because id 0 is used for padding (see the padded sample shown below).
input_vocab_size = 1 + len(input_tokenizer.word_index)
print("input_vocab_size:",input_vocab_size)
# Replace the text documents with id sequences padded to maxLength.
encoded_docs = input_tokenizer.texts_to_sequences(totalX)
totalX = np.array(pad_sequences(encoded_docs, maxlen=maxLength))

input_vocab_size: 910


Take a look at one sequence in **totalX**. The sequence has length 18, and each number represents a unique word; "0" is padding.

In [9]:
# Inspect the padded id sequence of the sample at index 1.
totalX[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        41,  40, 465, 206,  29])

### Save the input tokenizer
We need to use the same tokenizer for prediction.

In [10]:
# Persist the fitted input tokenizer; the prediction code reloads it from this path.
__pickleStuff("./data/input_tokenizer.p",input_tokenizer)

## Convert output words to ids

In [11]:
# Peek at the first three cleaned text labels.
totalY[0:3]

['abdomen', 'abdomen', 'abdomen']

In [13]:
# Map each label word to an integer id with a second tokenizer.
target_tokenizer = Tokenizer(num_words=30)
target_tokenizer.fit_on_texts(totalY)
target_vocab_size = 1 + len(target_tokenizer.word_index)
# Shift the tokenizer ids down by one, then flatten the per-label
# one-element sequences into a 1-D array.
totalY = np.array(target_tokenizer.texts_to_sequences(totalY))
totalY = np.subtract(totalY, 1).reshape(totalY.shape[0])

In [14]:
# Report the target vocabulary size used as the one-hot width.
print("target_vocab_size:",target_vocab_size)

target_vocab_size: 19


In [76]:
# Peek at the first three integer label ids.
totalY[0:3]

array([1, 1, 1])

## Turn output to categories(one-hot vectors)

In [77]:
# Each row becomes a vector with target_vocab_size entries.
totalY = to_categorical(totalY, num_classes=target_vocab_size) # turn output to one-hot vectors

In [78]:
# Peek at the first three one-hot label rows.
totalY[0:3]

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.]])

In [80]:
vocab_size = input_vocab_size # vocab_size for the model's word embedding input
output_dimen = totalY.shape[1] # width of the one-hot label matrix, i.e. number of output units

**target_reverse_word_index** to turn class ids to text

In [84]:
# Invert the tokenizer's word -> id mapping so ids can be turned back into labels.
target_reverse_word_index = {
    class_id: label
    for label, class_id in target_tokenizer.word_index.items()
}
target_reverse_word_index[2]

'abdomen'

## Save meta data for later prediction
maxLength: the input sequence length

vocab_size: Input vocab size

output_dimen: number of unique output classes

target_reverse_word_index: turn predicted class ids to text

In [87]:
# Bundle everything the prediction code needs besides the weights and tokenizer.
metaData = dict(
    maxLength=maxLength,
    vocab_size=vocab_size,
    output_dimen=output_dimen,
    target_reverse_word_index=target_reverse_word_index,
)
__pickleStuff("./data/metaData_triage.p", metaData)

## Build the Model, train and save it
The training progress is logged to TensorBoard; we can look at it by changing into the directory

"./Graph/medical_triage" and run


"python -m tensorflow.tensorboard --logdir=."

In [90]:
# Build, train and save the classifier: Embedding -> GRU -> GRU -> softmax Dense.
# Size of each learned word vector.
embedding_dim = 256
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim,input_length = maxLength))
# Each input has shape (maxLength x 256); the 256-sized vectors are fed into the GRU layer one step at a time.
# return_sequences=True keeps every intermediate output so the second GRU layer receives the whole sequence.
# NOTE(review): dropout=0.9 is unusually high for a recurrent layer - confirm this value is intentional.
model.add(GRU(256, dropout=0.9, return_sequences=True))
# The second GRU layer consumes the intermediate outputs and returns only its final output.
model.add(GRU(256, dropout=0.9))
# The final output goes to a fully connected softmax layer with output_dimen units.
model.add(Dense(output_dimen, activation='softmax'))
# Log training to ./Graph/medical_triage for TensorBoard.
tbCallBack = TensorBoard(log_dir='./Graph/medical_triage', histogram_freq=0,
                            write_graph=True, write_images=True)
# We use the adam optimizer instead of standard SGD since it converges much faster
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# 10% of the samples are held out for validation.
# NOTE(review): presumably Keras takes the validation split from the end of the data without shuffling;
# if the CSV is ordered by class the validation set may be unrepresentative - confirm.
model.fit(totalX, totalY, validation_split=0.1, batch_size=32, epochs=40, verbose=1, callbacks=[tbCallBack])
# The prediction code later restores weights from this file via load_weights.
model.save('./data/triage.HDF5')

print("Saved model!")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 18, 256)           232960    
_________________________________________________________________
gru_5 (GRU)                  (None, 18, 256)           393984    
_________________________________________________________________
gru_6 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dense_3 (Dense)              (None, 19)                4883      
Total params: 1,025,811
Trainable params: 1,025,811
Non-trainable params: 0
_________________________________________________________________
Train on 1134 samples, validate on 127 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/4

### Prediction code below
Function to load the meta data and the model we just trained.

In [99]:
# Module-level state filled in by loadModel() and read by the prediction helpers.
model = None
target_reverse_word_index = None
maxLength = 0
def loadModel():
    """Restore the saved metadata and, on first use, the trained network.

    Every call re-reads "./data/metaData_triage.p" and refreshes the
    ``maxLength`` and ``target_reverse_word_index`` globals. The network is
    only rebuilt (same architecture as in training) and its weights loaded
    from './data/triage.HDF5' while the ``model`` global is still None.
    """
    global model, target_reverse_word_index, maxLength
    meta = __loadStuff("./data/metaData_triage.p")
    maxLength = meta.get("maxLength")
    target_reverse_word_index = meta.get("target_reverse_word_index")
    if model is not None:
        # Network already built in this session; nothing more to do.
        print("Model weights loaded!")
        return
    vocab_size = meta.get("vocab_size")
    output_dimen = meta.get("output_dimen")
    embedding_dim = 256
    model = Sequential()
    # Word ids -> dense vectors of size embedding_dim.
    model.add(Embedding(vocab_size, embedding_dim, input_length=maxLength))
    # First recurrent layer hands its full output sequence to the next one.
    model.add(GRU(256, dropout=0.9, return_sequences=True))
    # Second recurrent layer keeps only its final output.
    model.add(GRU(256, dropout=0.9))
    # Softmax over output_dimen units.
    model.add(Dense(output_dimen, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.load_weights('./data/triage.HDF5')
    model.summary()
    print("Model weights loaded!")

## Functions to convert input sentence to model input, and predict result

In [100]:
def findFeatures(text):
    """Turn one sentence into the padded id array the model expects.

    The pickled input tokenizer is reloaded from "./data/input_tokenizer.p"
    on every call, and the resulting sequence is padded to the global
    ``maxLength``.

    Args:
        text: The sentence to encode.

    Returns:
        A NumPy array holding the single encoded, padded sentence.
    """
    tokenizer = __loadStuff("./data/input_tokenizer.p")
    sequences = tokenizer.texts_to_sequences([text])
    return np.array(pad_sequences(sequences, maxlen=maxLength))
def predictResult(text):
    """Classify one sentence with the loaded model.

    Args:
        text: The sentence to classify.

    Returns:
        A ``(label, probability)`` tuple for the highest-scoring class, or
        None (after printing a hint) when the model has not been loaded.
    """
    if model is None:
        print("Please run \"loadModel\" first.")
        return None
    scores = np.array(model.predict(findFeatures(text))[0])
    # Stored label ids are one higher than the model's output index.
    best_index = scores.argmax()
    return target_reverse_word_index[best_index + 1], scores.max()

In [101]:
# Restore the metadata and trained weights into the module-level globals.
loadModel()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 18, 256)           232960    
_________________________________________________________________
gru_7 (GRU)                  (None, 18, 256)           393984    
_________________________________________________________________
gru_8 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dense_4 (Dense)              (None, 19)                4883      
Total params: 1,025,811
Trainable params: 1,025,811
Non-trainable params: 0
_________________________________________________________________
Model weights loaded!


In [111]:
# Example query; the result is a (predicted label, probability) pair.
predictResult("Skin is quite itchy.")

('dermatology', 0.99926859)

In [112]:
# Example query listing several symptoms in one sentence.
predictResult("Sore throat fever fatigue.")

('mouthface', 0.99999321)

In [113]:
# Example query that names the body area explicitly.
predictResult("Lower back hurt, so painful.")

('back', 0.99995661)

In [114]:
# Example query that does not name a body area directly.
predictResult("Very painful with period.")

('sexualhealth', 0.99996245)

In [115]:
# Example query using an adjective form ("abdominal") of a label word.
predictResult("Sudden abdominal pain.")

('abdomen', 0.99999595)