In [33]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

In [34]:
#load clean dataset
def load_dataset(filename):
    """Load and return the pickled object stored in ``filename``.

    Args:
        filename: Path to a pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file handle even if unpickling fails.
    with open(filename,'rb') as handle:
        return load(handle)

In [35]:
#fit a tokenizer
def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` that has been fitted on ``lines``.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


In [36]:
#calculate the maximum document length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The token count of the longest line, or 0 when ``lines`` is empty.
    """
    # A generator avoids building a temporary list; ``default`` keeps an
    # empty corpus from raising ValueError.
    return max((len(s.split()) for s in lines), default=0)

In [37]:
#encode a list of lines
def encode_text(tokenizer,lines,length):
    """Integer-encode ``lines`` and pad each sequence to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences`` (a fitted tokenizer).
        lines: The texts to encode.
        length: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    # Map every text to its sequence of integer ids, then pad at the end.
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')



In [38]:
#define the model
def _build_channel(length, vocab_size, kernel_size):
    """Build one embedding -> conv -> dropout -> max-pool -> flatten branch.

    Returns:
        A ``(input_tensor, flattened_output)`` pair for the branch.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    return inputs, Flatten()(pool)


def define_model(length, vocab_size, kernel_sizes=(4, 6, 8)):
    """Define, compile, summarize and plot the multichannel CNN.

    One branch is created per entry of ``kernel_sizes``; the flattened branch
    outputs are concatenated and fed to a small dense head with a single
    sigmoid output.

    Args:
        length: Length of each (padded) input sequence.
        vocab_size: Input dimension of each embedding layer.
        kernel_sizes: Convolution kernel size for each channel. The default
            gives three channels with distinct widths. Previously the third
            channel reused the second channel's kernel size of 6.

    Returns:
        The compiled Keras ``Model``; it takes one input per channel.
    """
    channel_inputs = []
    channel_outputs = []
    for kernel_size in kernel_sizes:
        branch_input, branch_output = _build_channel(length, vocab_size, kernel_size)
        channel_inputs.append(branch_input)
        channel_outputs.append(branch_output)

    # merge (concatenate needs at least two tensors, so a single channel is
    # passed through unchanged)
    if len(channel_outputs) > 1:
        merged = concatenate(channel_outputs)
    else:
        merged = channel_outputs[0]

    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=channel_inputs, outputs=outputs)

    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # summarize: print the layer table and write the architecture diagram
    model.summary()
    plot_model(model, show_shapes=True, to_file='model.png')
    return model
    

In [39]:
# Read the training documents and their labels from the pickle file.
trainLines, trainLabels = load_dataset('train.pkl')

# Fit the tokenizer on the training documents.
tokenizer = create_tokenizer(trainLines)

# The longest document (in tokens) sets the padded sequence length.
length = max_length(trainLines)
print('Max doc length:%d' % length)

# One more than the number of distinct words in the tokenizer's word index.
vocab_size = len(tokenizer.word_index) + 1
print('voc size:%d' % vocab_size)

Max doc length:1438
voc size:73604


In [40]:
# Encode the training documents as padded integer sequences.
trainX = encode_text(tokenizer, trainLines, length)

# Build and compile the network.
model = define_model(length, vocab_size)

# The model has three inputs; the same encoded data is supplied to each.
model.fit(
    [trainX, trainX, trainX],
    array(trainLabels),
    epochs=7,
    batch_size=16,
)

# Write the trained model to disk.
model.save('model.h5')

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 1438)]       0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 1438)]       0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           [(None, 1438)]       0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 1438, 100)    7360400     input_11[0][0]                   
_______________________________________________________________________________________