In [15]:
import os
import numpy as np
import time
from typing import Tuple, List, Dict
import tensorflow as tf

from createdataset import *

In [16]:
def _load_split(split, subset, padding, word_to_index=None):
    '''Build a single dataset split and print its name, array shapes and info dict.

    args: split: 'training', 'dev' or 'testing'; subset: dataset subset name;
          padding: padding size; word_to_index: vocabulary mapping forwarded
          to CreateDataset (None for the training split)
    returns: X, y, info, word_to_index exactly as produced by DateGen()
    '''
    print(split)
    label_file, input_file = ChooseDataset(split, subset)
    builder = CreateDataset(label_file, input_file, padding, split, word_to_index)
    X, y, info, vocab = builder.DateGen()
    print("X shape: {}\ny shape: {}".format(X.shape, y.shape))
    print(info)
    return X, y, info, vocab


def data(subset='pku',padding=50):
    '''function that creates a dataset -- training, dev, and test
    args: subset: any subset, padding: padding size
    returns: (X_train, y_train), (X_dev, y_dev), (X_test, y_test), info_dev

    The word-to-index mapping produced by the training split is passed on to
    the dev and testing splits (presumably so all three share one vocabulary
    -- confirm against CreateDataset).
    '''
    X_train, y_train, _, word_to_index_training = _load_split('training', subset, padding)
    X_dev, y_dev, info_dev, _ = _load_split('dev', subset, padding, word_to_index_training)
    # The helper prints each split's own info dict; previously the testing
    # split printed info_dev by mistake.
    X_test, y_test, _, _ = _load_split('testing', subset, padding, word_to_index_training)

    return (X_train, y_train), (X_dev, y_dev), (X_test, y_test), info_dev


# Build the three 'pku' splits with a padding size of 10; info_dev is the
# info dict of the dev split and feeds the model constants below.
(
    (X_train, y_train),
    (X_dev, y_dev),
    (X_test, y_test),
    info_dev,
) = data(padding=10, subset='pku')

training
X shape: (19056, 10)
y shape: (19056, 10, 4)
{'MAXLEN': 10, 'VocabSize': 285201}
dev
X shape: (1945, 10)
y shape: (1945, 10, 4)
{'MAXLEN': 10, 'VocabSize': 285201}
testing
X shape: (1945, 10)
y shape: (1945, 10, 4)
{'MAXLEN': 10, 'VocabSize': 285201}


## Model parameters

In [17]:
# Define some constants.
# Sizes read from the info dict returned by data().
VOCAB_SIZE = info_dev['VocabSize']
PADDING_SIZE = info_dev['MAXLEN']
# Layer widths handed to create_keras_model.
EMBEDDING_SIZE = 32
HIDDEN_SIZE = 256
# Training schedule handed to model.fit.
epochs = 10
batch_size = 128

In [18]:
# Timestamp that tags this run's log directory, CSV log and saved model files.
# Uses '-' rather than ':' between hour/minute/second because colons are not
# valid in file names on Windows.
model_name = time.strftime('%Y-%m-%d_%H-%M-%S_%z')
def create_keras_model(vocab_size, embedding_size, hidden_size, PADDING_SIZE):
    '''Build and compile a bidirectional-LSTM sequence tagger.

    Layers, in order: an Embedding (mask_zero=True, fixed input length),
    a Bidirectional LSTM that returns the full sequence, and a
    TimeDistributed Dense layer with 4 softmax outputs per time step.

    args: vocab_size: input dimension of the embedding,
          embedding_size: output dimension of the embedding,
          hidden_size: units of each LSTM direction,
          PADDING_SIZE: length of the (padded) input sequences
    returns: the compiled model (categorical cross-entropy, rmsprop, 'acc' metric)
    '''
    print("Creating KERAS model")
    # Use the explicitly imported tensorflow module instead of a `K` alias
    # that is never imported in this notebook.
    layers = tf.keras.layers
    model = tf.keras.models.Sequential()
    model.add(layers.Embedding(vocab_size, embedding_size, mask_zero=True, input_length = PADDING_SIZE))
    model.add(layers.Bidirectional(
              layers.LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
    model.add(layers.TimeDistributed(
              layers.Dense(4, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

    return model

# Instantiate the tagger from the constants defined above and show its layers.
model = create_keras_model(
    vocab_size=VOCAB_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    PADDING_SIZE=PADDING_SIZE,
)
model.summary()

Creating KERAS model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 10, 32)            9126432   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 512)           591872    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 10, 4)             2052      
Total params: 9,720,356
Trainable params: 9,720,356
Non-trainable params: 0
_________________________________________________________________


In [19]:
# TensorBoard callback writing to a directory named after this run.
cbk = K.callbacks.TensorBoard(log_dir='../resources/logging/keras_model_' + model_name)
print("\nStarting training...")
# NOTE: this EarlyStopping instance is not bound to any name; being the last
# expression of the cell, it is only displayed as the cell output.
K.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto',
)


Starting training...


<tensorflow.python.keras.callbacks.EarlyStopping at 0x1351a4b00>

In [20]:
# Per-epoch metrics are appended to a CSV log named after this run.
csv_logger = tf.keras.callbacks.CSVLogger('../resources/logging/keras_model_'+model_name+'.log')
# Stop once val_loss has not improved for 2 consecutive epochs. The callback
# has to be passed to fit() to have any effect, so it is created here and
# included in the callbacks list.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  min_delta=0,
                                                  patience=2,
                                                  verbose=0, mode='auto')
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
          shuffle=True, validation_data=(X_dev, y_dev),
          callbacks=[cbk, csv_logger, early_stopping])
print("Training complete.\n")

Train on 19056 samples, validate on 1945 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


KeyboardInterrupt: 

In [13]:
# Directory that receives the saved artefacts of this run.
rel_path = '../resources/models'
# makedirs also creates a missing '../resources' parent and, with exist_ok,
# does not fail when the directory is already there.
os.makedirs(rel_path, exist_ok=True)
weights = os.path.join(rel_path,'model_weights_'+model_name+'.h5')
model_name_save = os.path.join(rel_path,'model_'+model_name+'.h5')
model.save_weights(weights) #saving weights for further analysis
model.save(model_name_save)  # full model in a separate .h5 file

In [14]:
print("\nEvaluating test...")
# verbose=0 requests a silent evaluation explicitly (3 is not a documented value).
loss_acc = model.evaluate(X_test, y_test, verbose=0)
# evaluate() returns the loss followed by the compiled metrics (here only 'acc').
test_loss, test_acc = loss_acc
print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (test_loss, test_acc*100))


Evaluating test...
Test data: loss = 4.450741  accuracy = 6.05% 
