In [1]:
import os
import time
import json
import pickle
import timeit
import random
import param
import shutil
import collections
import numpy as np
import tensorflow as tf

import stctrain
import datahelper
import stctokenizer
import nuggetdetection as ND
import dialogquality as DQ
import dialogquality_ndfeature as DQNDF
import stcevaluation as STCE

from scipy import stats
from collections import Counter
from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors

import logging
logging.basicConfig(level=logging.ERROR)

Using TensorFlow backend.


In [2]:
# Sizes read from the shared `param` module.
doclen = param.doclen
embsize = param.embsize
max_sent = param.max_sent
NDclasses = param.NDclasses
DQclasses = param.DQclasses
sentembsize = param.sentembsize

# Preprocessing options forwarded to the data helper by later cells.
REMOVE_STOPWORDS = False
TO_LOWER = True
TOKEN_TYPE = 'nltk'
EMB = 'stc' # glove or stc

# `datahelper` and `stctokenizer` are rebound below from module to instance
# (later cells call methods on these names). Importing the classes with
# `from <module> import <Class>` resolves the module by its import name, not
# through the possibly already-rebound global, so this cell can be re-run
# without raising AttributeError.
from datahelper import DataHelper
from stctokenizer import STCTokenizer

datahelper = DataHelper(embedding_path="../embedding/STCWiki/STCWiki_mincount0.model.bin")
stctokenizer = STCTokenizer()

## Word Embedding

In [3]:
# corpus = datahelper.prepare_word_embedding_corpus(
#     '../data/text8', 
#     TOKEN_TYPE, 
#     REMOVE_STOPWORDS, 
#     TO_LOWER,
# )

# wordemb_model = Word2Vec(corpus, size=100, min_count=0, workers=4, iter=30, sg=1, window=5)
# word_vectors = wordemb_model.wv

In [4]:
# word_vectors.save("../embedding/STCWiki/STCWiki_mincount0.model.bin")
# datahelper.set_word_vectors(word_vectors)

## Read Corpus & Prepare data for model

In [5]:
# Positional preprocessing options shared by every split.
_prep_args = (TOKEN_TYPE, REMOVE_STOPWORDS, TO_LOWER, EMB)

# Train and dev splits; the second returned value is not used in this notebook.
trainX, _, trainND, trainDQ, train_turns, train_masks = datahelper.get_model_train_data(
    'train', *_prep_args, bert=False,
)
devX, _, devND, devDQ, dev_turns, dev_masks = datahelper.get_model_train_data(
    'dev', *_prep_args, bert=False,
)

# Test split (no labels are returned).
testX, _, test_turns, test_masks = datahelper.get_model_test_data(*_prep_args, bert=False)
testIDs = datahelper.testIDs

# Split the per-dialogue quality labels into one list per score key.
trainDQA, trainDQS, trainDQE = ([item[key] for item in trainDQ] for key in ('A', 'S', 'E'))
devDQA, devDQS, devDQE = ([item[key] for item in devDQ] for key in ('A', 'S', 'E'))


def _dq_inputs(train_scores, dev_scores, with_nd=False):
    """Assemble the positional argument list for one dialogue-quality score.

    With ``with_nd`` the nugget labels are appended to the train and dev parts.
    """
    train_part = [trainX, train_scores, train_turns]
    dev_part = [devX, dev_scores, dev_turns]
    if with_nd:
        train_part.append(trainND)
        dev_part.append(devND)
    return train_part + dev_part + [testX, test_turns]


# Positional arguments for the nugget-detection trainer.
dataND = [
    trainX, trainND, train_turns, train_masks,
    devX, devND, dev_turns, dev_masks,
    testX, test_turns, test_masks,
]

# Positional arguments for the dialogue-quality trainers, one list per score.
dataDQA = _dq_inputs(trainDQA, devDQA)
dataDQS = _dq_inputs(trainDQS, devDQS)
dataDQE = _dq_inputs(trainDQE, devDQE)

# Same, with nugget labels included as extra features.
dataDQA_NDF = _dq_inputs(trainDQA, devDQA, with_nd=True)
dataDQE_NDF = _dq_inputs(trainDQE, devDQE, with_nd=True)
dataDQS_NDF = _dq_inputs(trainDQS, devDQS, with_nd=True)

In [6]:
# Early-stopping setting shared by both parameter sets below; it is also the
# default `earlystop` of show_train_history.
es = 3

# Keyword arguments held constant across the nugget-detection runs.
fixed_paramsND = dict(
    epoch=100,
    early_stopping=es,
    batch_size=30,
    lr=5e-4,
    kp=1,
    hiddens=1024,
    Fsize=[2, 3],
    gating=False,
    bn=True,
    method=ND.CNNRNN,
)

# Keyword arguments held constant across the dialogue-quality runs.
fixed_paramsDQ = dict(
    epoch=100,
    early_stopping=es,
    batch_size=40,
    lr=5e-4,
    kp=1,
    hiddens=1024,   # 1024 for gating, 2048 for no gating
    Fsize=[2, 2],   # [2,2] for gating, [2,3] for no gating
    gating=True,
    bn=True,
)

In [7]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def show_train_history(title, train, valid, earlystop=es):
    """Plot training and validation loss per epoch on a single figure.

    Args:
        title: Title drawn above the plot.
        train: Sequence of training losses, one value per epoch.
        valid: Sequence of validation losses, one value per epoch (same
            length as ``train``).
        earlystop: Number of trailing epochs to step back from the last one
            when placing the vertical marker at ``len(train) - earlystop``.

    The marker is drawn only when it falls on a plotted epoch (>= 1).
    """
    epoch = len(train)
    best = epoch - earlystop
    x = list(range(1, epoch + 1))

    # Scope the font size to this figure instead of mutating global rcParams.
    with plt.rc_context({'font.size': 18}):
        # One figure only: creating a sized figure and then a second one via
        # plt.figure().gca() left a stray empty figure behind on every call.
        fig, ax = plt.subplots()
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.plot(x, train, marker='o', linestyle='-', color='b')
        ax.plot(x, valid, marker='o', linestyle='-', color='r')
        if best >= 1:
            ax.axvline(best, color='black')
        ax.set_title(title)
        ax.set_ylabel('Loss')
        ax.set_xlabel('Epoch')
        ax.legend(['Training Loss', 'Validation Loss'], loc='upper right')
        plt.show()
    # Release the figure so repeated calls in a sweep do not accumulate memory.
    plt.close(fig)

## Test ND

In [8]:
BEST_PATH = 'PickleResult/'


def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed on exit."""
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # files produced by this project.
    with open(path, "rb") as f:
        return pickle.load(f)


# bestND = _load_pickle(BEST_PATH + 'bestND.p')
bestDQAs = _load_pickle(BEST_PATH + 'memoryDQAs.p')
bestDQSs = _load_pickle(BEST_PATH + 'memoryDQSs.p')
bestDQEs = _load_pickle(BEST_PATH + 'memoryDQEs.p')

In [9]:
e = True

# Values swept by the nested loops below.
_nd_memory_rnn_types = [None, 'Bi-GRU', 'Bi-LSTM']
_nd_filter_counts = [[256, 512, 1024]]
_nd_layer_counts = [2]

for mr in _nd_memory_rnn_types:
    for fn in _nd_filter_counts:
        for num_layers in _nd_layer_counts:
            testname = 'ND_{}_mapw2v'.format(mr)

            # Fixed settings plus the values of the current sweep point.
            _nd_kwargs = dict(
                fixed_paramsND,
                Fnum=fn,
                num_layers=num_layers,
                memory_rnn_type=mr,
                evaluate=e,
            )
            testND, train_losses, dev_losses = stctrain.start_trainND(*dataND, **_nd_kwargs)

            show_train_history(testname, train_losses, dev_losses)

            # Write a submission file combining these nugget predictions with
            # the first entry of each previously loaded DQ result list.
            datahelper.pred_to_submission(
                testND,
                bestDQAs[0],
                bestDQSs[0],
                bestDQEs[0],
                test_turns,
                testIDs,
                filename='{}.json'.format(testname),
            )


x.shape (?, 7, 150, 100)
epoch: 1 | trainloss: 0.9164 | devloss: 0.9123
epoch: 2 | trainloss: 0.7910 | devloss: 0.8021


KeyboardInterrupt: 

In [None]:
# Save the result
# pickle.dump(testND, open('bestND190116.p', 'wb'))

In [None]:
# Deliberate barrier (presumably to keep "Run All" from continuing into the
# sections below). Raise explicitly instead of relying on a NameError from an
# undefined name, which would silently vanish if `stop` were ever defined.
raise RuntimeError("Intentional stop: run the cells below manually.")

## Test DQ

In [None]:
# Load previously saved nugget-detection results; the context manager closes
# the file handle. NOTE: pickle can execute arbitrary code while loading, so
# only load files produced by this project.
with open('PickleResult/memoryNDs.p', 'rb') as f:
    memoryNDs = pickle.load(f)

In [None]:
e = True
method = DQ.CNNCNN
for l in [1]:
    for rm in ['Bi-LSTM']:
        for fn in [[512, 1024]]:
            # Name the run after the values actually swept here. The earlier
            # name formatted `prop`, which is not defined in this notebook and
            # raised NameError before any training started.
            testname = 'DQ_{}_{}layer_{}'.format(rm, l, '_'.join(str(n) for n in fn))
            print(testname, 'is started')

            # Train one model per quality score with identical settings.
            bestDQA, train_lossesA, dev_lossesA = stctrain.start_trainDQ(
                *dataDQA,
                **fixed_paramsDQ, scoretype='DQA', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e,
            )

            bestDQE, train_lossesE, dev_lossesE = stctrain.start_trainDQ(
                *dataDQE,
                **fixed_paramsDQ, scoretype='DQE', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e,
            )

            bestDQS, train_lossesS, dev_lossesS = stctrain.start_trainDQ(
                *dataDQS,
                **fixed_paramsDQ, scoretype='DQS', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e,
            )

            # Combine the loaded nugget predictions with the three new DQ
            # predictions into one submission file.
            datahelper.pred_to_submission(
                memoryNDs[0], bestDQA, bestDQS, bestDQE,
                test_turns, testIDs,
                filename='{}.json'.format(testname),
            )
            

In [None]:
# Save the result
# pickle.dump(bestDQAs, open('memoryDQAs.p', 'wb'))
# pickle.dump(bestDQSs, open('memoryDQSs.p', 'wb'))
# pickle.dump(bestDQEs, open('memoryDQEs.p', 'wb'))

## Test NDF

In [8]:
def submission_to_pred(path='ReTesting/0220_wordemb_test/NoneMemory_3stackCNN_2stackRNN(best).json', test_ids=None):
    """Rebuild per-dialogue nugget prediction rows from a submission JSON file.

    The file must contain a list of objects with an ``'id'`` and a
    ``'nugget'`` list. Each nugget entry is a dict with either the four keys
    ``CNUG*, CNUG, CNaN, CNUG0`` or the three keys ``HNUG*, HNUG, HNaN``.
    Every entry becomes a 7-value row: the four ``C*`` values followed by the
    three ``H*`` values, with the absent group filled with ``0.``.

    Args:
        path: Location of the submission JSON file.
        test_ids: Dialogue ids, in the order the output should follow.
            Defaults to the notebook-level ``testIDs``.

    Returns:
        A list with one element per matching dialogue, ordered like
        ``test_ids``. Each element is a list of 7-value rows, padded with
        all-zero rows up to 7 rows. Ids that do not occur in the file are
        skipped.

    Raises:
        ValueError: If a nugget entry has neither 3 nor 4 keys.
        KeyError: If an entry lacks one of the expected keys.
    """
    import json

    customer_keys = ('CNUG*', 'CNUG', 'CNaN', 'CNUG0')
    helpdesk_keys = ('HNUG*', 'HNUG', 'HNaN')
    n_classes = len(customer_keys) + len(helpdesk_keys)  # 7 values per row
    max_turns = 7  # dialogues are padded to this many rows

    if test_ids is None:
        test_ids = testIDs

    with open(path) as f:
        test_preds_json = json.load(f)

    # Index the entries by id once instead of rescanning the whole file for
    # every test id. Lists keep duplicates in their original file order.
    entries_by_id = {}
    for entry in test_preds_json:
        entries_by_id.setdefault(entry['id'], []).append(entry)

    pred = []
    for test_id in test_ids:
        for entry in entries_by_id.get(test_id, ()):
            dialogue_pred = []
            for utterance_nugget in entry['nugget']:
                n_keys = len(utterance_nugget)
                if n_keys == len(customer_keys):
                    row = [utterance_nugget[k] for k in customer_keys] + [0.] * len(helpdesk_keys)
                elif n_keys == len(helpdesk_keys):
                    row = [0.] * len(customer_keys) + [utterance_nugget[k] for k in helpdesk_keys]
                else:
                    # Fail loudly rather than leaving a row of None values
                    # that would break the numeric code consuming this result.
                    raise ValueError(
                        'Unexpected nugget entry with {} keys in dialogue {!r}: {!r}'.format(
                            n_keys, test_id, sorted(utterance_nugget)))
                dialogue_pred.append(row)

            while len(dialogue_pred) < max_turns:
                dialogue_pred.append([0] * n_classes)

            pred.append(dialogue_pred)

    return pred

In [9]:
# Rebuild the test-set nugget predictions from the submission JSON at
# submission_to_pred's default path.
testND = submission_to_pred()

In [10]:
# zip() silently truncates to the shorter input, so verify first that there is
# exactly one mask per predicted dialogue.
if len(testND) != len(test_masks):
    raise ValueError(
        'testND has {} dialogues but test_masks has {}'.format(len(testND), len(test_masks)))

# Element-wise product of each dialogue's predictions with its mask.
testNDmasked = [np.multiply(nd, mask) for nd, mask in zip(testND, test_masks)]

In [11]:
# Rebuild the NDF argument lists with the masked test predictions as the last
# element. In-place `+=` appended another copy on every re-run of this cell,
# which breaks the positional unpacking in the training calls.
dataDQA_NDF = [trainX, trainDQA, train_turns, trainND, devX, devDQA, dev_turns, devND, testX, test_turns, testNDmasked]
dataDQE_NDF = [trainX, trainDQE, train_turns, trainND, devX, devDQE, dev_turns, devND, testX, test_turns, testNDmasked]
dataDQS_NDF = [trainX, trainDQS, train_turns, trainND, devX, devDQS, dev_turns, devND, testX, test_turns, testNDmasked]

In [12]:
# Sanity check: display how many positional inputs dataDQA_NDF now holds.
len(dataDQA_NDF)

11

In [14]:
e = True
method = DQNDF.CNNCNN
# pred_to_submission below receives the predictions as a numpy array.
testND = np.asarray(testND)

# Values swept by the nested loops below.
_ndf_memory_rnn_types = ['Bi-GRU']
_ndf_filter_counts = [[512, 1024]]
_ndf_layer_counts = [1]

for mr in _ndf_memory_rnn_types:
    for fnum in _ndf_filter_counts:
        for num_layers in _ndf_layer_counts:
            testname = 'MeHGCNN2'
            print(testname, 'is started')

            # Settings common to the three per-score runs of this sweep point.
            _ndf_kwargs = dict(
                fixed_paramsDQ,
                method=method,
                Fnum=fnum,
                memory_rnn_type=mr,
                num_layers=num_layers,
                evaluate=e,
                bert=False,
            )

            bestDQNDFA, train_lossesA, dev_lossesA = stctrain.start_trainDQ_NDF(
                *dataDQA_NDF, scoretype='DQA', **_ndf_kwargs
            )
            bestDQNDFE, train_lossesE, dev_lossesE = stctrain.start_trainDQ_NDF(
                *dataDQE_NDF, scoretype='DQE', **_ndf_kwargs
            )
            bestDQNDFS, train_lossesS, dev_lossesS = stctrain.start_trainDQ_NDF(
                *dataDQS_NDF, scoretype='DQS', **_ndf_kwargs
            )

            # Write the nugget predictions and the three DQ predictions to a
            # single submission file.
            datahelper.pred_to_submission(
                testND,
                bestDQNDFA,
                bestDQNDFS,
                bestDQNDFE,
                test_turns,
                testIDs,
                filename='{}.json'.format(testname),
            )

MeHGCNN2 is started
CNNCNN|9|True|True|2_2|1024|512_1024|0.08269|0.12640
CNNCNN|6|True|True|2_2|1024|512_1024|0.09059|0.13083
CNNCNN|11|True|True|2_2|1024|512_1024|0.07394|0.12049


In [None]:
# Save the result
# pickle.dump(bestDQANDFs, open('memoryDQANDFs.p', 'wb'))
# pickle.dump(bestDQSNDFs, open('memoryDQSNDFs.p', 'wb'))
# pickle.dump(bestDQENDFs, open('memoryDQENDFs.p', 'wb'))