In [1]:
import os
import time
import json
import pickle
import timeit
import random
import param
import shutil
import collections
import numpy as np
import tensorflow as tf

import stctrain_bert
import datahelper
import stctokenizer
import nuggetdetectionBERT as ND
import dialogqualityBERT as DQ
import dialogquality_ndfeatureBERT as DQNDF
import stcevaluation as STCE

from scipy import stats
from collections import Counter
from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors

import logging
logging.basicConfig(level=logging.INFO)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Model dimensions, all read from the project-level `param` module.
doclen = param.doclen
embsize = param.embsize
max_sent = param.max_sent
NDclasses = param.NDclasses
DQclasses = param.DQclasses
sentembsize = param.sentembsize

# Pre-processing switches passed to the data helper when building model inputs.
REMOVE_STOPWORDS = False
TO_LOWER = True
TOKEN_TYPE = 'nltk'
EMB = 'stc' # glove or stc

# The names `datahelper` and `stctokenizer` are rebound below from the imported
# modules to instances (later cells use the instances under these names).
# Fetching the modules through aliases keeps this cell re-runnable: without
# them a second execution would call `.DataHelper` on the instance and fail.
import datahelper as datahelper_module
import stctokenizer as stctokenizer_module

datahelper = datahelper_module.DataHelper(embedding_path="../embedding/STCWiki/STCWiki_mincount0.model.bin")
stctokenizer = stctokenizer_module.STCTokenizer()

INFO:gensim.utils:loading Word2VecKeyedVectors object from ../embedding/STCWiki/STCWiki_mincount0.model.bin
INFO:gensim.utils:loading vectors from ../embedding/STCWiki/STCWiki_mincount0.model.bin.vectors.npy with mmap=None
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loaded ../embedding/STCWiki/STCWiki_mincount0.model.bin


## Word Embedding

In [3]:
# corpus = datahelper.prepare_word_embedding_corpus(
#     '../data/text8', 
#     TOKEN_TYPE, 
#     REMOVE_STOPWORDS, 
#     TO_LOWER,
# )

# wordemb_model = Word2Vec(corpus, size=100, min_count=0, workers=4, iter=30, sg=1, window=5)
# word_vectors = wordemb_model.wv

In [4]:
# word_vectors.save("../embedding/STCWiki/STCWiki_mincount0.model.bin")
# datahelper.set_word_vectors(word_vectors)

## Read Corpus & Prepare data for model

In [5]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def get_data():
    """Assemble the positional-argument lists used by the ND / DQ / DQ+NDF training calls.

    Labels, turn counts and masks come from ``datahelper``; the input features
    (``trainX``, ``devX``, ``testX``) are loaded from the pickles under
    ``PickleBert/`` and replace the features the data helper returns.

    Returns:
        A tuple ``(dataND, dataDQA, dataDQE, dataDQS, dataDQA_NDF, dataDQE_NDF,
        dataDQS_NDF, testX, test_turns, test_masks, testIDs)`` where each
        ``data*`` element is a list laid out in the order the matching
        ``stctrain_bert.start_train*`` call unpacks it with ``*``.
    """
    # The helper's own X features (second return value) are not used: the
    # pickled BERT features below take their place, so they are discarded here.
    _, _, trainND, trainDQ, train_turns, train_masks = datahelper.get_model_train_data(
        'train',
        TOKEN_TYPE,
        REMOVE_STOPWORDS,
        TO_LOWER,
        EMB,
        bert=False,
    )

    _, _, devND, devDQ, dev_turns, dev_masks = datahelper.get_model_train_data(
        'dev',
        TOKEN_TYPE,
        REMOVE_STOPWORDS,
        TO_LOWER,
        EMB,
        bert=False,
    )

    _, _, test_turns, test_masks = datahelper.get_model_test_data(
        TOKEN_TYPE,
        REMOVE_STOPWORDS,
        TO_LOWER,
        EMB,
        bert=False,
    )

    # Pre-computed features; files are closed as soon as each one is read.
    trainX = _load_pickle('PickleBert/trainX_bert_512.p')
    devX = _load_pickle('PickleBert/devX_bert_512.p')
    testX = _load_pickle('PickleBert/testX_bert_512.p')

    # Read after the test data call above -- presumably that call populates it; verify in DataHelper.
    testIDs = datahelper.testIDs

    # Split the per-dialogue quality labels into their 'A', 'S' and 'E' scores.
    trainDQA = [item['A'] for item in trainDQ]
    trainDQS = [item['S'] for item in trainDQ]
    trainDQE = [item['E'] for item in trainDQ]
    devDQA = [item['A'] for item in devDQ]
    devDQS = [item['S'] for item in devDQ]
    devDQE = [item['E'] for item in devDQ]

    dataND = [trainX, trainND, train_turns, train_masks, devX, devND, dev_turns, dev_masks, testX, test_turns, test_masks]
    dataDQA = [trainX, trainDQA, train_turns, devX, devDQA, dev_turns, testX, test_turns]
    dataDQS = [trainX, trainDQS, train_turns, devX, devDQS, dev_turns, testX, test_turns]
    dataDQE = [trainX, trainDQE, train_turns, devX, devDQE, dev_turns, testX, test_turns]

    # Same as the DQ lists, with the nugget labels added as an extra feature for train and dev.
    dataDQA_NDF = [trainX, trainDQA, train_turns, trainND, devX, devDQA, dev_turns, devND, testX, test_turns]
    dataDQE_NDF = [trainX, trainDQE, train_turns, trainND, devX, devDQE, dev_turns, devND, testX, test_turns]
    dataDQS_NDF = [trainX, trainDQS, train_turns, trainND, devX, devDQS, dev_turns, devND, testX, test_turns]

    return dataND, dataDQA, dataDQE, dataDQS, dataDQA_NDF, dataDQE_NDF, dataDQS_NDF, testX, test_turns, test_masks, testIDs

In [6]:
# Build every argument list once; later cells unpack these into the training calls.
dataND, dataDQA, dataDQE, dataDQS, dataDQA_NDF, dataDQE_NDF, dataDQS_NDF, testX, test_turns, test_masks, testIDs = get_data()

INFO:corpus word2vec:Unknown word: condition.
INFO:corpus word2vec:Unknown word: `
INFO:corpus word2vec:Unknown word: condition.
INFO:corpus word2vec:Unknown word: `
INFO:corpus word2vec:Unknown word: condition.
INFO:corpus word2vec:Unknown word: `
INFO:corpus word2vec:Training data unknown words count: 6
INFO:corpus word2vec:Training data max doclen: 150
INFO:corpus word2vec:Training data unknown words count: 0
INFO:corpus word2vec:Training data max doclen: 150
INFO:corpus word2vec:Testing data unknown words count: 0
INFO:corpus word2vec:Testing data max doclen: 150


In [7]:
# Positions inside dataND of the training features, turn counts and masks.
inspected = (("X", 0), ("turns", 2), ("masks", 3))

# First the overall shapes ...
for label, position in inspected:
    print(label + " shape", np.array(dataND[position]).shape)

# ... then the first training example of each.
for label, position in inspected:
    print(label + " Example")
    print(dataND[position][0])

X shape (1337, 7, 1024)
turns shape (1337,)
masks shape (1337, 7, 7)
X Example
[[-0.38532966 -0.38864386 -0.42095938 ... -0.2239857   0.02487359
  -0.05700535]
 [-0.21663742 -0.62647951 -0.35346892 ...  0.38286296 -0.02151336
  -0.04521991]
 [-0.38483375 -0.20772332 -0.67306274 ...  0.08341745  0.0316062
   0.09944315]
 ...
 [-0.25072855 -0.63722098 -0.60703731 ... -0.02940212  0.14081407
  -0.38868326]
 [ 0.18198229 -0.45537627 -0.66125107 ...  0.09871428  0.47836649
   0.03941095]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
turns Example
6
masks Example
[[1. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0.]]


In [8]:
# Early-stopping value shared by both parameter sets below.
es = 3

# Keyword arguments common to every nugget-detection (ND) training call.
fixed_paramsND = dict(
    epoch=100,
    early_stopping=es,
    batch_size=30,
    lr=5e-4,
    kp=1,
    hiddens=1024,
    Fsize=[2, 3],
    gating=False,
    bn=True,
    method=ND.CNNRNN,
)

# Keyword arguments common to every dialogue-quality (DQ) training call;
# the model `method` is supplied by the calling cell.
fixed_paramsDQ = dict(
    epoch=50,
    early_stopping=es,
    batch_size=40,
    lr=5e-4,
    kp=1,
    hiddens=1024,
    Fsize=[2, 2],
    gating=True,
    bn=True,
)

In [9]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def show_train_history(title, train, valid, earlystop=es, figsize=None):
    """Plot training and validation loss per epoch on a single figure.

    Args:
        title: Title shown above the plot.
        train: Sequence of training losses, one per epoch.
        valid: Sequence of validation losses, same length as ``train``.
        earlystop: Number of epochs subtracted from the last epoch to place
            the vertical marker line. The default is the value of the global
            ``es`` at the time this function is defined.
        figsize: Optional ``(width, height)`` in inches. ``None`` keeps
            matplotlib's default size, which is what the plot was previously
            drawn at (the old code created a separate, empty 5x12 figure and
            then drew on a second default-sized one).
    """
    epoch = len(train)
    # Epoch where the marker line is drawn -- presumably the best epoch when
    # training ended through early stopping; confirm against the trainer.
    best = epoch - earlystop
    x = list(range(1, epoch + 1))

    # One figure only: no stray empty figure is left behind.
    fig, ax = plt.subplots(figsize=figsize)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer epoch ticks
    # Global side effect kept from the original: also changes later plots.
    plt.rcParams.update({'font.size': 18})
    ax.plot(x, train, marker='o', linestyle='-', color='b')
    ax.plot(x, valid, marker='o', linestyle='-', color='r')
    ax.axvline(best, color='black')
    ax.set_title(title)
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    ax.legend(['Training Loss', 'Validation Loss'], loc='upper right')
    plt.show()

## Test ND

In [10]:
BEST_PATH = 'PickleResult/'
# bestND = pickle.load(open(BEST_PATH + 'bestND.p', "rb"))

# Previously saved DQ predictions, used when writing the ND submission file.
# Context managers close each file once it is read.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(BEST_PATH + 'memoryDQAs.p', "rb") as _dqa_file:
    bestDQAs = pickle.load(_dqa_file)
with open(BEST_PATH + 'memoryDQSs.p', "rb") as _dqs_file:
    bestDQSs = pickle.load(_dqs_file)
with open(BEST_PATH + 'memoryDQEs.p', "rb") as _dqe_file:
    bestDQEs = pickle.load(_dqe_file)

In [11]:
# Train the nugget-detection model and write a submission file.
# Each loop iterates over a single value, so exactly one configuration is run;
# the lists are the place to add more values for a sweep.
e = True
for mr in [None]:
    for fn in [[256]]:
        for num_layers in [2]:
            testname = 'NDsubtask-BERT-LSTM'
            print(testname)
            # dataND supplies the positional arguments, fixed_paramsND the shared keywords.
            testND, train_losses, dev_losses = stctrain_bert.start_trainND(
                *dataND, 
                **fixed_paramsND,
                Fnum=fn, num_layers=num_layers, memory_rnn_type=mr,
                evaluate=e
            )

    #             show_train_history(testname, train_losses, dev_losses)
            # The DQ parts of the submission come from the first entry of each saved bestDQ*s list.
            datahelper.pred_to_submission(testND, bestDQAs[0], bestDQSs[0], bestDQEs[0], test_turns, testIDs, filename='{}.json'.format(testname))

NDsubtask-BERT-LSTM
Start epoch 0
Start epoch 1
Start epoch 2
Start epoch 3
Start epoch 4
Start epoch 5
Start epoch 6
Start epoch 7
Start epoch 8
Start epoch 9
Start epoch 10
Start epoch 11
CNNRNN|12|False|True|2_3|1024|256|1|0.0261|0.0957
models/ND/NDsubtask.ckpt is saved

SavedModel


In [None]:
# pickle.dump(testND, open('bestND190116.p', 'wb'))

In [12]:
# NOTE(review): `stop` is not defined anywhere visible, so this cell raises
# NameError -- presumably a deliberate way to halt "Run All" at this point; confirm.
stop

NameError: name 'stop' is not defined

## Test DQ

In [10]:
# Previously saved ND predictions; the context manager closes the file after reading.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('PickleResult/memoryNDs.p', 'rb') as _nd_file:
    memoryNDs = pickle.load(_nd_file)

In [16]:
# NOTE(review): bestDQA / bestDQS / bestDQE are only assigned by the DQ training
# cell further down, so on a fresh kernel this cell fails with NameError when run
# in document order. That training cell already makes this same call once
# training finishes -- confirm whether this cell is still needed.
datahelper.pred_to_submission(testND, bestDQA, bestDQS, bestDQE, test_turns, testIDs, filename='{}.json'.format(testname))

NameError: name 'bestDQA' is not defined

In [None]:
# Train one dialogue-quality model per score type (DQA, DQE, DQS) and write a
# submission file. Each loop iterates over a single value, so one configuration runs.
e = True
method = DQ.CNNCNN
for l in [1]:
    for rm in ['Bi-GRU']:
        for fn in [[512, 1024]]:
            testname = 'DQsubtask-MeGCBERT'
            print(testname, 'is started')
        
            # The three calls differ only in the data list and the scoretype.
            bestDQA, train_lossesA, dev_lossesA = stctrain_bert.start_trainDQ(
                *dataDQA, 
                **fixed_paramsDQ, scoretype='DQA', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e,
            )

            bestDQE, train_lossesE, dev_lossesE = stctrain_bert.start_trainDQ(
                *dataDQE, 
                **fixed_paramsDQ, scoretype='DQE', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e,
            )

            bestDQS, train_lossesS, dev_lossesS = stctrain_bert.start_trainDQ(
                *dataDQS, 
                **fixed_paramsDQ, scoretype='DQS', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e
            )

            
            # testND must already exist in the kernel (set by an ND cell); argument order is A, S, E.
            datahelper.pred_to_submission(testND, bestDQA, bestDQS, bestDQE, test_turns, testIDs, filename='{}.json'.format(testname))
            

In [None]:
# pickle.dump(bestDQAs, open('memoryDQAs.p', 'wb'))
# pickle.dump(bestDQSs, open('memoryDQSs.p', 'wb'))
# pickle.dump(bestDQEs, open('memoryDQEs.p', 'wb'))

## Test NDF

In [11]:
def submission_to_pred(path='ReTesting/0220_wordemb_test/NoneMemory_3stackCNN_2stackRNN(best).json', test_ids=None):
    """Convert a submission JSON file back into per-dialogue nugget prediction rows.

    The file is expected to hold a list of objects with an ``'id'`` and a
    ``'nugget'`` list. Every nugget entry becomes a 7-value row:

    * entries with 4 keys fill columns 0-3 from ``CNUG*``, ``CNUG``, ``CNaN``,
      ``CNUG0`` and set columns 4-6 to ``0.``;
    * entries with 3 keys set columns 0-3 to ``0.`` and fill columns 4-6 from
      ``HNUG*``, ``HNUG``, ``HNaN``.

    Each dialogue is padded with all-zero rows until it has at least 7 rows;
    longer dialogues are left as they are.

    Args:
        path: Location of the submission JSON file.
        test_ids: Dialogue ids in the order the output should follow.
            Defaults to the global ``testIDs``.

    Returns:
        A list with one entry (a list of 7-value rows) per record whose id
        appears in ``test_ids``, ordered by ``test_ids``. Ids that have no
        record in the file are skipped silently, so the result can be shorter
        than ``test_ids``.

    Raises:
        ValueError: If a nugget entry has neither 3 nor 4 keys (previously
            this produced a row of ``None`` values).
    """
    row_width = 7   # number of values per utterance row
    min_rows = 7    # dialogues are padded up to this many rows

    if test_ids is None:
        test_ids = testIDs

    with open(path) as f:
        test_preds_json = json.load(f)

    # Index the records by id once instead of rescanning the whole file for
    # every test id. Lists keep file order, so repeated ids still yield one
    # output entry each, as before.
    records_by_id = {}
    for record in test_preds_json:
        records_by_id.setdefault(record['id'], []).append(record)

    pred = []
    for test_id in test_ids:
        for record in records_by_id.get(test_id, ()):
            dialogue_pred = []

            for utterance_nugget in record['nugget']:
                key_count = len(utterance_nugget.keys())
                if key_count == 4:
                    utterance_pred = [
                        utterance_nugget['CNUG*'],
                        utterance_nugget['CNUG'],
                        utterance_nugget['CNaN'],
                        utterance_nugget['CNUG0'],
                        0., 0., 0.,
                    ]
                elif key_count == 3:
                    utterance_pred = [
                        0., 0., 0., 0.,
                        utterance_nugget['HNUG*'],
                        utterance_nugget['HNUG'],
                        utterance_nugget['HNaN'],
                    ]
                else:
                    raise ValueError(
                        'Unexpected nugget entry with {} keys for dialogue {!r}: {!r}'.format(
                            key_count, test_id, utterance_nugget
                        )
                    )

                dialogue_pred.append(utterance_pred)

            while len(dialogue_pred) < min_rows:
                dialogue_pred.append([0] * row_width)

            pred.append(dialogue_pred)

    return pred

In [12]:
# Rebuild the test-set ND predictions from the default submission file (see the function's default path).
testND = submission_to_pred()

In [13]:
# Element-wise product of each dialogue's ND predictions with its mask.
# Pairing stops at the shorter of the two sequences.
testNDmasked = list(map(np.multiply, testND, test_masks))

In [14]:
# get_data() builds each *_NDF list with 10 entries; the masked test ND
# predictions go in the slot after them. Slice assignment (instead of `+=`)
# still modifies the lists in place but makes this cell safe to re-run:
# a second execution replaces the entry rather than appending another one,
# which would shift the positional arguments of the training call.
NDF_BASE_LEN = 10
dataDQA_NDF[NDF_BASE_LEN:] = [testNDmasked]
dataDQE_NDF[NDF_BASE_LEN:] = [testNDmasked]
dataDQS_NDF[NDF_BASE_LEN:] = [testNDmasked]

In [15]:
# Train one DQ model with ND features per score type (DQA, DQE, DQS) and write
# a submission file named after the run.
method = DQNDF.CNNCNN
e = True
testND = np.asarray(testND)

testnameORG = 'MeGCBERT'
# range(1) and the single-value lists mean one configuration runs once;
# raise the range to repeat training (files are then numbered 1, 2, ...).
for i in range(1):
    for mr in ['Bi-GRU']:
        for fnum in [[512, 1024]]:
            for num_layers in [1]:
                testname = testnameORG + str(i+1)
                print(testname, 'is started')
                # The three calls differ only in the data list and the scoretype.
                bestDQNDFA, train_lossesA, dev_lossesA = stctrain_bert.start_trainDQ_NDF(
                    *dataDQA_NDF, 
                    **fixed_paramsDQ, scoretype='DQA', method=method,
                    Fnum=fnum, memory_rnn_type=mr, num_layers=num_layers,
                    evaluate=e,
                )

                bestDQNDFE, train_lossesE, dev_lossesE = stctrain_bert.start_trainDQ_NDF(
                    *dataDQE_NDF, 
                    **fixed_paramsDQ, scoretype='DQE',
                    Fnum=fnum, method=method, memory_rnn_type=mr, num_layers=num_layers,
                    evaluate=e,
                )

                bestDQNDFS, train_lossesS, dev_lossesS = stctrain_bert.start_trainDQ_NDF(
                    *dataDQS_NDF, 
                    **fixed_paramsDQ, scoretype='DQS', 
                    Fnum=fnum, method=method, memory_rnn_type=mr, num_layers=num_layers,
                    evaluate=e,
                )

                # Argument order is A, S, E; testND here is the array rebuilt from the submission file.
                datahelper.pred_to_submission(testND, bestDQNDFA, bestDQNDFS, bestDQNDFE, test_turns, testIDs, filename='{}.json'.format(testname))

MeGCBERT1 is started
Start epoch 0
Start epoch 1
Start epoch 2
Start epoch 3
Start epoch 4
Start epoch 5
Start epoch 6
CNNCNN|7|True|True|2_2|1024|512_1024|0.08459|0.12972
models/DQDQA/DQsubtask-DQAscore.ckpt is saved

Start epoch 0
Start epoch 1
Start epoch 2
Start epoch 3
Start epoch 4
Start epoch 5
Start epoch 6
CNNCNN|7|True|True|2_2|1024|512_1024|0.08111|0.12309
models/DQDQE/DQsubtask-DQEscore.ckpt is saved

Start epoch 0
Start epoch 1
Start epoch 2
Start epoch 3
Start epoch 4
Start epoch 5
Start epoch 6
Start epoch 7
Start epoch 8
CNNCNN|9|True|True|2_2|1024|512_1024|0.07695|0.12355
models/DQDQS/DQsubtask-DQSscore.ckpt is saved

MeGCBERT2 is started
Start epoch 0
Start epoch 1


KeyboardInterrupt: 

In [None]:
# pickle.dump(bestDQANDFs, open('memoryDQANDFs.p', 'wb'))
# pickle.dump(bestDQSNDFs, open('memoryDQSNDFs.p', 'wb'))
# pickle.dump(bestDQENDFs, open('memoryDQENDFs.p', 'wb'))