In [1]:
import os
import time
import json
import pickle
import timeit
import random
import param
import shutil
import collections
import numpy as np
import tensorflow as tf

import stctrainCRF
import datahelperCRF
import stctokenizer
import nuggetdetectionCRF as ND
# import dialogquality as DQ
# import dialogquality_ndfeature as DQNDF
import stcevaluation as STCE

from scipy import stats
from collections import Counter
from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors

import logging
logging.basicConfig(level=logging.ERROR)
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Model dimensions are read from the project-level `param` module so that the
# notebook and the training code agree on them.
doclen = param.doclen
embsize = param.embsize
max_sent = param.max_sent
NDclasses = param.NDclasses
DQclasses = param.DQclasses
sentembsize = param.sentembsize

# Text pre-processing switches passed to every datahelper call below.
REMOVE_STOPWORDS = False
TO_LOWER = True
TOKEN_TYPE = 'nltk'
EMB = 'stc' # glove or stc

datahelper = datahelperCRF.DataHelper(embedding_path="../embedding/STCWiki/STCWiki_mincount0.model.bin")

# The name `stctokenizer` ends up bound to the tokenizer *instance*, which
# shadows the module imported at the top of the notebook.  Re-importing the
# module under a private alias keeps this cell re-runnable: without it, a
# second execution would look up `.STCTokenizer` on the instance and fail.
import stctokenizer as _stctokenizer_module
stctokenizer = _stctokenizer_module.STCTokenizer()

## Word Embedding

In [3]:
# corpus = datahelper.prepare_word_embedding_corpus(
#     '../data/text8', 
#     TOKEN_TYPE, 
#     REMOVE_STOPWORDS, 
#     TO_LOWER,
# )

# wordemb_model = Word2Vec(corpus, size=100, min_count=0, workers=4, iter=30, sg=1, window=5)
# word_vectors = wordemb_model.wv

In [4]:
# word_vectors.save("../embedding/STCWiki/STCWiki_mincount0.model.bin")
# datahelper.set_word_vectors(word_vectors)

## Read Corpus & Prepare data for model

In [5]:
# def get_data():
isBERT = False


def _load_split(split_name):
    """Fetch one labelled split from `datahelper` using the notebook-wide text settings.

    Returns the 6-tuple produced by `datahelper.get_model_train_data`, unchanged.
    """
    return datahelper.get_model_train_data(
        split_name,
        TOKEN_TYPE,
        REMOVE_STOPWORDS,
        TO_LOWER,
        EMB,
        bert=isBERT,
    )


# The second element of every tuple is not used by this notebook.
trainX, _, trainND, trainDQ, train_turns, train_masks = _load_split('train')
devX, _, devND, devDQ, dev_turns, dev_masks = _load_split('dev')
testX, _, testND, testDQ, test_turns, test_masks = _load_split('test')

# testX, _, test_turns, test_masks = datahelper.get_model_test_data(
#     TOKEN_TYPE, 
#     REMOVE_STOPWORDS, 
#     TO_LOWER,
#     EMB,
#     bert=False,
# )

In [6]:
from collections import namedtuple
# Bookkeeping record used by convertCRF for an utterance whose highest nugget
# score is tied between two classes: `uttidx` is the utterance position inside
# the dialogue, `secondans` is the index of the second tied class.
MultiAnsUtt = namedtuple("MultiAnsUtt", ['uttidx', 'secondans'])

def highest_label_idx(prob):
    """Return the indices (along the first axis) of every entry equal to the maximum of `prob`.

    Ties are all reported, so an all-equal vector yields every index.
    """
    is_peak = (prob == np.max(prob))
    peak_positions = np.where(is_peak)
    return peak_positions[0]

In [7]:
def add_second_ans(uttidx, second_ansidx, CRFX, CRFND, CRFTurns, CRFMasks, CRFDialogND, dialogX, dialogTurn, dialogMask):
    """Append one extra copy of a dialogue whose utterance `uttidx` uses its second tied label.

    `CRFDialogND` is modified in place (slot `uttidx` is replaced by a fresh
    one-hot vector for `second_ansidx`); callers that need the previous labels
    must pass a copy.  A shallow copy of the updated label list is appended to
    `CRFND`, and the dialogue's inputs, turn count and mask are appended to
    `CRFX`, `CRFTurns` and `CRFMasks`.  Returns None.
    """
    # One-hot vector of width `max_sent` (module-level setting).
    alt_label = np.asarray([0.] * max_sent)
    alt_label[second_ansidx] = 1.
    CRFDialogND[uttidx] = alt_label

    for bucket, item in (
        (CRFX, dialogX),
        (CRFND, CRFDialogND.copy()),
        (CRFTurns, dialogTurn),
        (CRFMasks, dialogMask),
    ):
        bucket.append(item)

In [8]:
def convertCRF(X, ND, turns, masks):
    """Expand soft nugget labels into hard one-hot labellings for CRF training.

    For every utterance the class(es) with the highest score are looked up via
    `highest_label_idx`:

    * 7 tied classes  -> the utterance keeps an all-zero label vector
      (NOTE(review): the literal 7 presumably equals the number of nugget
      classes, i.e. an all-zero / padding row -- confirm against `param`);
    * 1 class         -> one-hot on that class;
    * 2 tied classes  -> one-hot on the first, and the second is remembered;
    * anything else   -> AssertionError.

    A dialogue with k two-way-tied utterances is emitted 2**k times, once per
    combination of first/second choices, so every plausible labelling is seen
    by the model.  Previously only k <= 2 was expanded and larger k silently
    kept just the "all first answers" labelling.  The output order for
    k = 0, 1, 2 is unchanged: the first tied utterance varies slowest.

    Args:
        X, ND, turns, masks: parallel per-dialogue sequences (inputs, per-utterance
            score vectors, turn counts, masks).

    Returns:
        (CRFX, CRFND, CRFTurns, CRFMasks): four equally long lists.  The input,
        turn and mask objects are shared between the copies of one dialogue;
        each copy has its own label list of `max_sent`-wide one-hot vectors.

    Raises:
        AssertionError: an utterance has 3-6 tied top classes.
    """
    from itertools import product

    def _one_hot(class_idx=None):
        # Label vectors are `max_sent` wide, matching add_second_ans.
        label = np.asarray([0.] * max_sent)
        if class_idx is not None:
            label[class_idx] = 1.
        return label

    CRFX, CRFND, CRFTurns, CRFMasks = [], [], [], []

    for dialogX, dialogND, dialogTurn, dialogMask in zip(X, ND, turns, masks):
        base_labels = []        # labelling that uses the first answer everywhere
        multi_ans_uttidx = []   # utterances that also have a second answer

        # Zipping with dialogX / dialogMask keeps the original truncation to
        # the shortest of the three sequences.
        for uttidx, (_, uttND, _) in enumerate(zip(dialogX, dialogND, dialogMask)):
            ans_idx = highest_label_idx(uttND)
            num_of_ans = len(ans_idx)
            if num_of_ans == 7:  # every class tied -> treated as "no label"
                base_labels.append(_one_hot())
            elif num_of_ans in (1, 2):
                base_labels.append(_one_hot(ans_idx[0]))
                if num_of_ans == 2:
                    multi_ans_uttidx.append(MultiAnsUtt(uttidx, ans_idx[1]))
            else:
                # Explicit raise so the check survives `python -O`.
                raise AssertionError('ND ans with more than 2')

        # (False, ..., False) comes first and reproduces the base labelling.
        for picks in product((False, True), repeat=len(multi_ans_uttidx)):
            variant = list(base_labels)
            for use_second, (uttidx, second_ansidx) in zip(picks, multi_ans_uttidx):
                if use_second:
                    variant[uttidx] = _one_hot(second_ansidx)
            CRFX.append(dialogX)
            CRFND.append(variant)
            CRFTurns.append(dialogTurn)
            CRFMasks.append(dialogMask)

    assert len(CRFX) == len(CRFND) == len(CRFTurns) == len(CRFMasks)
    return CRFX, CRFND, CRFTurns, CRFMasks

def convertCRF_test(testND):
    """Turn per-utterance score vectors of the test split into class indices.

    For each dialogue the first `max_sent` utterances are converted:
    an all-zero vector becomes 7, a unique maximum becomes its index, and a
    tie keeps the whole array of tied indices.  Returns a list (one entry per
    dialogue) of lists with `max_sent` items.
    """
    def _label_for(utt_scores):
        if np.all(utt_scores == 0):
            return 7
        peaks = highest_label_idx(utt_scores)
        return peaks[0] if len(peaks) == 1 else peaks

    return [
        [_label_for(dialog_scores[pos]) for pos in range(max_sent)]
        for dialog_scores in testND
    ]

In [9]:
def convert_label_to_index(CRFND):
    """Replace, in place, each label vector in `CRFND` by a single class index.

    For the first `max_sent` positions of every dialogue: an all-zero vector
    becomes 7, any other vector becomes the index of its maximum.

    Entries that are already scalars are left untouched.  This makes the
    function safe to call twice on the same data (e.g. when the notebook cell
    is re-executed); without the guard a second pass would map class 0 to 7
    and every other class to 0.

    Args:
        CRFND: list of per-dialogue lists of label vectors; mutated in place.

    Returns:
        None.
    """
    for dialog_labels in CRFND:
        for pos in range(max_sent):
            label = dialog_labels[pos]
            if np.ndim(label) == 0:
                # Already an index from a previous call.
                continue
            dialog_labels[pos] = 7 if np.all(label == 0) else np.argmax(label)

In [10]:
# Expand train and dev: a dialogue with tied nugget labels appears once per
# labelling, so the CRF* lists are longer than the originals.
CRFtrainX, CRFtrainND, CRFtrain_turns, CRFtrain_masks = convertCRF(trainX, trainND, train_turns, train_masks)
CRFdevX, CRFdevND, CRFdev_turns, CRFdev_masks = convertCRF(devX, devND, dev_turns, dev_masks)

# These two calls rewrite the label lists in place (one-hot vector -> class index).
convert_label_to_index(CRFtrainND)
convert_label_to_index(CRFdevND)
# The test split is not expanded; tied utterances keep an array of candidate indices.
CRFtestND = convertCRF_test(testND)

In [11]:
len(train_turns), sum(train_turns), len(dev_turns), sum(dev_turns)

(1337, 5601, 335, 1316)

In [12]:
len(CRFtrain_turns), sum(CRFtrain_turns), len(CRFdev_turns), sum(CRFdev_turns)

(1467, 6150, 368, 1461)

In [13]:
isinstance(CRFtestND[10][5], np.ndarray)

True

In [14]:
from pprint import pprint

# Sanity checks for convertCRF: consecutive copies of one expanded dialogue
# must share identical inputs, turn counts and masks.
# The previous form `a.all() == b.all()` only compared two booleans ("are all
# entries non-zero?") and therefore passed for unrelated arrays; the arrays are
# now compared element-wise.
# NOTE(review): the indices below are hand-picked for this dataset and depend
# on the order in which convertCRF emits the copies.
print('Test on 1 dialogue with 2 ans in 1 utterance')
i = 14
CRFi = 14
# pprint(trainND[i])
# pprint(CRFtrainND[CRFi])
# pprint(CRFtrainND[CRFi+1])
assert np.array_equal(CRFtrainX[CRFi], CRFtrainX[CRFi+1])
assert CRFtrain_turns[CRFi] == CRFtrain_turns[CRFi+1]
assert np.array_equal(CRFtrain_masks[CRFi], CRFtrain_masks[CRFi+1])
# print()
i = 15
CRFi = 16
# pprint(trainND[i])
# pprint(CRFtrainND[CRFi])
# pprint(CRFtrainND[CRFi+1])
# print()

print('Test on 1 dialogue with 2 ans in 2 utterances')
i = 101
# pprint(trainND[i])
CRFi = 110
# pprint(CRFtrainND[CRFi])
# pprint(CRFtrainND[CRFi+1])
# pprint(CRFtrainND[CRFi+2])
# pprint(CRFtrainND[CRFi+3])
for offset in (1, 2, 3):
    assert np.array_equal(CRFtrainX[CRFi], CRFtrainX[CRFi+offset])
    assert CRFtrain_turns[CRFi] == CRFtrain_turns[CRFi+offset]
    assert np.array_equal(CRFtrain_masks[CRFi], CRFtrain_masks[CRFi+offset])

print('PASS')

Test on 1 dialogue with 2 ans in 1 utterance
Test on 1 dialogue with 2 ans in 2 utterances
PASS


In [15]:
# Dialogue ids of the test split, in the order used when predictions are
# matched back to dialogues (see submission_to_pred).
testIDs = datahelper.testIDs
# trainDQA = [item['A'] for item in trainDQ]
# trainDQS = [item['S'] for item in trainDQ]
# trainDQE = [item['E'] for item in trainDQ]
# devDQA = [item['A'] for item in trainDQ]
# devDQS = [item['S'] for item in devDQ]
# devDQE = [item['E'] for item in devDQ]

# Positional arguments for stctrainCRF.start_trainND, unpacked with `*dataND`:
# train (X, labels, turns, masks), dev (same four), test (X, labels, turns, masks).
dataND = [CRFtrainX, CRFtrainND, CRFtrain_turns, CRFtrain_masks, CRFdevX, CRFdevND, CRFdev_turns, CRFdev_masks, testX, CRFtestND, test_turns, test_masks]
# dataDQA = [trainX, trainDQA, train_turns, devX, devDQA, dev_turns, testX, test_turns]
# dataDQS = [trainX, trainDQS, train_turns, devX, devDQS, dev_turns, testX, test_turns]
# dataDQE = [trainX, trainDQE, train_turns, devX, devDQE, dev_turns, testX, test_turns]

# dataDQA_NDF = [trainX, trainDQA, train_turns, trainND, devX, devDQA, dev_turns, devND, testX, test_turns]
# dataDQE_NDF = [trainX, trainDQE, train_turns, trainND, devX, devDQE, dev_turns, devND, testX, test_turns]
# dataDQS_NDF = [trainX, trainDQS, train_turns, trainND, devX, devDQS, dev_turns, devND, testX, test_turns]

In [16]:
from collections import Counter
# Count how often each class index (0-6, plus 7 for "no label") occurs in the
# expanded dev labels.  `Counter.update` is used instead of `+=` because the
# in-place addition drops every key whose count is not positive, which threw
# away the zero-initialised classes and hid labels that never occur.
counter = Counter({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0})
for sent in CRFdevND:
    counter.update(sent)

In [17]:
len(CRFtrainX), len(trainX), len(CRFdevX), len(devX)

(1467, 1337, 368, 335)

In [20]:
np.asarray(CRFtrainX).shape

(1467, 7, 150, 100)

In [22]:
np.asarray(CRFtrainND).shape

(1467, 7)

In [None]:
# Early-stopping setting shared by both parameter sets; also the default
# `earlystop` of show_train_history.
es = 3
# Keyword arguments forwarded unchanged (`**fixed_paramsND`) to
# stctrainCRF.start_trainND in the nugget-detection loop.
# NOTE(review): 'kp' is presumably a dropout keep-probability and 'bn' batch
# normalisation -- confirm against stctrainCRF.
fixed_paramsND  = {
    'epoch':100, 
    'early_stopping':es, 
    'batch_size':30,
    'lr':5e-4,
    'kp':1, 
    'hiddens':1024, 
    'Fsize':[2,3], 
    'gating':False, 
    'bn':True, 
    'method':ND.CNNRNN,
} 

# Keyword arguments forwarded (`**fixed_paramsDQ`) by the dialogue-quality
# training cells further down.
fixed_paramsDQ = {
    'epoch':100, 
    'early_stopping':es, 
    'batch_size':40, 
    'lr':5e-4, 
    'kp':1, 
    'hiddens':1024, # 1024 for gating, 2048 for no gating
    'Fsize':[2,2], # [2,2] for gating, [2,3] for no gating
    'gating':True, 
    'bn':True, 
}

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def show_train_history(title, train, valid, earlystop=es, figsize=None):
    """Plot training and validation loss per epoch and mark the presumed best epoch.

    Args:
        title: figure title.
        train: sequence of training losses, one per epoch.
        valid: sequence of validation losses, same length as `train`.
        earlystop: number of trailing epochs to subtract when placing the
            vertical marker at `len(train) - earlystop`.  The default is the
            value `es` had when this function was defined.
            NOTE(review): the marker is only meaningful if training ended
            through early stopping -- confirm for full-length runs.
        figsize: optional (width, height) in inches for the plot; None keeps
            matplotlib's default size, which is what was rendered before.

    The previous implementation opened one figure with `figsize=(5, 12)` and
    then drew on a second, default-sized figure, leaving an empty figure behind
    on every call.  A single figure is created now.

    Side effect: sets the global matplotlib font size to 18.
    """
    epoch = len(train)
    best = epoch - earlystop
    x = list(range(1, epoch + 1))

    fig, ax = plt.subplots(figsize=figsize)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # whole-number epoch ticks
    plt.rcParams.update({'font.size': 18})
    ax.plot(x, train, marker='o', linestyle='-', color='b')
    ax.plot(x, valid, marker='o', linestyle='-', color='r')
    ax.axvline(best, color='black')
    ax.set_title(title)
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    ax.legend(['Training Loss', 'Validation Loss'], loc='upper right')
    plt.show()

## Test ND

In [None]:
BEST_PATH = 'PickleResult/'


def _load_pickle(path):
    """Load one pickled object and close the file handle afterwards.

    SECURITY: unpickling executes code embedded in the file; only load
    pickles produced by this project.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


# bestND = _load_pickle(BEST_PATH + 'bestND.p')
bestDQAs = _load_pickle(BEST_PATH + 'memoryDQAs.p')
bestDQSs = _load_pickle(BEST_PATH + 'memoryDQSs.p')
bestDQEs = _load_pickle(BEST_PATH + 'memoryDQEs.p')

In [None]:
# Grid search for nugget detection: 3 memory types x 3 CNN filter stacks x
# 3 RNN depths = 27 calls to stctrainCRF.start_trainND, each returning
# (test predictions, training losses, dev losses).
# NOTE(review): `testND` is rebound on every iteration.  It replaces the test
# labels loaded earlier, and only the last configuration's result survives.
# NOTE(review): `testname` is only consumed by the commented-out lines after
# the call.
# NOTE(review): the run name says BERT and `bert=1` is passed, while the data
# was loaded with `isBERT = False` -- confirm this combination is intended.
e = True
for mr in [None, 'Bi-GRU', 'Bi-LSTM']:
    for fn in [[256], [256,512], [256,512,1024]]:
        for num_layers in [1, 2, 3]:
#                 trainXp = trainX[:int(train_len/10*prop)]
#                 trainNDp = trainND[:int(train_len/10*prop)]
#                 train_turnsp = train_turns[:int(train_len/10*prop)]
#                 train_masksp = train_masks[:int(train_len/10*prop)]
#                 dataND = [trainXp, trainNDp, train_turnsp, train_masksp, devX, devND, dev_turns, dev_masks, testX, test_turns, test_masks]

            testname = 'ND_BERT_CRF_Memory{}_CNN{}_RNN{}'.format(mr, len(fn), num_layers)
            testND, train_losses, dev_losses = stctrainCRF.start_trainND(
                *dataND, 
                **fixed_paramsND,
                Fnum=fn, num_layers=num_layers, memory_rnn_type=mr,
                evaluate=e,
                bert=1,
            )

#             show_train_history(testname, train_losses, dev_losses)
#             datahelper.pred_to_submission(testND, bestDQAs[0], bestDQSs[0], bestDQEs[0], test_turns, testIDs, filename='{}.json'.format(testname))


In [None]:
# pickle.dump(testND, open('bestND190116.p', 'wb'))

In [None]:
# `stop` is not defined anywhere, so this cell raises NameError -- presumably
# a deliberate barrier that halts "Run All" before the DQ / NDF cells below.
stop

## Test DQ

In [None]:
# Load cached nugget-detection predictions; the context manager closes the
# file handle, which the bare `open(...)` call never did.
# SECURITY: unpickling executes code embedded in the file; only load trusted pickles.
with open('PickleResult/memoryNDs.p', 'rb') as fh:
    memoryNDs = pickle.load(fh)

In [None]:
# NOTE(review): `trainDQA` is only assigned in a commented-out line of an
# earlier cell, so this expression raises NameError on a fresh kernel.
trainX, trainDQA, train_turns

In [None]:
# Dialogue-quality training: one model per score type (A, E, S) for each
# configuration, then a submission file combining them with memoryNDs[0].
# NOTE(review): this cell cannot run from a fresh kernel as written:
#   * `DQ` -- its import is commented out at the top of the notebook;
#   * `stctrain` -- never imported (only `stctrainCRF` is);
#   * `dataDQA`, `dataDQE`, `dataDQS` -- their assignments are commented out;
#   * `prop` -- only appears in commented-out code.
e = True
method = DQ.CNNCNN
for l in [1]:
    for rm in ['Bi-LSTM']:
        for fn in [[512, 1024]]:
            testname = 'ND_trainsize_{}perc'.format(prop*10)
            print(testname, 'is started')
        
            bestDQA, train_lossesA, dev_lossesA = stctrain.start_trainDQ(
                *dataDQA, 
                **fixed_paramsDQ, scoretype='DQA', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e,
            )
            

            bestDQE, train_lossesE, dev_lossesE = stctrain.start_trainDQ(
                *dataDQE, 
                **fixed_paramsDQ, scoretype='DQE', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e,
            )

            bestDQS, train_lossesS, dev_lossesS = stctrain.start_trainDQ(
                *dataDQS, 
                **fixed_paramsDQ, scoretype='DQS', method=method,
                Fnum=fn, memory_rnn_type=rm, num_layers=l,
                evaluate=e
            )
       
            datahelper.pred_to_submission(memoryNDs[0], bestDQA, bestDQS, bestDQE, test_turns, testIDs, filename='{}.json'.format(testname))
            

In [None]:
# pickle.dump(bestDQAs, open('memoryDQAs.p', 'wb'))
# pickle.dump(bestDQSs, open('memoryDQSs.p', 'wb'))
# pickle.dump(bestDQEs, open('memoryDQEs.p', 'wb'))

## Test NDF

In [None]:
def submission_to_pred(path='ReTesting/0220_wordemb_test/NoneMemory_3stackCNN_2stackRNN(best).json'):
    """Read a submission JSON file back into per-dialogue nugget score matrices.

    The file must contain a list of objects with an 'id' and a 'nugget' list.
    Each nugget entry is a dict with either the four customer keys
    ('CNUG*', 'CNUG', 'CNaN', 'CNUG0') or the three helpdesk keys
    ('HNUG*', 'HNUG', 'HNaN').  Every utterance becomes a 7-wide row
    (customer scores in columns 0-3, helpdesk scores in columns 4-6, the other
    side zero-filled), and every dialogue is padded with zero rows up to 7
    utterances.

    Dialogues are returned in the order of the module-level `testIDs`.  Ids
    missing from the file are skipped, and an id that occurs several times in
    the file contributes one matrix per occurrence (both as before).
    NOTE(review): a skipped id shifts the result relative to `test_masks` --
    confirm the file always covers every test id.

    Args:
        path: location of the submission JSON file.

    Returns:
        list of dialogues, each a list of 7-element rows.

    Raises:
        ValueError: a nugget entry has neither 3 nor 4 keys.  Such entries used
            to yield a row of None values that only failed later on.
    """
    import json

    customer_keys = ('CNUG*', 'CNUG', 'CNaN', 'CNUG0')
    helpdesk_keys = ('HNUG*', 'HNUG', 'HNaN')

    with open(path) as f:
        test_preds_json = json.load(f)

    # Index the file once instead of rescanning it for every test id.
    entries_by_id = {}
    for entry in test_preds_json:
        entries_by_id.setdefault(entry['id'], []).append(entry)

    pred = []
    for testID in testIDs:
        for entry in entries_by_id.get(testID, ()):
            dialogue_pred = []
            for utterance_nugget in entry['nugget']:
                num_keys = len(utterance_nugget.keys())
                if num_keys == 4:
                    utterance_pred = [utterance_nugget[k] for k in customer_keys] + [0., 0., 0.]
                elif num_keys == 3:
                    utterance_pred = [0., 0., 0., 0.] + [utterance_nugget[k] for k in helpdesk_keys]
                else:
                    raise ValueError(
                        'dialogue {!r}: nugget entry with {} keys (expected 3 or 4)'.format(testID, num_keys)
                    )
                dialogue_pred.append(utterance_pred)

            # Pad short dialogues so every matrix has 7 rows.
            while len(dialogue_pred) < 7:
                dialogue_pred.append([0] * 7)

            pred.append(dialogue_pred)

    return pred

In [None]:
testND = submission_to_pred()

In [None]:
# Element-wise product of each dialogue's predictions with its mask.
# NOTE(review): assumes every entry of `test_masks` broadcasts against the
# 7x7 matrix built by submission_to_pred -- confirm the mask shape.
testNDmasked = [np.multiply(nd, mask) for nd, mask in zip(testND, test_masks)]

In [None]:
# Append the masked test predictions as the last positional argument of each
# NDF data list.
# NOTE(review): the three lists are only assigned in commented-out lines of an
# earlier cell, so this raises NameError on a fresh kernel.  The `+=` is also
# not idempotent: re-running the cell appends the predictions a second time.
dataDQA_NDF += [testNDmasked]
dataDQE_NDF += [testNDmasked]
dataDQS_NDF += [testNDmasked]

In [None]:
len(dataDQA_NDF)

In [None]:
# Dialogue-quality training with nugget-detection features: one model per
# score type (A, E, S), then a submission file that pairs them with `testND`.
# NOTE(review): this cell cannot run from a fresh kernel as written:
#   * `DQNDF` -- its import is commented out at the top of the notebook;
#   * `stctrain` -- never imported (only `stctrainCRF` is);
#   * `dataDQA_NDF`, `dataDQE_NDF`, `dataDQS_NDF` -- their assignments are
#     commented out.
method = DQNDF.CNNCNN
e = True
testND = np.asarray(testND)

# for prop in range(1, 11):
for mr in ['Bi-LSTM']:
    for fnum in [[512, 1024]]:
        for num_layers in [1]:
            testname = 'test'
            print(testname, 'is started')

#             trainXp = trainX[:int(train_len/10*prop)]
#             train_turnsp = train_turns[:int(train_len/10*prop)]
#             trainNDp = trainND[:int(train_len/10*prop)]

#             trainDQAp = trainDQA[:int(train_len/10*prop)]      
#             dataDQA_NDF = [trainXp, trainDQAp, train_turnsp, trainNDp, devX, devDQA, dev_turns, devND, testX, test_turns, testNDmasked]
#             trainDQEp = trainDQE[:int(train_len/10*prop)]         
#             dataDQE_NDF = [trainXp, trainDQEp, train_turnsp, trainNDp, devX, devDQE, dev_turns, devND, testX, test_turns, testNDmasked]
#             NOTE(review): the next line slices trainDQA although it feeds the S score -- presumably should be trainDQS.
#             trainDQSp = trainDQA[:int(train_len/10*prop)]              
#             dataDQS_NDF = [trainXp, trainDQSp, train_turnsp, trainNDp, devX, devDQS, dev_turns, devND, testX, test_turns, testNDmasked]


            bestDQNDFA, train_lossesA, dev_lossesA = stctrain.start_trainDQ_NDF(
                *dataDQA_NDF, 
                **fixed_paramsDQ, scoretype='DQA', method=method,
                Fnum=fnum, memory_rnn_type=mr, num_layers=num_layers,
                evaluate=e, bert=False,
            )

            bestDQNDFE, train_lossesE, dev_lossesE = stctrain.start_trainDQ_NDF(
                *dataDQE_NDF, 
                **fixed_paramsDQ, scoretype='DQE',
                Fnum=fnum, method=method, memory_rnn_type=mr, num_layers=num_layers,
                evaluate=e, bert=False,
            )

            bestDQNDFS, train_lossesS, dev_lossesS = stctrain.start_trainDQ_NDF(
                *dataDQS_NDF, 
                **fixed_paramsDQ, scoretype='DQS', 
                Fnum=fnum, method=method, memory_rnn_type=mr, num_layers=num_layers,
                evaluate=e, bert=False,
            )

            datahelper.pred_to_submission(testND, bestDQNDFA, bestDQNDFS, bestDQNDFE, test_turns, testIDs, filename='{}.json'.format(testname))

In [None]:
# pickle.dump(bestDQANDFs, open('memoryDQANDFs.p', 'wb'))
# pickle.dump(bestDQSNDFs, open('memoryDQSNDFs.p', 'wb'))
# pickle.dump(bestDQENDFs, open('memoryDQENDFs.p', 'wb'))