# NTCIR-15 DialEval model note
###### Created by Weber Huang 2020-06-15

## Table of contents
#### 1. JSON to dataframe
#### 2. Dataset cleaning
#### 3. Segmentation
#### 4. Generate the output of preprocessing
#### 5. Modeling
#### 6. Evaluation
#### 7. Generate the submission estimation JSON

> How to use this note?
     
+ In this IPython notebook the models are trained automatically. Feel free to adjust the parameters of those models if you want. I suggest jumping directly to section 5.       
+ If you want to test the evaluation, here is how:       
    1. Run the cells up to section 4 first; use **generate_dataset(name, wd, stop_word_path)** to process the raw JSON into a dataframe for both train and dev.
    2. Run sections 5 and 6.  
    3. In section 6 there is a function **Generate_submission(dev, model_1, model_2)**; please make sure both models are of the same kind, i.e. both CNN or both LSTM.   
    4. Save a copy of the submission and test it against the ground truth with eval.py.     
    

In [13]:
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import numpy as np
from collections import Counter
from sklearn import preprocessing
import jieba
import re

### 1.JSON to dataframe

In [14]:
def get_nugget(dataframe):
    """Add a ``nuggets`` column holding the nugget entry of every annotation.

    For each row, the ``'nugget'`` value of every element of its
    ``annotations`` list is collected, in order. An annotation without a
    ``'nugget'`` key raises ``KeyError``. The frame is modified in place and
    also returned.
    """
    per_dialogue = []
    for annotations in tqdm(dataframe['annotations']):
        per_dialogue.append([annotation['nugget'] for annotation in annotations])

    # one list per dialogue, one entry per annotation
    dataframe['nuggets'] = per_dialogue

    return dataframe

def shaping(dataframe):
    """Flatten dialogues into one record per turn.

    Parameters
    ----------
    dataframe : DataFrame with the columns ``id``, ``turns`` (a list of
        dicts per dialogue) and ``nuggets`` (as added by ``get_nugget``).

    Returns
    -------
    (Fin_Id, Fin_turns_anno)
        ``Fin_Id`` repeats each dialogue id once per turn.
        ``Fin_turns_anno`` holds, for each turn, the values of the turn dict
        followed by the label every entry of ``nuggets`` gives to that turn.
    """
    ids = dataframe['id'].tolist()
    turns_list = dataframe['turns'].tolist()

    # Repeat each id once per turn. A flat comprehension replaces
    # sum(list_of_lists, []), which re-copies the accumulator on every step.
    Fin_Id = [dialogue_id
              for dialogue_id, turns in zip(ids, turns_list)
              for _ in range(len(turns))]

    Fin_turns_anno = []
    # zip() has no length, so give tqdm the total explicitly.
    for turns, nuggets in tqdm(zip(turns_list, dataframe['nuggets']),
                               total=len(turns_list)):
        for position, turn in enumerate(turns):
            # NOTE: relies on the value order of the turn dict; stacking()
            # names the first two values 'sender' and 'utterance'.
            Fin_turns_anno.append(
                list(turn.values()) + [labels[position] for labels in nuggets])

    return Fin_Id, Fin_turns_anno

def stacking(Fin_Id, Fin_turns_anno):
    """Build the per-turn table from the output of ``shaping``.

    Every record in ``Fin_turns_anno`` is spread over the columns
    ``sender``, ``utterance`` and ``n1`` .. ``n19``; the matching id from
    ``Fin_Id`` becomes the leading ``id`` column.
    """
    annotator_cols = ['n%d' % k for k in range(1, 20)]
    record_cols = ['sender', 'utterance'] + annotator_cols

    paired = pd.DataFrame({'id': Fin_Id, 'info': Fin_turns_anno})
    wide = pd.DataFrame(paired['info'].values.tolist(), columns=record_cols)
    wide['id'] = paired['id']

    # move 'id' to the front
    return wide[['id'] + record_cols]

### 2.Dataset cleaning

In [15]:
def process_data(dataframe):
    """Derive round numbers, label distributions and encoded features.

    Parameters
    ----------
    dataframe : per-turn table as produced by ``stacking`` (``id``,
        ``sender``, ``utterance`` followed by the 19 annotator columns at
        positions 3..21).

    Returns
    -------
    DataFrame with the columns ``id``, ``sender``, ``sender_num``,
    ``utterance``, ``round``, ``round_max``, ``round_label`` and one
    column per nugget type holding the share of annotators that chose it.

    The input frame is left untouched; all work happens on a copy, which
    avoids pandas' SettingWithCopyWarning when the input is a slice.
    """
    Nugget_types = ['CNUG0', 'CNUG', 'CNUG*', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']
    n_annotators = 19

    dataframe = dataframe.copy()

    # id to str
    dataframe['id'] = dataframe['id'].apply(str)

    # round: 1-based position of the turn inside its dialogue. cumcount
    # keeps every value on its own row, even if a dialogue's rows are not
    # contiguous, and runs in one pass instead of ids x rows.
    dataframe['round'] = dataframe.groupby('id', sort=False).cumcount() + 1

    # distribution: share of the annotators that picked each nugget type
    annotations = np.array(dataframe.iloc[:, 3:3 + n_annotators])
    counts = [Counter(row) for row in annotations]
    for nugget in Nugget_types:
        dataframe[nugget] = [c.get(nugget, 0) / n_annotators for c in counts]

    # round_max (round_label): nugget type with the largest summed share
    # among all turns sharing the same round number. Only the nugget columns
    # are summed (the text columns must not take part), and the result is
    # mapped by round number so any dialogue length is supported.
    # NOTE(review): this feature is computed from the labels of the frame
    # being processed - confirm that is intended for the dev/test data.
    dominant = dataframe.groupby('round')[Nugget_types].sum().idxmax(axis=1)
    dataframe['round_max'] = dataframe['round'].map(dominant)

    # label encoding (round_label)
    le = preprocessing.LabelEncoder()
    le.fit(Nugget_types)
    dataframe['round_label'] = le.transform(list(dataframe['round_max']))

    # label encoding (sender_num)
    sender_encoder = preprocessing.LabelEncoder()
    sender_encoder.fit(['customer', 'helpdesk'])
    dataframe['sender_num'] = sender_encoder.transform(list(dataframe['sender']))

    subset = dataframe[['id', 'sender', 'sender_num', 'utterance', 'round', 'round_max', 'round_label',
                        'CNUG0', 'CNUG', 'CNUG*', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']]

    # hand back an independent frame so callers can add columns freely
    return subset.copy()

### 3.Segmentation

In [16]:
def segment(dataframe, file_path):
    """Tokenise the utterances and store the cleaned text in ``texts``.

    Each utterance is cut with jieba, every character that is not a Latin
    letter, a digit or in the range U+4E00-U+9FA5 is replaced by a space,
    stop words are dropped, and the remaining words are joined with single
    spaces. An utterance with no words left becomes ``"*"``.

    Parameters
    ----------
    dataframe : frame as returned by ``process_data``.
    file_path : path of a UTF-8 stop-word file, one word per line.

    Returns
    -------
    A new DataFrame with ``texts`` in place of ``utterance``. The input
    frame is not modified.
    """
    non_word = re.compile("[^a-zA-Z0-9\u4e00-\u9fa5]")

    # a set gives constant-time membership tests in the word loop
    with open(file_path, 'r', encoding='UTF-8') as file:
        cn_stopwords = set(file.read().splitlines())

    new_texts = []
    for line in dataframe['utterance'].astype(str):
        seg_content = ' '.join(jieba.cut(line, cut_all=False))
        # strip punctuation / symbols, then remove stop words
        words = [word for word in non_word.sub(' ', seg_content).split()
                 if word not in cn_stopwords]
        # placeholder for utterances that end up empty
        new_texts.append(" ".join(words) if words else "*")

    # work on a copy: the input is usually a column slice of another frame
    dataframe = dataframe.copy()
    dataframe['texts'] = new_texts

    subset = dataframe[['id', 'sender', 'sender_num', 'texts', 'round', 'round_max', 'round_label',
                        'CNUG0', 'CNUG', 'CNUG*', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']]

    return subset.copy()

### 4.Generate the output of preprocessing

In [17]:
def generate_dataset(name, wd, stop_word_path):
    """Run the whole preprocessing chain on one raw JSON file.

    Changes the working directory to ``wd``, reads ``name`` from there and
    passes it through get_nugget -> shaping -> stacking -> process_data ->
    segment. ``stop_word_path`` is forwarded to ``segment``.

    Note: the working directory stays changed after the call.
    """
    os.chdir(wd)
    raw = pd.read_json(name, encoding='utf8')

    turn_ids, turn_records = shaping(get_nugget(raw))
    cleaned = process_data(stacking(turn_ids, turn_records))

    return segment(cleaned, stop_word_path)

###### Please pass the raw JSON file name, the working directory and the stop-word file path to generate_dataset()

In [18]:
# save file
# import time
# path = 'C:/Users/doudi/OneDrive/Documents/stc3-dataset/data/'
# timestr = time.strftime("%Y%m%d%H%M")
# output.to_csv((path + timestr + '_train_data_cn.csv'), index=False, encoding='utf_8_sig')

In [19]:
# preprocess the training dialogues
train = generate_dataset(
    name=r'train_cn.json',
    wd='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
    stop_word_path='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt',
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
# preprocess the development dialogues
dev = generate_dataset(
    name=r'dev_cn.json',
    wd='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
    stop_word_path='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt',
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


> Check whether the files are the same 

In [21]:
# count = 0
# if output.equals(output_15_tr) == True:
#     print("DialEval-14 training data is as same as DialEval-15 training data")
# else:
#     print("DialEval-14 training data is not as same as DialEval-15 training data")

    
# output.iloc[0] == output_15_tr.iloc[0]

# # print("There are {0} rows in 2 datasets as same".format(count))
# # print("\n")
# # print("There are {0} rows in 2 datasets as different".format((len(output)-count)))

### 5. Modeling

#### 5.1 Define some necessary helper functions 

In [22]:
from scipy import stats
import tensorflow as tf

def normalize(pred, truth):
    """Validate two distributions and rescale each one to sum to 1.

    Both inputs are converted to ``np.ndarray`` and divided by their own
    total.

    Raises
    ------
    ValueError
        If the lengths differ, an input is empty, or the inputs are not
        all non-negative.
    """
    if len(pred) != len(truth):
        raise ValueError("pred and truth have different lengths")
    if min(len(pred), len(truth)) == 0:
        raise ValueError("pred or truth are empty")

    pred = np.asarray(pred)
    truth = np.asarray(truth)

    non_negative = (pred >= 0).all() and (truth >= 0).all()
    if not non_negative:
        raise ValueError("probability distribution should not be negative")

    return pred / pred.sum(), truth / truth.sum()

def jensen_shannon_div(pred, truth, base=2):
    """JSD: Jensen-Shannon Divergence between ``pred`` and ``truth``.

    Both inputs are passed through ``normalize`` first; ``base`` is the
    logarithm base handed to ``scipy.stats.entropy``.
    """
    pred, truth = normalize(pred, truth)
    midpoint = 1. / 2 * (pred + truth)

    pred_term = stats.entropy(pred, midpoint, base=base)
    truth_term = stats.entropy(truth, midpoint, base=base)
    return (pred_term + truth_term) / 2.

def root_normalized_squared_error(pred, truth):
    """RNSS: Root Normalised Sum of Squares.

    Normalises both inputs with ``normalize`` and returns the square root
    of half their summed squared difference.
    """
    pred, truth = normalize(pred, truth)
    sum_of_squares = ((pred - truth) ** 2).sum()
    return np.sqrt(sum_of_squares / 2)

def jsd_custom_loss(y_true, y_pred):
    """Jensen-Shannon divergence written as a Keras loss.

    Averages ``tf.keras.losses.KLD`` of the prediction and of the target,
    each taken against their midpoint. Unlike ``jensen_shannon_div`` the
    inputs are not re-normalised and no logarithm base is passed.
    """
    midpoint = 1. / 2 * (y_pred + y_true)
    pred_term = tf.keras.losses.KLD(y_pred, midpoint)
    true_term = tf.keras.losses.KLD(y_true, midpoint)
    return (pred_term + true_term) / 2.
  
# model.compile(loss=jsd_custom_loss, optimizer='adam')

# def rnss_custom_loss(y_true, y_pred):
            
#     # calculate loss, using y_pred
#     """ RNSS: Root Normalised Sum of Squares
#     """

#     def squared_error(y_pred, y_true):
#         return ((y_pred - y_true) ** 2).sum()

# #     y_pred, y_true = normalize(y_pred, y_true)
#     loss = np.sqrt(squared_error(y_pred, y_true) / 2)
    
#     return loss
  

In [23]:
# split from sender
# one subset per speaker, for both the training and the development data
train_c = train[train['sender'] == 'customer']
train_h = train[train['sender'] == 'helpdesk']
dev_c = dev[dev['sender'] == 'customer']
dev_h = dev[dev['sender'] == 'helpdesk']

In [24]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras import optimizers
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Input
from keras.layers import Dense, Embedding
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import concatenate
from keras import initializers
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM

#### 5.2 Features preparation

In [25]:
# === customer
# Feature preparation for the customer models: text input (X1), the two
# numeric side features (X2) and the target distribution (y).
c_X_train = train_c.filter(['round','round_label','texts'])
c_X_test = dev_c.filter(['round','round_label','texts'])
    

# targets: requested in the same order as C_nugget in Generate_submission
y_train_c = train_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])
y_test_c = dev_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])

# y_train_h = train.filter(['HNUG','HNUG*','HNaN'])
# y_test_h = dev.filter(['HNUG','HNUG*','HNaN'])

# raw segmented text; rebound to padded index arrays at the end of this cell
c_X1_train = c_X_train['texts']
# c_X1_train = [str (item) for item in c_X1_train]
c_X1_test = c_X_test['texts']
# c_X1_test = [str (item) for item in c_X1_test]

# numeric side input fed to the models' second Input(shape=(2,))
c_X2_train = c_X_train[['round','round_label']].values
c_X2_test = c_X_test[['round','round_label']].values

# tokenizer is fitted on the training texts only; c_token and c_vocab are
# read later by padding_single_c and CNN_C
c_token = Tokenizer(num_words = 20000)
c_token.fit_on_texts(c_X1_train)
c_vocab = c_token.word_index
print(c_token.document_count)

# texts -> index sequences -> arrays of width 150 (matches Input(shape=(150,)))
c_x_train_seq = c_token.texts_to_sequences(c_X1_train)
c_x_test_seq = c_token.texts_to_sequences(c_X1_test)
c_X1_train = sequence.pad_sequences(c_x_train_seq, maxlen = 150)
c_X1_test = sequence.pad_sequences(c_x_test_seq, maxlen = 150)

8500


In [26]:
# === helpdesk
# Feature preparation for the helpdesk models: text input (X1), the two
# numeric side features (X2) and the target distribution (y).
h_X_train = train_h.filter(['round','round_label','texts'])
h_X_test = dev_h.filter(['round','round_label','texts'])
    

# y_train_c = train_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])
# y_test_c = dev_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])

# targets: requested in the same order as H_nugget in Generate_submission
y_train_h = train_h.filter(['HNUG','HNUG*','HNaN'])
y_test_h = dev_h.filter(['HNUG','HNUG*','HNaN'])

# raw segmented text; rebound to padded index arrays at the end of this cell
h_X1_train = h_X_train['texts']
# h_X1_train = [str (item) for item in h_X1_train]
h_X1_test = h_X_test['texts']
# h_X1_test = [str (item) for item in h_X1_test]

# numeric side input fed to the models' second Input(shape=(2,))
h_X2_train = h_X_train[['round','round_label']].values
h_X2_test = h_X_test[['round','round_label']].values

# tokenizer is fitted on the training texts only; h_token and h_vocab are
# read later by padding_single_h and CNN_H
h_token = Tokenizer(num_words = 20000)
h_token.fit_on_texts(h_X1_train)
h_vocab = h_token.word_index
print(h_token.document_count)

# texts -> index sequences -> arrays of width 150 (matches Input(shape=(150,)))
h_x_train_seq = h_token.texts_to_sequences(h_X1_train)
h_x_test_seq = h_token.texts_to_sequences(h_X1_test)
h_X1_train = sequence.pad_sequences(h_x_train_seq, maxlen = 150)
h_X1_test = sequence.pad_sequences(h_x_test_seq, maxlen = 150)

6900


#### 5.3 textCNN for customer and helpdesk respectively

In [27]:
# === customer
def CNN_C(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy',
          epochs=100, batch_size=64):
    """Build, train and evaluate the two-input text CNN for customer turns.

    Parameters
    ----------
    X1_train, X1_test : padded token-index arrays of width 150.
    X2_train, X2_test : arrays with 2 numeric features per row.
    y_train, y_test : target distributions over the 4 customer nugget types.
    loss : loss passed to ``model.compile``.
    epochs, batch_size : training settings (defaults keep the former
        hard-coded values).

    Returns
    -------
    (model, train_history, pre_probability) where ``pre_probability`` is
    ``model.predict`` on the test inputs.

    The embedding size is taken from the module-level ``c_vocab``.
    """
    num_labels = 4
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # text branch: embedding -> Conv1D (2 filters, kernel size 2) -> pooling
    embed = Embedding(len(c_vocab)+1, 300, input_length=150)(main_input)
    cnn = Convolution1D(2, 2, padding='same', strides=1, activation='relu')(embed)
    cnn = MaxPool1D(pool_size=4)(cnn)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)

    # numeric branch
    dense_1 = Dense(units=256, activation='relu')(sub_input)
    drop_1 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(drop_1)

    # merge both branches and predict the label distribution
    merge = concatenate([drop, dense_2])
    dense_3 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=num_labels, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself; wrapping it in print() only adds "None"
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=epochs,
                              batch_size=batch_size, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

In [28]:
# === helpdesk
def CNN_H(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy',
          epochs=100, batch_size=64):
    """Build, train and evaluate the two-input text CNN for helpdesk turns.

    Parameters
    ----------
    X1_train, X1_test : padded token-index arrays of width 150.
    X2_train, X2_test : arrays with 2 numeric features per row.
    y_train, y_test : target distributions over the 3 helpdesk nugget types.
    loss : loss passed to ``model.compile``.
    epochs, batch_size : training settings (defaults keep the former
        hard-coded values).

    Returns
    -------
    (model, train_history, pre_probability) where ``pre_probability`` is
    ``model.predict`` on the test inputs.

    The embedding size is taken from the module-level ``h_vocab``.
    """
    # the output layer has always had 3 units; the old local said 2
    num_labels = 3
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # text branch: embedding -> Conv1D (2 filters, kernel size 2) -> pooling
    embed = Embedding(len(h_vocab)+1, 300, input_length=150)(main_input)
    cnn = Convolution1D(2, 2, padding='same', strides=1, activation='relu')(embed)
    cnn = MaxPool1D(pool_size=4)(cnn)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)

    # numeric branch
    dense_1 = Dense(units=256, activation='relu')(sub_input)
    drop_1 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(drop_1)

    # merge both branches and predict the label distribution
    merge = concatenate([drop, dense_2])
    dense_3 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=num_labels, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself; wrapping it in print() only adds "None"
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=epochs,
                              batch_size=batch_size, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

#### 5.4 LSTM for customer and helpdesk respectively

In [44]:
# === customer
def lstm_C(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy',
           epochs=100, batch_size=64):
    """Build, train and evaluate the two-input LSTM model for customer turns.

    Parameters
    ----------
    X1_train, X1_test : padded token-index arrays of width 150.
    X2_train, X2_test : arrays with 2 numeric features per row.
    y_train, y_test : target distributions over the 4 customer nugget types.
    loss : loss passed to ``model.compile``.
    epochs, batch_size : training settings (defaults keep the former
        hard-coded values).

    Returns
    -------
    (model, train_history, pre_probability) where ``pre_probability`` is
    ``model.predict`` on the test inputs.
    """
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # text branch; input_dim=20000 equals the Tokenizer's num_words
    embed = Embedding(output_dim=32, input_dim=20000, input_length=150)(main_input)
    dropout_1 = Dropout(0.35)(embed)
    lst = LSTM(units=16)(dropout_1)
    dense_1 = Dense(units=256, activation='relu')(lst)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(dropout_2)
    dense_3 = Dense(units=4, activation='softmax')(dense_2)

    # numeric branch
    dense_4 = Dense(units=256, activation='relu')(sub_input)
    dropout_3 = Dropout(0.35)(dense_4)
    dense_5 = Dense(units=128, activation='relu')(dropout_3)

    # merge both branches and predict the label distribution
    merge = concatenate([dense_3, dense_5])
    dense_6 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=4, activation='softmax')(dense_6)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself; wrapping it in print() only adds "None"
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=epochs,
                              batch_size=batch_size, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

In [45]:
# === helpdesk
def lstm_H(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy',
           epochs=100, batch_size=64):
    """Build, train and evaluate the two-input LSTM model for helpdesk turns.

    Parameters
    ----------
    X1_train, X1_test : padded token-index arrays of width 150.
    X2_train, X2_test : arrays with 2 numeric features per row.
    y_train, y_test : target distributions over the 3 helpdesk nugget types.
    loss : loss passed to ``model.compile``.
    epochs, batch_size : training settings (defaults keep the former
        hard-coded values).

    Returns
    -------
    (model, train_history, pre_probability) where ``pre_probability`` is
    ``model.predict`` on the test inputs.
    """
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # text branch; input_dim=20000 equals the Tokenizer's num_words
    embed = Embedding(output_dim=32, input_dim=20000, input_length=150)(main_input)
    dropout_1 = Dropout(0.35)(embed)
    lst = LSTM(units=16)(dropout_1)
    dense_1 = Dense(units=256, activation='relu')(lst)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(dropout_2)
    dense_3 = Dense(units=3, activation='softmax')(dense_2)

    # numeric branch
    dense_4 = Dense(units=256, activation='relu')(sub_input)
    dropout_3 = Dropout(0.35)(dense_4)
    dense_5 = Dense(units=128, activation='relu')(dropout_3)

    # merge both branches and predict the label distribution
    merge = concatenate([dense_3, dense_5])
    dense_6 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=3, activation='softmax')(dense_6)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself; wrapping it in print() only adds "None"
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=epochs,
                              batch_size=batch_size, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

### 6. Evaluation

#### 6.1 Subsetting training

In [31]:
# train the customer CNN with the JSD loss
CNN_c_model, CNN_c_history, CNN_c_pred = CNN_C(
    X1_train=c_X1_train, X2_train=c_X2_train,
    X1_test=c_X1_test, X2_test=c_X2_test,
    y_train=y_train_c, y_test=y_test_c,
    loss=jsd_custom_loss,
)


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     5866800     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 150, 2)       1202        embedding_1[0][0]                
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 2)            0                                            
___________________________________________________________________________________________

Epoch 46/100
 - 1s - loss: 0.0230 - accuracy: 0.8913 - val_loss: 0.0375 - val_accuracy: 0.8706
Epoch 47/100
 - 1s - loss: 0.0232 - accuracy: 0.8903 - val_loss: 0.0377 - val_accuracy: 0.8729
Epoch 48/100
 - 1s - loss: 0.0235 - accuracy: 0.8847 - val_loss: 0.0378 - val_accuracy: 0.8700
Epoch 49/100
 - 1s - loss: 0.0231 - accuracy: 0.8869 - val_loss: 0.0378 - val_accuracy: 0.8688
Epoch 50/100
 - 1s - loss: 0.0226 - accuracy: 0.8915 - val_loss: 0.0382 - val_accuracy: 0.8671
Epoch 51/100
 - 1s - loss: 0.0229 - accuracy: 0.8862 - val_loss: 0.0381 - val_accuracy: 0.8676
Epoch 52/100
 - 1s - loss: 0.0228 - accuracy: 0.8907 - val_loss: 0.0382 - val_accuracy: 0.8659
Epoch 53/100
 - 1s - loss: 0.0227 - accuracy: 0.8897 - val_loss: 0.0380 - val_accuracy: 0.8688
Epoch 54/100
 - 1s - loss: 0.0229 - accuracy: 0.8890 - val_loss: 0.0384 - val_accuracy: 0.8653
Epoch 55/100
 - 1s - loss: 0.0227 - accuracy: 0.8866 - val_loss: 0.0378 - val_accuracy: 0.8694
Epoch 56/100
 - 1s - loss: 0.0224 - accuracy: 0.88

In [32]:
# train the helpdesk CNN with the JSD loss
CNN_h_model, CNN_h_history, CNN_h_pred = CNN_H(
    X1_train=h_X1_train, X2_train=h_X2_train,
    X1_test=h_X1_test, X2_test=h_X2_test,
    y_train=y_train_h, y_test=y_test_h,
    loss=jsd_custom_loss,
)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 300)     1982100     input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 150, 2)       1202        embedding_2[0][0]                
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 2)            0                                            
____________________________________________________________________________________________

Epoch 54/100
 - 1s - loss: 0.0183 - accuracy: 0.8663 - val_loss: 0.0275 - val_accuracy: 0.8572
Epoch 55/100
 - 1s - loss: 0.0181 - accuracy: 0.8730 - val_loss: 0.0277 - val_accuracy: 0.8522
Epoch 56/100
 - 1s - loss: 0.0174 - accuracy: 0.8774 - val_loss: 0.0277 - val_accuracy: 0.8638
Epoch 57/100
 - 1s - loss: 0.0176 - accuracy: 0.8801 - val_loss: 0.0278 - val_accuracy: 0.8543
Epoch 58/100
 - 1s - loss: 0.0185 - accuracy: 0.8737 - val_loss: 0.0278 - val_accuracy: 0.8449
Epoch 59/100
 - 1s - loss: 0.0180 - accuracy: 0.8726 - val_loss: 0.0281 - val_accuracy: 0.8435
Epoch 60/100
 - 1s - loss: 0.0180 - accuracy: 0.8717 - val_loss: 0.0280 - val_accuracy: 0.8536
Epoch 61/100
 - 1s - loss: 0.0176 - accuracy: 0.8761 - val_loss: 0.0282 - val_accuracy: 0.8543
Epoch 62/100
 - 1s - loss: 0.0179 - accuracy: 0.8759 - val_loss: 0.0282 - val_accuracy: 0.8565
Epoch 63/100
 - 1s - loss: 0.0176 - accuracy: 0.8750 - val_loss: 0.0285 - val_accuracy: 0.8529
Epoch 64/100
 - 1s - loss: 0.0177 - accuracy: 0.87

In [46]:
# train the customer LSTM with the JSD loss
lstm_c_model, lstm_c_history, lstm_c_pred = lstm_C(
    X1_train=c_X1_train, X2_train=c_X2_train,
    X1_test=c_X1_test, X2_test=c_X2_test,
    y_train=y_train_c, y_test=y_test_c,
    loss=jsd_custom_loss,
)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 150, 32)      640000      input_7[0][0]                    
__________________________________________________________________________________________________
dropout_8 (Dropout)             (None, 150, 32)      0           embedding_4[0][0]                
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 16)           3136        dropout_8[0][0]                  
____________________________________________________________________________________________

Epoch 49/100
 - 13s - loss: 0.0122 - accuracy: 0.9426 - val_loss: 0.0309 - val_accuracy: 0.8782
Epoch 50/100
 - 13s - loss: 0.0121 - accuracy: 0.9459 - val_loss: 0.0311 - val_accuracy: 0.8747
Epoch 51/100
 - 14s - loss: 0.0121 - accuracy: 0.9440 - val_loss: 0.0308 - val_accuracy: 0.8776
Epoch 52/100
 - 14s - loss: 0.0120 - accuracy: 0.9421 - val_loss: 0.0309 - val_accuracy: 0.8765
Epoch 53/100
 - 14s - loss: 0.0119 - accuracy: 0.9419 - val_loss: 0.0315 - val_accuracy: 0.8712
Epoch 54/100
 - 14s - loss: 0.0119 - accuracy: 0.9460 - val_loss: 0.0321 - val_accuracy: 0.8776
Epoch 55/100
 - 14s - loss: 0.0119 - accuracy: 0.9468 - val_loss: 0.0313 - val_accuracy: 0.8847
Epoch 56/100
 - 14s - loss: 0.0115 - accuracy: 0.9415 - val_loss: 0.0308 - val_accuracy: 0.8700
Epoch 57/100
 - 14s - loss: 0.0115 - accuracy: 0.9449 - val_loss: 0.0311 - val_accuracy: 0.8718
Epoch 58/100
 - 14s - loss: 0.0113 - accuracy: 0.9469 - val_loss: 0.0311 - val_accuracy: 0.8806
Epoch 59/100
 - 14s - loss: 0.0111 - acc

In [47]:
# train the helpdesk LSTM with the JSD loss
lstm_h_model, lstm_h_history, lstm_h_pred = lstm_H(
    X1_train=h_X1_train, X2_train=h_X2_train,
    X1_test=h_X1_test, X2_test=h_X2_test,
    y_train=y_train_h, y_test=y_test_h,
    loss=jsd_custom_loss,
)

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 150, 32)      640000      input_9[0][0]                    
__________________________________________________________________________________________________
dropout_11 (Dropout)            (None, 150, 32)      0           embedding_5[0][0]                
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 16)           3136        dropout_11[0][0]                 
____________________________________________________________________________________________

Epoch 49/100
 - 11s - loss: 0.0076 - accuracy: 0.9288 - val_loss: 0.0249 - val_accuracy: 0.8659
Epoch 50/100
 - 11s - loss: 0.0076 - accuracy: 0.9254 - val_loss: 0.0256 - val_accuracy: 0.8522
Epoch 51/100
 - 11s - loss: 0.0075 - accuracy: 0.9299 - val_loss: 0.0250 - val_accuracy: 0.8587
Epoch 52/100
 - 11s - loss: 0.0074 - accuracy: 0.9283 - val_loss: 0.0253 - val_accuracy: 0.8703
Epoch 53/100
 - 11s - loss: 0.0074 - accuracy: 0.9317 - val_loss: 0.0257 - val_accuracy: 0.8594
Epoch 54/100
 - 12s - loss: 0.0074 - accuracy: 0.9312 - val_loss: 0.0255 - val_accuracy: 0.8616
Epoch 55/100
 - 12s - loss: 0.0073 - accuracy: 0.9337 - val_loss: 0.0259 - val_accuracy: 0.8493
Epoch 56/100
 - 12s - loss: 0.0073 - accuracy: 0.9293 - val_loss: 0.0254 - val_accuracy: 0.8609
Epoch 57/100
 - 12s - loss: 0.0071 - accuracy: 0.9304 - val_loss: 0.0260 - val_accuracy: 0.8457
Epoch 58/100
 - 12s - loss: 0.0072 - accuracy: 0.9322 - val_loss: 0.0257 - val_accuracy: 0.8609
Epoch 59/100
 - 12s - loss: 0.0070 - acc

#### 6.2 Output by rbind

In [35]:
# predict single row
def padding_single_c(dev):
    """Turn one customer row into the two model inputs.

    ``dev`` is a single row (as passed by ``Generate_submission``). Returns
    the padded token sequence produced with the module-level ``c_token``
    (width 150) and the ``round`` / ``round_label`` values.
    """
    picked = dev.filter(['round','round_label','texts'])
    text = picked['texts']
    side_features = picked[['round','round_label']].values

    # the tokenizer expects a list of texts, hence the one-element list
    token_ids = c_token.texts_to_sequences([text])
    padded = sequence.pad_sequences(token_ids, maxlen = 150)

    return padded, side_features

def padding_single_h(dev):
    """Turn one helpdesk row into the two model inputs.

    ``dev`` is a single row (as passed by ``Generate_submission``). Returns
    the padded token sequence produced with the module-level ``h_token``
    (width 150) and the ``round`` / ``round_label`` values.
    """
    picked = dev.filter(['round','round_label','texts'])
    text = picked['texts']
    side_features = picked[['round','round_label']].values

    # the tokenizer expects a list of texts, hence the one-element list
    token_ids = h_token.texts_to_sequences([text])
    padded = sequence.pad_sequences(token_ids, maxlen = 150)

    return padded, side_features

In [36]:
from itertools import chain

# input the development dataframe and the method
# for current models (loss = jsd):
# model_1 = customer model (CNN_c_model, lstm_c_model)
# model_2 = helpdesk model (CNN_h_model, lstm_h_model)
def Generate_submission(dev, model_1, model_2):
    """Predict a nugget probability distribution for every turn in `dev`.

    Parameters:
      dev     - DataFrame with one row per turn. It must have an 'id'
                column; the sender is read from the second column
                (position 1), and each row is handed to
                padding_single_c / padding_single_h.
      model_1 - model used for rows whose sender is 'customer'; its
                prediction is paired with the C_nugget labels.
      model_2 - model used for every other row; its prediction is paired
                with the H_nugget labels.

    Returns a list with one entry per distinct id, in order of first
    appearance: {'nugget': [<label -> probability dict per turn>], 'id': id}.

    Rows are addressed by position only, so the result does not depend on
    `dev` having a default 0..n-1 index.
    """
    C_nugget = ['CNUG','CNUG*','CNUG0','CNaN']
    H_nugget = ['HNUG','HNUG*','HNaN']

    # Bucket the row positions of each dialogue in a single pass instead of
    # rescanning the whole frame once per id.
    positions_by_id = {}
    for pos, dialogue_id in enumerate(dev['id'].tolist()):
        positions_by_id.setdefault(dialogue_id, []).append(pos)

    final = []

    # unique() keeps the ids in order of first appearance
    for Id in tqdm(dev['id'].unique()):
        result = []

        for pos in positions_by_id.get(Id, []):
            # Pick the padding function, model and label set for the sender.
            if dev.iloc[pos, 1] == 'customer':
                pad_row, model, labels = padding_single_c, model_1, C_nugget
            else:
                pad_row, model, labels = padding_single_h, model_2, H_nugget

            text_seq, round_feats = pad_row(dev.iloc[pos])
            round_feats = np.array(round_feats).reshape(1,2)

            # predict() returns one row per sample; flatten that single row
            # and pair each probability with its label.
            prob = model.predict(x=[text_seq, round_feats])
            prob = list(chain(*prob.tolist()))
            result.append(dict(zip(labels, prob)))

        # Submission form
        final.append({'nugget':result,'id':Id})

    return final

In [48]:
# Build the submission entries for the dev set with the two LSTM models:
# the first model is applied to customer turns, the second to helpdesk turns.
final = Generate_submission(dev, lstm_c_model, lstm_h_model)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




In [53]:
final

[{'nugget': [{'CNUG': 0.000470252736704424,
    'CNUG*': 2.8830916107835947e-06,
    'CNUG0': 0.8798546195030212,
    'CNaN': 0.1196722462773323},
   {'HNUG': 0.31223055720329285,
    'HNUG*': 0.0029986384324729443,
    'HNaN': 0.6847708225250244},
   {'CNUG': 0.47035953402519226,
    'CNUG*': 0.03247697651386261,
    'CNUG0': 0.0010873944265767932,
    'CNaN': 0.4960760772228241},
   {'HNUG': 0.2505452334880829,
    'HNUG*': 0.003138170577585697,
    'HNaN': 0.7463165521621704}],
  'id': '4227729258237823'},
 {'nugget': [{'CNUG': 0.025972850620746613,
    'CNUG*': 0.00023109630274120718,
    'CNUG0': 0.8228551745414734,
    'CNaN': 0.1509409099817276},
   {'HNUG': 0.3585042953491211,
    'HNUG*': 0.5592086315155029,
    'HNaN': 0.08228705078363419},
   {'CNUG': 0.6785674691200256,
    'CNUG*': 0.070972740650177,
    'CNUG0': 0.003589578904211521,
    'CNaN': 0.2468702495098114},
   {'HNUG': 0.4449242055416107,
    'HNUG*': 0.05284923315048218,
    'HNaN': 0.5022265911102295},
   {'CNU

### 7. Generate the submission estimation JSON

In [8]:
import json
import time
import os

# Hard-coded, machine-specific directory; adjust it before running elsewhere.
# NOTE(review): presumably the folder that holds eval.py and dev_cn.json - confirm.
path = 'C:/Users/doudi/OneDrive/Documents/ntcir15/eval'
# Switch the working directory so relative file names resolve inside `path`.
os.chdir(path)
# Timestamp with minute resolution (YYYYmmddHHMM), used to name the output file.
timestr = time.strftime("%Y%m%d%H%M")

In [51]:
# Save the submission as "<timestr>_dev_eval.json" (relative to the current
# working directory), UTF-8 encoded and pretty-printed with a 2-space indent.
with open((timestr + '_' + 'dev_eval.json'), 'w', encoding='utf-8') as f:
    json.dump(final, f, ensure_ascii=False, indent=2)

In [9]:
!python eval.py 202006161922_dev_eval.json dev_cn.json

{'quality': None, 'nugget': {'jsd': 4.5435131098782255, 'rnss': 2.9066604872291197}}


In [12]:
# Scores copied by hand from eval.py output: (heading, jsd, rnss).
# First entry: the organizers' best result; second entry: our model.
# NOTE(review): 2 ** (-score) assumes eval.py reports -log2 of each metric - confirm.
reported_scores = [
    ('=== 大會的最佳解果 ===', 5.526337576519978, 3.4904552364933963),
    ('=== 我們模型結果 ===', 4.5435131098782255, 2.9066604872291197),
]

for section_no, (heading, jsd_score, rnss_score) in enumerate(reported_scores):
    # Blank lines separate the sections (none before the first one).
    if section_no:
        print('\n')
    print(heading)
    print('jsd :', 2 ** (-jsd_score))
    print('rnss :', 2 ** (-rnss_score))

=== 大會的最佳解果 ===
jsd : 0.02169734643558826
rnss : 0.08897505708102076


=== 我們模型結果 ===
jsd : 0.04288113463140385
rnss : 0.13335460155933881


>> Below is the debugging zone; please ignore it.