# DialEval-1 DNN practice (with evaluation function)
### Model : CNN, LSTM
##### input round, sender pred label
##### Edited by Weber Huang in 2020-05-28

### 1. Eval function and customizes loss function

In [77]:
from scipy import stats
import tensorflow as tf

def normalize(pred, truth):
    """ convert inputs to np.array and make sure
    inputs are normalized probability distributions
    """
    if len(pred) != len(truth):
        raise ValueError("pred and truth have different lengths")
    if len(pred) == 0 or len(truth) == 0:
        raise ValueError("pred or truth are empty")

    pred, truth = np.asarray(pred), np.asarray(truth)
    if not ((pred >= 0).all() and (truth >= 0).all()):
        raise ValueError("probability distribution should not be negative")
    pred, truth = pred / pred.sum(), truth / truth.sum()
    return pred, truth

def jensen_shannon_div(pred, truth, base=2):
    ''' JSD: Jensen-Shannon Divergence
    '''
    pred, truth = normalize(pred, truth)
    m = 1. / 2 * (pred + truth)
    return (stats.entropy(pred, m, base=base)
            + stats.entropy(truth, m, base=base)) / 2.

def root_normalized_squared_error(pred, truth):
    """ RNSS: Root Normalised Sum of Squares
    """

    def squared_error(pred, truth):
        return ((pred - truth) ** 2).sum()

    pred, truth = normalize(pred, truth)
    return np.sqrt(squared_error(pred, truth) / 2)

def jsd_custom_loss(y_true, y_pred):
            
    # calculate loss, using y_pred
    ''' JSD: Jensen-Shannon Divergence
    '''
#     y_pred, y_true = normalize(y_pred, y_true)
    m = 1. / 2 * (y_pred + y_true)
    # loss = (stats.entropy(y_pred, m, base=2) + stats.entropy(y_true, m, base=2)) / 2.
    # tf.keras.losses.KLD()
    loss = (tf.keras.losses.KLD(y_pred, m) + tf.keras.losses.KLD(y_true, m)) / 2.
    return loss
  
# model.compile(loss=jsd_custom_loss, optimizer='adam')

def rnss_custom_loss(y_true, y_pred):
            
    # calculate loss, using y_pred
    """ RNSS: Root Normalised Sum of Squares
    """

    def squared_error(y_pred, y_true):
        return ((y_pred - y_true) ** 2).sum()

#     y_pred, y_true = normalize(y_pred, y_true)
    loss = np.sqrt(squared_error(y_pred, y_true) / 2)
    
    return loss
  
# model.compile(loss=custom_loss, optimizer='adam')

### 2. input dataset and modify

In [78]:
import os
import pandas as pd

os.chdir('C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1')
df = pd.read_excel(r'./200514_dev+train.xlsx')

In [79]:
tmp = []
for i in df['sender']:
    if i == 'customer':
        tmp.append(0)
    else:
        tmp.append(1)
df['sender_num'] = tmp
df.head()

Unnamed: 0,id,round,sender,texts,max_label,round_label,CNUG,CNUG*,CNUG0,CNaN,HNUG,HNUG*,HNaN,sender_num
0,4227729258237820,1,customer,内涵 段子 联通 皮 点赞 中国联通 中国联通 客服 掌上 营业厅 内涵 段子 话题 封 郑...,3,2,0.052632,0.0,0.157895,0.789474,0.0,0.0,0.0,0
1,4227729258237820,2,helpdesk,u,6,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105,1
2,4227729258237820,3,customer,夸夸,3,0,0.157895,0.0,0.0,0.842105,0.0,0.0,0.0,0
3,4227729258237820,4,helpdesk,*,6,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105,1
4,4121001149457180,1,customer,距离 反映 问题 已经 一个 星期 花粉 助手 D 荣耀 honor 荣耀 手机 华为 终端...,2,2,0.052632,0.0,0.789474,0.157895,0.0,0.0,0.0,0


In [80]:
df.shape

(17155, 14)

In [81]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras import optimizers
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Input
from keras.layers import Dense, Embedding
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import concatenate
from keras import initializers
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM

dev = df[0:1754]
train = df[1755:]

X_train = train.filter(['round','sender_num','texts'])
X_test = dev.filter(['round','sender_num','texts'])
y_train = train.filter(['CNUG','CNUG*','CNUG0','CNaN','HNUG','HNUG*','HNaN'])
y_test = dev.filter(['CNUG','CNUG*','CNUG0','CNaN','HNUG','HNUG*','HNaN'])

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

### 3. DNN 

In [82]:
def CNN(X_train, X_test, y_train, y_test, loss='categorical_crossentropy'):
    
    X1_train = X_train['texts']
#     X1_train = X_train[:,[-1]]
    X1_train = [str (item) for item in X1_train]
    
    X1_test = X_test['texts']
#     X1_test = X_test[:,[-1]]
    X1_test = [str (item) for item in X1_test]

    X2_train = X_train[['round','sender_num']].values
#     X2_train = X_train[:,0:7]
    X2_test = X_test[['round','sender_num']].values
#     X2_test = X_test[:,0:7]
    
    token = Tokenizer(num_words = 20000)
    token.fit_on_texts(X1_train)
    vocab = token.word_index
    print(token.document_count)

    x_train_seq = token.texts_to_sequences(X1_train)
    x_test_seq = token.texts_to_sequences(X1_test)
    X1_train = sequence.pad_sequences(x_train_seq, maxlen = 150)
    X1_test = sequence.pad_sequences(x_test_seq, maxlen = 150)

#     y_one_train = np_utils.to_categorical(y_train)
#     y_one_test = np_utils.to_categorical(y_test)
     
    num_labels = 7
    main_input = Input(shape=(150,), dtype='float64')

    sub_input = Input(shape=(2,))
    
    # pre-train embeddings
    # embedder = Embedding(len(vocab) + 1, 300, input_length = 20, weights = [embedding_matrix], trainable = False)
    # embed = embedder(main_input)
    embed = Embedding(len(vocab)+1, 300, input_length=150)(main_input)
    # filter size, region size
    cnn = Convolution1D(2, 2, padding='same', strides = 1, activation='relu')(embed)
    cnn = MaxPool1D(pool_size=4)(cnn)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)
    # main_output = Dense(num_labels, activation='sigmoid')(drop)


    dense_1 = Dense(units=256,activation='relu')(sub_input)
    drop_1 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128,activation='relu')(drop_1)
    # sub_output = Dense(units=2,activation='sigmoid')(dense_2)

    merge = concatenate([drop, dense_2])
    dense_3 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=7, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    print(model.summary())

    # checkpoint
    # filepath="C:/Users/doudi/OneDrive/Documents/TMU-GIDS/Lab/Competition/AI cup 2019/weights.best.hdf5"
    # checkpoint= ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10, 
                              batch_size=32, verbose=1, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])
    predicted = pre_probability.argmax(axis=-1)
    
    return train_history, pre_probability
    '''
    from sklearn import metrics
    print("Classification report for classifier:\n%s\n"
          % ( metrics.classification_report(y_test, predicted)))

    print("f1_score :\n%s\n" % ( metrics.f1_score(y_test, predicted, average='macro')))
    print("acc_score :\n%s\n" % ( metrics.accuracy_score(y_test, predicted)))
    '''

def lstm(X_train, X_test, y_train, y_test, loss='categorical_crossentropy'):
    X1_train = X_train['texts']
#     X1_train = X_train[:,[-1]]
    X1_train = [str (item) for item in X1_train]
    
    X1_test = X_test['texts']
#     X1_test = X_test[:,[-1]]
    X1_test = [str (item) for item in X1_test]

    X2_train = X_train[['round','sender_num']].values
#     X2_train = X_train[:,0:7]
    X2_test = X_test[['round','sender_num']].values
#     X2_test = X_test[:,0:7]
    
    token = Tokenizer(num_words = 20000)
    token.fit_on_texts(X1_train)
    print(token.document_count)

    x_train_seq = token.texts_to_sequences(X1_train)
    x_test_seq = token.texts_to_sequences(X1_test)
    X1_train = sequence.pad_sequences(x_train_seq, maxlen = 150)
    X1_test = sequence.pad_sequences(x_test_seq, maxlen = 150)

#     y_one_train = np_utils.to_categorical(y_train)
#     y_one_test = np_utils.to_categorical(y_test)
     
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))
    
    embed = Embedding(output_dim=32,input_dim=20000,input_length=150)(main_input)
    dropout_1 = Dropout(0.35)(embed)
    lst = LSTM(units=16)(dropout_1)
    dense_1 = Dense(units=256,activation='relu')(lst)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128,activation='relu')(dropout_2)
    dense_3 = Dense(units=7,activation='softmax')(dense_2)


    dense_4 = Dense(units=256,activation='relu')(sub_input)
    dropout_3 = Dropout(0.35)(dense_4)
    dense_5 = Dense(units=128,activation='relu')(dropout_3)
    # sub_output = Dense(units=2,activation='sigmoid')(dense_2)

    merge = concatenate([dense_3, dense_5])
    dense_6 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=7, activation='softmax')(dense_6)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    print(model.summary())

    # checkpoint
    # filepath="C:/Users/doudi/OneDrive/Documents/TMU-GIDS/Lab/Competition/AI cup 2019/weights.best.hdf5"
    # checkpoint= ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10, 
                              batch_size=128, verbose=1, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])
    predicted = pre_probability.argmax(axis=-1)
    
    return train_history, pre_probability
    '''
    from sklearn import metrics
    print("Classification report for classifier:\n%s\n"
          % ( metrics.classification_report(y_test, predicted)))

    print("f1_score :\n%s\n" % ( metrics.f1_score(y_test, predicted, average='macro')))
    print("acc_score :\n%s\n" % ( metrics.accuracy_score(y_test, predicted)))
    '''
    

In [83]:
import numpy as np
from tqdm import tqdm_notebook as tqdm
'''
n_folds = 3
model_history = []
for i in tqdm(range(n_folds)):
    print("Training on Fold: ",i+1)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, 
                                                        random_state = np.random.randint(1,1000, 1)[0])
                                               
    
    model_history.append(CNN(X_train, X_test, y_train, y_test))
    print("======="*12, end="\n\n\n")
'''



In [84]:
# use categorical_crossentropy as loss function
train_history , pred = CNN(X_train, X_test, y_train, y_test)

15400
Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 150, 300)     6533100     input_33[0][0]                   
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 150, 2)       1202        embedding_17[0][0]               
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, 2)            0                                            
_____________________________________________________________________________________

In [85]:
# use categorical_crossentropy as loss function
train_history_ls , pred_ls = lstm(X_train, X_test, y_train, y_test)

15400
Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_35 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 150, 32)      640000      input_35[0][0]                   
__________________________________________________________________________________________________
dropout_36 (Dropout)            (None, 150, 32)      0           embedding_18[0][0]               
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 16)           3136        dropout_36[0][0]                 
_____________________________________________________________________________________

In [86]:
# use jsd as loss function
train_history_jsd , pred_jsd = CNN(X_train, X_test, y_train, y_test, loss = jsd_custom_loss)

15400
Model: "model_19"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, 150, 300)     6533100     input_37[0][0]                   
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 150, 2)       1202        embedding_19[0][0]               
__________________________________________________________________________________________________
input_38 (InputLayer)           (None, 2)            0                                            
_____________________________________________________________________________________

In [87]:
# use jsd as loss function
train_history_ls_jsd , pred_ls_jsd = lstm(X_train, X_test, y_train, y_test, loss = jsd_custom_loss)

15400
Model: "model_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_39 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 150, 32)      640000      input_39[0][0]                   
__________________________________________________________________________________________________
dropout_41 (Dropout)            (None, 150, 32)      0           embedding_20[0][0]               
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 16)           3136        dropout_41[0][0]                 
_____________________________________________________________________________________

In [88]:
# use rnss as loss function
# train_history_rnss , pred_rnss = CNN(X_train, X_test, y_train, y_test, loss = rnss_custom_loss)

15400


AttributeError: 'Tensor' object has no attribute 'sum'

In [None]:
# use rnss as loss function
# train_history_ls_rnss , pred_ls_rnss = lstm(X_train, X_test, y_train, y_test, loss = rnss_custom_loss)

In [92]:
true = np.array(y_test)
def calculate_divergence(true, pred):
    tmp = 0
    for i in range(len(true)):
        tmp = tmp + jensen_shannon_div(pred[i], true[i])
    
    print('mean jsd :', tmp/len(true))
    
    tmp = 0
    for i in range(len(true)):
        tmp = tmp + root_normalized_squared_error(pred[i], true[i])
    
    print('mean rnss :', tmp/len(true))
    
#         print('---sentence{0}---'.format(i))
#         print('jsd :', jensen_shannon_div(pred[i], true[i]))
#         print('rnss :', root_normalized_squared_error(pred[i], true[i]))

# print('--- textCNN ---', '\n')
# calculate_divergence(true, pred)

# print('--- LSTM ---', '\n')
# calculate_divergence(true, pred_ls)

In [95]:
print('=== categorical_crossentropy ===', '\n')
print('---textCNN---')
calculate_divergence(true, pred)
print('---LSTM---')
calculate_divergence(true, pred_ls)
print('\n', '=== loss_jsd ===', '\n')
print('---textCNN---')
calculate_divergence(true, pred_jsd)
print('---LSTM---')
calculate_divergence(true, pred_ls_jsd)

=== categorical_crossentropy === 

---textCNN---
mean jsd : 0.05153927563537783
mean rnss : 0.14659849085911045
---LSTM---
mean jsd : 0.04705732779576456
mean rnss : 0.13304897800666488

 === loss_jsd === 

---textCNN---
mean jsd : 0.05118964231857076
mean rnss : 0.14899200261727294
---LSTM---
mean jsd : 0.05362496832814412
mean rnss : 0.14883823918886524
