# DialEval-1 DNN practice (with evaluation function)
### Model : CNN, LSTM
##### Inputs: round, sender, and text; predicts the label distribution
##### Edited by Weber Huang on 2020-05-28

### 1. Evaluation functions and custom loss functions

In [None]:
import numpy as np
from scipy import stats
def normalize(pred, truth):
    """Convert two equal-length sequences into normalized probability vectors.

    Args:
        pred: Non-negative weights; anything accepted by ``np.asarray``.
        truth: Non-negative weights with the same length as ``pred``.

    Returns:
        Tuple ``(pred, truth)`` of NumPy arrays, each divided by its own sum.

    Raises:
        ValueError: If the lengths differ, either input is empty, any entry
            is negative, or either input sums to zero (which would otherwise
            produce NaN through a 0/0 division).
    """
    if len(pred) != len(truth):
        raise ValueError("pred and truth have different lengths")
    if len(pred) == 0 or len(truth) == 0:
        raise ValueError("pred or truth are empty")

    pred, truth = np.asarray(pred), np.asarray(truth)
    if not ((pred >= 0).all() and (truth >= 0).all()):
        raise ValueError("probability distribution should not be negative")

    pred_total, truth_total = pred.sum(), truth.sum()
    # All entries are >= 0 here, so a zero total means an all-zero vector,
    # which cannot be rescaled into a probability distribution.
    if pred_total == 0 or truth_total == 0:
        raise ValueError("probability distribution should not sum to zero")
    return pred / pred_total, truth / truth_total

def jensen_shannon_div(pred, truth, base=2):
    """JSD: Jensen-Shannon Divergence between two distributions.

    Both inputs are first passed through ``normalize``. The result is the
    average of the two relative entropies (via ``stats.entropy``) of each
    distribution against their midpoint, using logarithm base ``base``.
    """
    p, q = normalize(pred, truth)
    midpoint = 0.5 * (p + q)
    divergence_p = stats.entropy(p, midpoint, base=base)
    divergence_q = stats.entropy(q, midpoint, base=base)
    return (divergence_p + divergence_q) / 2.

def root_normalized_squared_error(pred, truth):
    """RNSS: Root Normalised Sum of Squares.

    Normalizes both inputs with ``normalize``, sums the squared element-wise
    differences, halves that sum and returns its square root.
    """
    p, q = normalize(pred, truth)
    difference = p - q
    sum_of_squares = (difference ** 2).sum()
    return np.sqrt(sum_of_squares / 2)

def jsd_custom_loss(y_true, y_pred, base=2):
    """JSD (Jensen-Shannon Divergence) with a Keras-style argument order.

    Args:
        y_true: Ground-truth distribution.
        y_pred: Predicted distribution.
        base: Logarithm base for the entropy terms. It was previously read
            from an undefined name, so every call raised ``NameError``; it is
            now an optional parameter defaulting to 2, matching
            ``jensen_shannon_div``.

    Returns:
        The Jensen-Shannon divergence between ``y_pred`` and ``y_true``.

    NOTE(review): the computation goes through ``normalize`` and
    ``scipy.stats.entropy``, i.e. NumPy/SciPy array operations. Confirm it
    can actually receive the tensors Keras passes to a loss before using it
    in ``model.compile``.
    """
    # Same formula as the evaluation metric; delegate so the two cannot drift.
    return jensen_shannon_div(y_pred, y_true, base=base)
  
# model.compile(loss=jsd_custom_loss, optimizer='adam')

def rnss_custom_loss(y_true, y_pred):
    """RNSS (Root Normalised Sum of Squares) with a Keras-style argument order.

    Normalizes both inputs with ``normalize``, then returns the square root
    of half the summed squared differences.

    NOTE(review): this relies on NumPy operations; confirm it can receive
    the tensors Keras passes to a loss before using it in ``model.compile``.
    """
    predicted, expected = normalize(y_pred, y_true)
    residual = predicted - expected
    total_squared = (residual ** 2).sum()
    return np.sqrt(total_squared / 2)
  
# model.compile(loss=rnss_custom_loss, optimizer='adam')

### 2. Load and prepare the dataset

In [1]:
import os
import pandas as pd

# Directory that holds the DialEval-1 spreadsheet. Set the DIALEVAL_DATA_DIR
# environment variable to run on another machine; the default is the path
# originally hard-coded here.
DATA_DIR = os.environ.get(
    'DIALEVAL_DATA_DIR',
    'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1',
)
# The working directory is still switched so later relative paths behave as before.
os.chdir(DATA_DIR)
df = pd.read_excel(r'./200514_dev+train.xlsx')

In [2]:
# Encode the speaker as a binary feature: 0 for 'customer', 1 for any other
# sender. Vectorized comparison replaces the row-by-row Python loop; the
# explicit int64 keeps the dtype the list-of-ints assignment produced.
df['sender_num'] = (df['sender'] != 'customer').astype('int64')
df.head()

Unnamed: 0,id,round,sender,texts,max_label,round_label,CNUG,CNUG*,CNUG0,CNaN,HNUG,HNUG*,HNaN,sender_num
0,4227729258237820,1,customer,内涵 段子 联通 皮 点赞 中国联通 中国联通 客服 掌上 营业厅 内涵 段子 话题 封 郑...,3,2,0.052632,0.0,0.157895,0.789474,0.0,0.0,0.0,0
1,4227729258237820,2,helpdesk,u,6,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105,1
2,4227729258237820,3,customer,夸夸,3,0,0.157895,0.0,0.0,0.842105,0.0,0.0,0.0,0
3,4227729258237820,4,helpdesk,*,6,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105,1
4,4121001149457180,1,customer,距离 反映 问题 已经 一个 星期 花粉 助手 D 荣耀 honor 荣耀 手机 华为 终端...,2,2,0.052632,0.0,0.789474,0.157895,0.0,0.0,0.0,0


In [5]:
# Display the (rows, columns) shape of `df` as a quick size check.
df.shape

(17155, 14)

In [6]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras import optimizers
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Input
from keras.layers import Dense, Embedding
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import concatenate
from keras import initializers
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM

# Positional split of the combined sheet: the leading rows become the dev set
# and the trailing rows the training set.
# NOTE(review): row 1754 falls in neither slice (dev stops before it, train
# starts after it) — confirm whether skipping that row is intentional.
dev = df.iloc[:1754]
train = df.iloc[1755:]

feature_cols = ['round', 'sender_num', 'texts']
label_cols = ['CNUG', 'CNUG*', 'CNUG0', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']

X_train, y_train = train.filter(feature_cols), train.filter(label_cols)
X_test, y_test = dev.filter(feature_cols), dev.filter(label_cols)

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

### 3. DNN 

In [29]:
def CNN(X_train, X_test, y_train, y_test, loss='categorical_crossentropy'):
    """Train and evaluate a two-input CNN that predicts a 7-way distribution.

    The text column feeds an embedding + 1-D convolution branch; the
    ``round`` and ``sender_num`` columns feed a small dense branch. Both
    branches are concatenated ahead of a softmax output layer.

    Args:
        X_train: Training features; must provide the columns ``texts``,
            ``round`` and ``sender_num``.
        X_test: Evaluation features with the same columns.
        y_train: Training targets passed to ``model.fit``.
        y_test: Evaluation targets passed to ``model.evaluate``.
        loss: Loss handed to ``model.compile``. This argument was previously
            ignored (the compile call hard-coded the loss); it is now used.

    Returns:
        Tuple ``(train_history, pre_probability)``: the object returned by
        ``model.fit`` and the ``model.predict`` output for the test inputs
        (7 values per row, from the softmax output layer).
    """
    max_len = 150    # padded token-sequence length of the text branch
    num_labels = 7   # width of the softmax output layer

    # Coerce every entry to str; presumably guards against non-string cells
    # coming out of the spreadsheet before tokenization.
    X1_train = [str(item) for item in X_train['texts']]
    X1_test = [str(item) for item in X_test['texts']]

    X2_train = X_train[['round', 'sender_num']].values
    X2_test = X_test[['round', 'sender_num']].values

    # The tokenizer is fitted on the training texts only.
    token = Tokenizer(num_words=20000)
    token.fit_on_texts(X1_train)
    vocab = token.word_index
    print(token.document_count)

    X1_train = sequence.pad_sequences(token.texts_to_sequences(X1_train), maxlen=max_len)
    X1_test = sequence.pad_sequences(token.texts_to_sequences(X1_test), maxlen=max_len)

    main_input = Input(shape=(max_len,), dtype='float64')
    sub_input = Input(shape=(2,))

    # Text branch: embedding -> Conv1D (2 filters, kernel size 2) -> max-pool.
    embed = Embedding(len(vocab) + 1, 300, input_length=max_len)(main_input)
    cnn = Convolution1D(2, 2, padding='same', strides=1, activation='relu')(embed)
    cnn = MaxPool1D(pool_size=4)(cnn)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)

    # Metadata branch over (round, sender_num).
    dense_1 = Dense(units=256, activation='relu')(sub_input)
    drop_1 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(drop_1)

    merge = concatenate([drop, dense_2])
    dense_3 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=num_labels, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10,
                              batch_size=32, verbose=1, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return train_history, pre_probability

def lstm(X_train, X_test, y_train, y_test, loss='categorical_crossentropy'):
    """Train and evaluate a two-input LSTM that predicts a 7-way distribution.

    The text column feeds an embedding + LSTM branch ending in a 7-unit
    softmax; the ``round`` and ``sender_num`` columns feed a small dense
    branch. Both branches are concatenated ahead of the final softmax layer.

    Args:
        X_train: Training features; must provide the columns ``texts``,
            ``round`` and ``sender_num``.
        X_test: Evaluation features with the same columns.
        y_train: Training targets passed to ``model.fit``.
        y_test: Evaluation targets passed to ``model.evaluate``.
        loss: Loss handed to ``model.compile``. Optional; defaults to the
            previously hard-coded ``'categorical_crossentropy'`` and mirrors
            the ``CNN`` signature.

    Returns:
        Tuple ``(train_history, pre_probability)``: the object returned by
        ``model.fit`` and the ``model.predict`` output for the test inputs
        (7 values per row, from the softmax output layer).
    """
    max_len = 150        # padded token-sequence length of the text branch
    vocab_size = 20000   # shared by the tokenizer cap and the embedding table
    num_labels = 7       # width of both softmax layers

    # Coerce every entry to str; presumably guards against non-string cells
    # coming out of the spreadsheet before tokenization.
    X1_train = [str(item) for item in X_train['texts']]
    X1_test = [str(item) for item in X_test['texts']]

    X2_train = X_train[['round', 'sender_num']].values
    X2_test = X_test[['round', 'sender_num']].values

    # The tokenizer is fitted on the training texts only.
    token = Tokenizer(num_words=vocab_size)
    token.fit_on_texts(X1_train)
    print(token.document_count)

    X1_train = sequence.pad_sequences(token.texts_to_sequences(X1_train), maxlen=max_len)
    X1_test = sequence.pad_sequences(token.texts_to_sequences(X1_test), maxlen=max_len)

    main_input = Input(shape=(max_len,), dtype='float64')
    sub_input = Input(shape=(2,))

    # Text branch: embedding -> dropout -> LSTM -> dense stack -> softmax.
    embed = Embedding(output_dim=32, input_dim=vocab_size, input_length=max_len)(main_input)
    dropout_1 = Dropout(0.35)(embed)
    lst = LSTM(units=16)(dropout_1)
    dense_1 = Dense(units=256, activation='relu')(lst)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(dropout_2)
    dense_3 = Dense(units=num_labels, activation='softmax')(dense_2)

    # Metadata branch over (round, sender_num).
    dense_4 = Dense(units=256, activation='relu')(sub_input)
    dropout_3 = Dropout(0.35)(dense_4)
    dense_5 = Dense(units=128, activation='relu')(dropout_3)

    merge = concatenate([dense_3, dense_5])
    dense_6 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=num_labels, activation='softmax')(dense_6)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10,
                              batch_size=128, verbose=1, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return train_history, pre_probability
    

In [30]:
import numpy as np
from tqdm import tqdm_notebook as tqdm


# Train the CNN on the training split and keep its predictions for the dev split.
train_history, pre_probability = CNN(X_train, X_test, y_train, y_test)

# Disabled experiment: repeated random 80/20 re-splits. Kept as comments
# instead of a bare string literal so the cell no longer echoes it as output.
# NOTE(review): `X` and `Y` are not defined in the visible notebook; define
# them before re-enabling this.
#
# n_folds = 3
# model_history = []
# for i in tqdm(range(n_folds)):
#     print("Training on Fold: ",i+1)
#     from sklearn.model_selection import train_test_split
#     X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
#                                                         random_state = np.random.randint(1,1000, 1)[0])
#
#     model_history.append(CNN(X_train, X_test, y_train, y_test))
#     print("======="*12, end="\n\n\n")


15400
Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 150, 300)     6533100     input_15[0][0]                   
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 150, 2)       1202        embedding_8[0][0]                
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, 2)            0                                            
______________________________________________________________________________________



In [56]:
# Ground-truth label rows as a plain array; `o` is reused by the LSTM
# evaluation cell further down.
o = np.array(y_test)
for i in range(len(o)):
    print('---sentence%d---' % i)
    predicted_dist, true_dist = pre_probability[i], o[i]
    print('jsd :', jensen_shannon_div(predicted_dist, true_dist))
    print('rnss :', root_normalized_squared_error(predicted_dist, true_dist))

---sentence0---
jsd : 0.24452444069537138
rnss : 0.5436722021069605
---sentence1---
jsd : 0.17353624288697667
rnss : 0.410847369767361
---sentence2---
jsd : 0.2282290179903881
rnss : 0.4579709166845515
---sentence3---
jsd : 0.14097383707631456
rnss : 0.36138884536564614
---sentence4---
jsd : 0.00918333694541833
rnss : 0.03370284956174592
---sentence5---
jsd : 0.08640814595582118
rnss : 0.22996776507030414
---sentence6---
jsd : 0.021896961669985116
rnss : 0.07077562602086096
---sentence7---
jsd : 0.053550198184716395
rnss : 0.11560270930761844
---sentence8---
jsd : 0.14263044702167949
rnss : 0.2640077610154382
---sentence9---
jsd : 0.08297955604191999
rnss : 0.18087352754209787
---sentence10---
jsd : 0.1294710096234941
rnss : 0.23313840179153053
---sentence11---
jsd : 0.044507963813350856
rnss : 0.15257191921679944
---sentence12---
jsd : 0.013670349054529497
rnss : 0.09658519314521652
---sentence13---
jsd : 0.12960084408833603
rnss : 0.28070546279995756
---sentence14---
jsd : 0.01469484

jsd : 0.04187515888805521
rnss : 0.07273225408663524
---sentence473---
jsd : 0.0100214833144083
rnss : 0.06657457058896102
---sentence474---
jsd : 0.053366404938158765
rnss : 0.226321722194625
---sentence475---
jsd : 0.015889769694977303
rnss : 0.07982061809109826
---sentence476---
jsd : 0.02374809261245439
rnss : 0.1523813750598265
---sentence477---
jsd : 0.028951906864262494
rnss : 0.07461341729777352
---sentence478---
jsd : 0.010705247206387738
rnss : 0.09321640733838346
---sentence479---
jsd : 0.014990154256537658
rnss : 0.04382561596717864
---sentence480---
jsd : 0.00442373880769297
rnss : 0.06190965712391303
---sentence481---
jsd : 0.015073720591964608
rnss : 0.04546892721987095
---sentence482---
jsd : 0.04402436073010926
rnss : 0.20550467692487487
---sentence483---
jsd : 0.12742877853675086
rnss : 0.19984790966358704
---sentence484---
jsd : 0.017827488342470072
rnss : 0.10306326960804911
---sentence485---
jsd : 0.02042960445139971
rnss : 0.06888291450730966
---sentence486---
jsd

---sentence972---
jsd : 0.02293061405309653
rnss : 0.1321760035693524
---sentence973---
jsd : 0.03699848708943383
rnss : 0.1872971322770052
---sentence974---
jsd : 0.013890307107317756
rnss : 0.06091050615455993
---sentence975---
jsd : 0.05257831942025049
rnss : 0.12396195742501244
---sentence976---
jsd : 0.028410113398222123
rnss : 0.08646750176682076
---sentence977---
jsd : 0.0530780519453032
rnss : 0.20615857431644927
---sentence978---
jsd : 0.10464442006949518
rnss : 0.28557137397830745
---sentence979---
jsd : 0.10754669183769469
rnss : 0.20769652836955863
---sentence980---
jsd : 0.14663222302724355
rnss : 0.27467477789831984
---sentence981---
jsd : 0.07292696638924052
rnss : 0.20978451484432528
---sentence982---
jsd : 0.016525501649445107
rnss : 0.0886169620379023
---sentence983---
jsd : 0.13186498697993418
rnss : 0.2307955453367482
---sentence984---
jsd : 0.028977084114229085
rnss : 0.1787597162459283
---sentence985---
jsd : 0.01960035913542455
rnss : 0.07637084102815478
---sente

---sentence1472---
jsd : 0.014372471683515145
rnss : 0.10767622817706805
---sentence1473---
jsd : 0.08145036565035033
rnss : 0.14368892798275573
---sentence1474---
jsd : 0.13340013623798957
rnss : 0.3210215452298998
---sentence1475---
jsd : 0.04580218300071606
rnss : 0.19275804879262723
---sentence1476---
jsd : 0.01739725593147557
rnss : 0.06558714819162118
---sentence1477---
jsd : 0.03657907844852758
rnss : 0.17156886781046496
---sentence1478---
jsd : 0.04827258026123177
rnss : 0.1586189313858593
---sentence1479---
jsd : 0.04303117678623597
rnss : 0.21906786701905667
---sentence1480---
jsd : 0.015088702851417094
rnss : 0.028224695899774964
---sentence1481---
jsd : 0.02612578381454743
rnss : 0.121442613829077
---sentence1482---
jsd : 0.08737312386911603
rnss : 0.2632845543886523
---sentence1483---
jsd : 0.06628251182666667
rnss : 0.188262691383474
---sentence1484---
jsd : 0.21196358207749438
rnss : 0.4044283464812391
---sentence1485---
jsd : 0.01696156073277594
rnss : 0.089781005549341

In [60]:
# NOTE(review): `lstm` is also defined in an earlier cell; once this cell
# runs, this definition shadows that one. Consider keeping a single definition.
def lstm(X_train, X_test, y_train, y_test, loss='categorical_crossentropy'):
    """Train and evaluate a two-input LSTM that predicts a 7-way distribution.

    The text column feeds an embedding + LSTM branch ending in a 7-unit
    softmax; the ``round`` and ``sender_num`` columns feed a small dense
    branch. Both branches are concatenated ahead of the final softmax layer.

    Args:
        X_train: Training features; must provide the columns ``texts``,
            ``round`` and ``sender_num``.
        X_test: Evaluation features with the same columns.
        y_train: Training targets passed to ``model.fit``.
        y_test: Evaluation targets passed to ``model.evaluate``.
        loss: Loss handed to ``model.compile``. Optional; defaults to the
            previously hard-coded ``'categorical_crossentropy'`` and mirrors
            the ``CNN`` signature.

    Returns:
        Tuple ``(train_history, pre_probability)``: the object returned by
        ``model.fit`` and the ``model.predict`` output for the test inputs
        (7 values per row, from the softmax output layer).
    """
    max_len = 150        # padded token-sequence length of the text branch
    vocab_size = 20000   # shared by the tokenizer cap and the embedding table
    num_labels = 7       # width of both softmax layers

    # Coerce every entry to str; presumably guards against non-string cells
    # coming out of the spreadsheet before tokenization.
    X1_train = [str(item) for item in X_train['texts']]
    X1_test = [str(item) for item in X_test['texts']]

    X2_train = X_train[['round', 'sender_num']].values
    X2_test = X_test[['round', 'sender_num']].values

    # The tokenizer is fitted on the training texts only.
    token = Tokenizer(num_words=vocab_size)
    token.fit_on_texts(X1_train)
    print(token.document_count)

    X1_train = sequence.pad_sequences(token.texts_to_sequences(X1_train), maxlen=max_len)
    X1_test = sequence.pad_sequences(token.texts_to_sequences(X1_test), maxlen=max_len)

    main_input = Input(shape=(max_len,), dtype='float64')
    sub_input = Input(shape=(2,))

    # Text branch: embedding -> dropout -> LSTM -> dense stack -> softmax.
    embed = Embedding(output_dim=32, input_dim=vocab_size, input_length=max_len)(main_input)
    dropout_1 = Dropout(0.35)(embed)
    lst = LSTM(units=16)(dropout_1)
    dense_1 = Dense(units=256, activation='relu')(lst)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(dropout_2)
    dense_3 = Dense(units=num_labels, activation='softmax')(dense_2)

    # Metadata branch over (round, sender_num).
    dense_4 = Dense(units=256, activation='relu')(sub_input)
    dropout_3 = Dropout(0.35)(dense_4)
    dense_5 = Dense(units=128, activation='relu')(dropout_3)

    merge = concatenate([dense_3, dense_5])
    dense_6 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=num_labels, activation='softmax')(dense_6)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10,
                              batch_size=128, verbose=1, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return train_history, pre_probability
    

In [61]:
# Train the LSTM on the same split; results are stored under *_ls names so the
# CNN's `train_history` / `pre_probability` are not overwritten.
train_history_ls , pre_probability_ls = lstm(X_train, X_test, y_train, y_test)

15400
Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 150, 32)      640000      input_17[0][0]                   
__________________________________________________________________________________________________
dropout_17 (Dropout)            (None, 150, 32)      0           embedding_9[0][0]                
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 16)           3136        dropout_17[0][0]                 
______________________________________________________________________________________

In [None]:
# Per-sentence JSD / RNSS for the LSTM predictions. `o` is the ground-truth
# array created in the CNN evaluation cell, so that cell must run first.
for i in range(len(o)):
    print('---sentence%d---' % i)
    predicted_dist, true_dist = pre_probability_ls[i], o[i]
    print('jsd :', jensen_shannon_div(predicted_dist, true_dist))
    print('rnss :', root_normalized_squared_error(predicted_dist, true_dist))