# NTCIR-15 DialEval model note
###### Created by Weber Huang 2020-06-15
###### TMUNLP

## Table of contents
#### 1. JSON to dataframe
#### 2. Dataset cleaning
#### 3. Segmentation
#### 4. Generate the output of preprocessing
#### 5. Modeling
#### 6. Evaluation
#### 7. Generate the submission estimation JSON

> How to use this note?
     
+ In this IPython notebook, the models are trained automatically. Feel free to adjust the parameters of those models if you want. I suggest jumping directly to section 5.       
+ If you want to test the evaluation, here is the way:       
    1. Run the cells up to section 4 first; use **generate_dataset(name, wd, stop_word_path)** to process the raw JSON into a dataframe for both train and dev.
    2. Run sections 5 and 6. Note that the feature-preparation cells in section 5.2 use `train_c`, `train_h`, `dev_c` and `dev_h`, which are created by the sender split in section 6.1, so run that split first.  
    3. In section 6 there is a function **Generate_submission(dev, model_1, model_2)**; please make sure that both models are of the same kind, e.g. both CNN or both LSTM.   
    4. Save the submission copy and test it against the ground truth with eval.py.     
    
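> 找時間再把這隻程式包成class zzz，自定義函數太多傷眼睛請多包涵@@
> (English: I will wrap this program into a class when I find the time; apologies that the many custom functions are hard on the eyes.)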

In [1]:
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import numpy as np
from collections import Counter
from sklearn import preprocessing
import jieba
import re

### 1.JSON to dataframe

In [2]:
def get_nugget(dataframe):
    """Collect the ``'nugget'`` entry of every annotation into a new column.

    Each value of ``dataframe['annotations']`` is iterated as a sequence of
    mappings, and the value stored under the ``'nugget'`` key of each mapping
    is kept. The resulting list of lists (one inner list per row of the frame)
    is stored in ``dataframe['nuggets']``.

    The frame is modified in place and also returned.
    """
    nuggets_per_row = []
    for annotations in tqdm(dataframe['annotations']):
        nuggets_per_row.append(
            [annotation['nugget'] for annotation in annotations]
        )

    # One inner list per row, holding one 'nugget' entry per annotation.
    dataframe['nuggets'] = nuggets_per_row
    return dataframe

def shaping(dataframe):
    """Flatten the frame into one record per turn.

    Reads the ``id``, ``turns`` and ``nuggets`` columns of ``dataframe``.

    Returns
    -------
    Fin_Id : list
        Each row's id repeated once for every turn of that row.
    Fin_turns_anno : list of list
        One entry per turn: the values of the turn mapping (in the mapping's
        own key order) followed by the element at that turn's position taken
        from every item of the row's ``nuggets`` list.
    """
    turns_list = dataframe['turns'].tolist()
    length = [len(turns) for turns in tqdm(dataframe['turns'])]

    # A nested comprehension instead of sum(list_of_lists, []): the latter
    # re-copies the growing result on every addition (quadratic time).
    Fin_Id = [
        dialogue_id
        for dialogue_id, n_turns in zip(dataframe['id'].tolist(), length)
        for _ in range(n_turns)
    ]

    Fin_turns_anno = []
    for turns, nuggets in tqdm(zip(turns_list, dataframe['nuggets'])):
        for position, turn in enumerate(turns):
            Fin_turns_anno.append(
                list(turn.values()) + [labels[position] for labels in nuggets]
            )

    return Fin_Id, Fin_turns_anno

def stacking(Fin_Id, Fin_turns_anno):
    """Build the per-turn dataframe from the outputs of ``shaping``.

    Every item of ``Fin_turns_anno`` becomes one row with the columns
    ``sender``, ``utterance`` and ``n1`` .. ``n19``; the matching item of
    ``Fin_Id`` is placed in front as the ``id`` column.
    """
    annotator_cols = ['n' + str(k) for k in range(1, 20)]
    info_cols = ['sender', 'utterance'] + annotator_cols

    # Pair ids with their records first so that mismatched lengths are
    # rejected by pandas before anything else is built.
    paired = pd.DataFrame({'id': Fin_Id, 'info': Fin_turns_anno})

    train_df = pd.DataFrame(paired['info'].values.tolist(), columns=info_cols)
    train_df.insert(0, 'id', paired['id'])

    return train_df

### 2.Dataset cleaning

In [3]:
def _encode_with_sorted_classes(values, classes):
    """Return, for each value, the position of its label in ``sorted(classes)``.

    Raises ``ValueError`` when a value is not one of ``classes``.
    """
    codes = {label: code for code, label in enumerate(sorted(classes))}
    encoded = []
    for value in values:
        try:
            encoded.append(codes[value])
        except KeyError:
            raise ValueError(
                "y contains previously unseen label: {!r}".format(value)
            ) from None
    return encoded


def process_data(dataframe):
    """Derive round numbers, label distributions and encoded features.

    ``dataframe`` is expected to have the layout produced by ``stacking``:
    ``id``, ``sender``, ``utterance`` followed by the 19 annotator columns at
    positions 3..21. The frame is extended in place with the new columns.

    Added columns
    -------------
    round        1-based position of the row among the rows sharing its id.
    <nugget>     for each of the seven nugget types, the share of the 19
                 annotator columns holding that label.
    round_max    the nugget type with the largest summed share over all rows
                 that have the same ``round``.
    round_label  integer code of ``round_max`` (rank in the sorted type list).
    sender_num   0 for ``customer``, 1 for ``helpdesk``.

    Returns
    -------
    pandas.DataFrame
        An independent copy restricted to the columns used downstream.

    Raises
    ------
    ValueError
        If ``sender`` holds a value other than ``customer`` / ``helpdesk``.
    """
    Nugget_types = ['CNUG0', 'CNUG', 'CNUG*', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']
    n_annotators = 19

    # id to str
    dataframe['id'] = dataframe['id'].apply(str)

    # round: running count inside each id. cumcount follows row order within
    # every group, so the numbering stays aligned with the rows even when the
    # rows of one id are not adjacent.
    dataframe['round'] = dataframe.groupby('id').cumcount() + 1

    # distribution: share of annotators that chose each nugget type
    votes = dataframe.iloc[:, 3:22]
    for nugget in Nugget_types:
        dataframe[nugget] = votes.eq(nugget).sum(axis=1) / n_annotators

    # round_max: restrict the sum to the distribution columns so that the
    # text columns never take part in sum()/idxmax(), and look the winner up
    # by round number so that any number of rounds is supported.
    winner_by_round = (
        dataframe.groupby('round')[Nugget_types].sum().idxmax(axis=1)
    )
    dataframe['round_max'] = dataframe['round'].map(winner_by_round)

    # label encoding: codes follow the sorted order of the class names
    dataframe['round_label'] = _encode_with_sorted_classes(
        dataframe['round_max'], Nugget_types
    )
    dataframe['sender_num'] = _encode_with_sorted_classes(
        dataframe['sender'], ['customer', 'helpdesk']
    )

    # .copy() hands back an independent frame, so later column assignments by
    # the caller do not raise pandas' SettingWithCopyWarning.
    subset = dataframe[['id', 'sender', 'sender_num', 'utterance', 'round',
                        'round_max', 'round_label',
                        'CNUG0', 'CNUG', 'CNUG*', 'CNaN',
                        'HNUG', 'HNUG*', 'HNaN']].copy()

    return subset

### 3.Segmentation

In [4]:
def segment(dataframe, file_path):
    """Tokenise the ``utterance`` column and return the frame with ``texts``.

    For every utterance (converted with ``str``):

    1. it is cut with ``jieba.cut(..., cut_all=False)`` and joined by spaces;
    2. every character that is not an ASCII letter, a digit or in the range
       U+4E00..U+9FA5 is replaced by a space;
    3. tokens listed in the stop-word file are dropped;
    4. if nothing is left, the placeholder ``"*"`` is used;
    5. the remaining tokens are joined by single spaces.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``utterance`` and the other columns returned below.
    file_path : str
        UTF-8 text file with one stop word per line.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``texts`` column in place of ``utterance``. The
        input frame is not assigned to: it may be a column subset of another
        frame, where assignment triggers pandas' SettingWithCopyWarning.
    """
    non_word = re.compile("[^a-zA-Z0-9\u4e00-\u9fa5]")

    # A set gives constant-time membership tests for the per-token filter.
    with open(file_path, 'r', encoding='UTF-8') as file:
        cn_stopwords = set(file.read().splitlines())

    new_texts = []
    for line in dataframe['utterance'].astype(str):
        seg_content = ' '.join(jieba.cut(line, cut_all=False))
        # strip punctuation / symbols, then split back into tokens
        words = non_word.sub(' ', seg_content).split()
        # remove stop words
        kept = [word for word in words if word not in cn_stopwords]
        # utterances with no remaining token are represented by "*"
        new_texts.append(" ".join(kept) if kept else "*")

    columns = ['id', 'sender', 'sender_num', 'texts', 'round', 'round_max',
               'round_label', 'CNUG0', 'CNUG', 'CNUG*', 'CNaN',
               'HNUG', 'HNUG*', 'HNaN']

    return dataframe.assign(texts=new_texts)[columns].copy()

### 4.Generate the output of preprocessing

In [5]:
def generate_dataset(name, wd, stop_word_path):
    """Run the whole preprocessing pipeline on one raw JSON file.

    The process working directory is changed to ``wd`` (and stays changed),
    ``name`` is read from there with ``pd.read_json``, and the result is
    passed through ``get_nugget`` -> ``shaping`` -> ``stacking`` ->
    ``process_data`` -> ``segment``. ``stop_word_path`` is forwarded to
    ``segment``.

    Returns the frame produced by ``segment``.
    """
    os.chdir(wd)
    raw = pd.read_json(name, encoding='utf8')

    ids, turn_records = shaping(get_nugget(raw))
    per_turn = stacking(ids, turn_records)

    return segment(process_data(per_turn), stop_word_path)

###### Please pass the raw JSON file name, the working directory and the stop-word file path to generate_dataset()

In [11]:
# save file
# import time
# path = 'C:/Users/doudi/OneDrive/Documents/stc3-dataset/data/'
# timestr = time.strftime("%Y%m%d%H%M")
# output.to_csv((path + timestr + '_train_data_cn.csv'), index=False, encoding='utf_8_sig')

In [6]:
# Build the preprocessed training frame. The arguments are: the JSON file
# name, the working directory it is read from (generate_dataset changes into
# it), and the stop-word file. The paths are machine-specific; adjust them.
train = generate_dataset(r'train_cn.json',
                          'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
                         'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`





HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\doudi\AppData\Local\Temp\jieba.cache
Loading model cost 0.675 seconds.
Prefix dict has been built succesfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,sender,sender_num,texts,round,round_max,round_label,CNUG0,CNUG,CNUG*,CNaN,HNUG,HNUG*,HNaN
0,3830401296796826,customer,0,中国电信 控制箱 没有 维护 信息安全 人身安全 保障 好好 修修 应该 差钱 位于 济南市...,1,CNUG0,2,0.736842,0.052632,0.0,0.210526,0.0,0.0,0.0
1,3830401296796826,helpdesk,1,您好 反映 情况 认真 记录 会 及时 相关 部门 反馈 敬请 等待,2,HNUG,4,0.0,0.0,0.0,0.0,0.473684,0.0,0.526316
2,3830772740080373,customer,0,电信服务 广州市 白云区 太和 镇 谢家 庄 二队 电信 信号 差 投诉 几年 没人 处理 ...,1,CNUG0,2,0.736842,0.0,0.0,0.263158,0.0,0.0,0.0
3,3830772740080373,helpdesk,1,您好 中国电信 广东 客服 关注 反映 问题 请 详细描述 一下 问题 处理 谢谢,2,HNUG,4,0.0,0.0,0.0,0.0,0.526316,0.052632,0.421053
4,3830772740080373,customer,0,图片 知道 起码 通信 信号 没有 更 3G 上网 信号 一年 前 两年 前 第一次 电话 ...,3,CNUG,0,0.052632,0.736842,0.0,0.210526,0.0,0.0,0.0


In [33]:
# Build the preprocessed development frame with the same pipeline as the
# training frame. The paths are machine-specific; adjust them.
dev = generate_dataset(r'dev_cn.json',
                          'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
                         'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,sender,sender_num,texts,round,round_max,round_label,CNUG0,CNUG,CNUG*,CNaN,HNUG,HNUG*,HNaN
0,4227729258237823,customer,0,内涵 段子 联通 皮 点赞 中国联通 中国联通 客服 掌上 营业厅 内涵 段子 话题 封 郑...,1,CNUG0,2,0.157895,0.052632,0.0,0.789474,0.0,0.0,0.0
1,4227729258237823,helpdesk,1,u,2,HNUG,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105
2,4227729258237823,customer,0,夸夸,3,CNUG,0,0.0,0.157895,0.0,0.842105,0.0,0.0,0.0
3,4227729258237823,helpdesk,1,*,4,HNUG,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105
4,4121001149457182,customer,0,距离 反映 问题 已经 一个 星期 花粉 助手 D 荣耀 honor 荣耀 手机 华为 终端...,1,CNUG0,2,0.789474,0.052632,0.0,0.157895,0.0,0.0,0.0


> Check whether the files are the same 

In [25]:
# count = 0
# if output.equals(output_15_tr) == True:
#     print("DialEval-14 training data is as same as DialEval-15 training data")
# else:
#     print("DialEval-14 training data is not as same as DialEval-15 training data")

    
# output.iloc[0] == output_15_tr.iloc[0]

# # print("There are {0} rows in 2 datasets as same".format(count))
# # print("\n")
# # print("There are {0} rows in 2 datasets as different".format((len(output)-count)))

DialEval-14 training data is as same as DialEval-15 training data


### 5. Modeling

#### 5.1 Define some necessary helper functions 

In [8]:
from scipy import stats
import tensorflow as tf

def normalize(pred, truth):
    """Convert inputs to np.array and rescale each of them to sum to one.

    Parameters
    ----------
    pred, truth : sequence of numbers
        Two non-negative weight vectors of the same, non-zero length.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``pred`` and ``truth`` divided by their respective sums.

    Raises
    ------
    ValueError
        If the lengths differ, the inputs are empty, any entry is negative,
        or one of the inputs sums to zero (it cannot be rescaled; dividing
        anyway would return NaN).
    """
    if len(pred) != len(truth):
        raise ValueError("pred and truth have different lengths")
    if len(pred) == 0:
        raise ValueError("pred or truth are empty")

    pred, truth = np.asarray(pred), np.asarray(truth)
    if not ((pred >= 0).all() and (truth >= 0).all()):
        raise ValueError("probability distribution should not be negative")

    pred_total, truth_total = pred.sum(), truth.sum()
    if pred_total == 0 or truth_total == 0:
        raise ValueError("probability distribution should not sum to zero")
    return pred / pred_total, truth / truth_total


def jensen_shannon_div(pred, truth, base=2):
    """JSD: Jensen-Shannon Divergence between two distributions.

    Both inputs are passed through ``normalize`` first. ``base`` is the
    logarithm base handed to ``scipy.stats.entropy``.
    """
    pred, truth = normalize(pred, truth)
    m = 1. / 2 * (pred + truth)
    return (stats.entropy(pred, m, base=base)
            + stats.entropy(truth, m, base=base)) / 2.


def root_normalized_squared_error(pred, truth):
    """RNSS: Root Normalised Sum of Squares.

    Both inputs are passed through ``normalize`` first; the result is
    ``sqrt(sum((pred - truth) ** 2) / 2)``.
    """
    pred, truth = normalize(pred, truth)
    return np.sqrt(((pred - truth) ** 2).sum() / 2)


def jsd_custom_loss(y_true, y_pred):
    """Keras loss: Jensen-Shannon Divergence built from two KL terms.

    The inputs are used as given (they are not re-normalised here). The KL
    terms come from ``tf.keras.losses.KLD``.

    Usage: ``model.compile(loss=jsd_custom_loss, optimizer='adam')``
    """
    m = 1. / 2 * (y_pred + y_true)
    loss = (tf.keras.losses.KLD(y_pred, m) + tf.keras.losses.KLD(y_true, m)) / 2.
    return loss


def rnss_custom_loss(y_true, y_pred):
    """Keras loss: Root Normalised Sum of Squares.

    Computed with TensorFlow ops so that it can be applied to the symbolic
    tensors Keras passes to a loss function (NumPy functions and the
    ``.sum()`` method are not usable on those). The squared error is summed
    over the last axis, giving one value per sample.

    Usage: ``model.compile(loss=rnss_custom_loss, optimizer='adam')``
    """
    squared_error = tf.reduce_sum(tf.square(y_pred - y_true), axis=-1)
    # Clip away from zero before the square root: the derivative of sqrt is
    # unbounded at 0, which would otherwise produce NaN gradients for a
    # perfect prediction.
    return tf.sqrt(tf.maximum(squared_error / 2., tf.keras.backend.epsilon()))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras import optimizers
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Input
from keras.layers import Dense, Embedding
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import concatenate
from keras import initializers
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM

Using TensorFlow backend.


#### 5.2 Features preparation

In [328]:
# === customer
# NOTE: train_c and dev_c are created by the "split from sender" cell in
# section 6.1; that cell has to be executed before this one.
# Input features: the segmented text plus the two numeric columns.
c_X_train = train_c.filter(['round','round_label','texts'])
c_X_test = dev_c.filter(['round','round_label','texts'])
    

# Targets: the four customer nugget-type columns.
y_train_c = train_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])
y_test_c = dev_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])

# y_train_h = train.filter(['HNUG','HNUG*','HNaN'])
# y_test_h = dev.filter(['HNUG','HNUG*','HNaN'])

# X1 = text input (tokenised and padded below)
c_X1_train = c_X_train['texts']
# c_X1_train = [str (item) for item in c_X1_train]
c_X1_test = c_X_test['texts']
# c_X1_test = [str (item) for item in c_X1_test]

# X2 = numeric side input (round, round_label)
c_X2_train = c_X_train[['round','round_label']].values
c_X2_test = c_X_test[['round','round_label']].values

# The tokenizer is fitted on the customer training texts only; c_token and
# c_vocab are reused later by CNN_C and padding_single_c.
c_token = Tokenizer(num_words = 20000)
c_token.fit_on_texts(c_X1_train)
c_vocab = c_token.word_index
print(c_token.document_count)

# Convert texts to index sequences and pad them to length 150, the input
# length the models below are built with.
c_x_train_seq = c_token.texts_to_sequences(c_X1_train)
c_x_test_seq = c_token.texts_to_sequences(c_X1_test)
c_X1_train = sequence.pad_sequences(c_x_train_seq, maxlen = 150)
c_X1_test = sequence.pad_sequences(c_x_test_seq, maxlen = 150)

8500


In [None]:
# === helpdesk
# NOTE: train_h and dev_h are created by the "split from sender" cell in
# section 6.1; that cell has to be executed before this one.
# Input features: the segmented text plus the two numeric columns.
h_X_train = train_h.filter(['round','round_label','texts'])
h_X_test = dev_h.filter(['round','round_label','texts'])
    

# y_train_c = train_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])
# y_test_c = dev_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])

# Targets: the three helpdesk nugget-type columns.
y_train_h = train_h.filter(['HNUG','HNUG*','HNaN'])
y_test_h = dev_h.filter(['HNUG','HNUG*','HNaN'])

# X1 = text input (tokenised and padded below)
h_X1_train = h_X_train['texts']
# h_X1_train = [str (item) for item in h_X1_train]
h_X1_test = h_X_test['texts']
# h_X1_test = [str (item) for item in h_X1_test]

# X2 = numeric side input (round, round_label)
h_X2_train = h_X_train[['round','round_label']].values
h_X2_test = h_X_test[['round','round_label']].values

# The tokenizer is fitted on the helpdesk training texts only; h_token and
# h_vocab are reused later by CNN_H and padding_single_h.
h_token = Tokenizer(num_words = 20000)
h_token.fit_on_texts(h_X1_train)
h_vocab = h_token.word_index
print(h_token.document_count)

# Convert texts to index sequences and pad them to length 150, the input
# length the models below are built with.
h_x_train_seq = h_token.texts_to_sequences(h_X1_train)
h_x_test_seq = h_token.texts_to_sequences(h_X1_test)
h_X1_train = sequence.pad_sequences(h_x_train_seq, maxlen = 150)
h_X1_test = sequence.pad_sequences(h_x_test_seq, maxlen = 150)

#### 5.3 textCNN for customer and helpdesk respectively

In [398]:
# === customer
def CNN_C(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy'):
    """Build, train and evaluate the two-input text-CNN for customer turns.

    Parameters
    ----------
    X1_train, X1_test : padded index sequences of length 150 (text input).
    X2_train, X2_test : arrays with 2 values per row (side input).
    y_train, y_test : targets with 4 values per row.
    loss : loss passed to ``model.compile``.

    The embedding size is taken from the module-level ``c_vocab``. Training
    runs for 10 epochs with batch size 64 and the last 20% of the training
    data held out for validation; the test loss and accuracy are printed.

    Returns
    -------
    (model, train_history, pre_probability)
        The fitted model, the object returned by ``model.fit`` and the
        predicted distributions for the test inputs.
    """
    num_labels = 4
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # Text branch: embedding -> Conv1D (2 filters, kernel size 2) -> pooling.
    embed = Embedding(len(c_vocab)+1, 300, input_length=150)(main_input)
    cnn = Convolution1D(2, 2, padding='same', strides=1, activation='relu')(embed)
    cnn = MaxPool1D(pool_size=4)(cnn)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)

    # Side branch: two dense layers on the numeric features.
    dense_1 = Dense(units=256, activation='relu')(sub_input)
    drop_1 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(drop_1)

    merge = concatenate([drop, dense_2])
    dense_3 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=num_labels, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in
    # print() (that would add a stray "None" line).
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10,
                              batch_size=64, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

In [399]:
# === helpdesk
def CNN_H(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy'):
    """Build, train and evaluate the two-input text-CNN for helpdesk turns.

    Parameters
    ----------
    X1_train, X1_test : padded index sequences of length 150 (text input).
    X2_train, X2_test : arrays with 2 values per row (side input).
    y_train, y_test : targets with 3 values per row.
    loss : loss passed to ``model.compile``.

    The embedding size is taken from the module-level ``h_vocab``. Training
    runs for 10 epochs with batch size 64 and the last 20% of the training
    data held out for validation; the test loss and accuracy are printed.

    Returns
    -------
    (model, train_history, pre_probability)
        The fitted model, the object returned by ``model.fit`` and the
        predicted distributions for the test inputs.
    """
    # The output layer has 3 units; the former (unused) local said 2.
    num_labels = 3
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # Text branch: embedding -> Conv1D (2 filters, kernel size 2) -> pooling.
    embed = Embedding(len(h_vocab)+1, 300, input_length=150)(main_input)
    cnn = Convolution1D(2, 2, padding='same', strides=1, activation='relu')(embed)
    cnn = MaxPool1D(pool_size=4)(cnn)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)

    # Side branch: two dense layers on the numeric features.
    dense_1 = Dense(units=256, activation='relu')(sub_input)
    drop_1 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(drop_1)

    merge = concatenate([drop, dense_2])
    dense_3 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=num_labels, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in
    # print() (that would add a stray "None" line).
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10,
                              batch_size=64, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

#### 5.4 LSTM for customer and helpdesk respectively

In [400]:
# === customer
def lstm_C(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy'):
    """Build, train and evaluate the two-input LSTM model for customer turns.

    Parameters
    ----------
    X1_train, X1_test : padded index sequences of length 150 (text input).
    X2_train, X2_test : arrays with 2 values per row (side input).
    y_train, y_test : targets with 4 values per row.
    loss : loss passed to ``model.compile``.

    Training runs for 10 epochs with batch size 64 and the last 20% of the
    training data held out for validation; the test loss and accuracy are
    printed.

    Returns
    -------
    (model, train_history, pre_probability)
        The fitted model, the object returned by ``model.fit`` and the
        predicted distributions for the test inputs.
    """
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # Text branch: embedding (20000 indices -> 32 dims) -> LSTM -> dense
    # stack ending in a 4-way softmax.
    embed = Embedding(output_dim=32, input_dim=20000, input_length=150)(main_input)
    dropout_1 = Dropout(0.35)(embed)
    lst = LSTM(units=16)(dropout_1)
    dense_1 = Dense(units=256, activation='relu')(lst)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(dropout_2)
    dense_3 = Dense(units=4, activation='softmax')(dense_2)

    # Side branch: two dense layers on the numeric features.
    dense_4 = Dense(units=256, activation='relu')(sub_input)
    dropout_3 = Dropout(0.35)(dense_4)
    dense_5 = Dense(units=128, activation='relu')(dropout_3)

    merge = concatenate([dense_3, dense_5])
    dense_6 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=4, activation='softmax')(dense_6)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in
    # print() (that would add a stray "None" line).
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10,
                              batch_size=64, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

In [401]:
# === helpdesk
def lstm_H(X1_train, X2_train, X1_test, X2_test, y_train, y_test, loss='categorical_crossentropy'):
    """Build, train and evaluate the two-input LSTM model for helpdesk turns.

    Parameters
    ----------
    X1_train, X1_test : padded index sequences of length 150 (text input).
    X2_train, X2_test : arrays with 2 values per row (side input).
    y_train, y_test : targets with 3 values per row.
    loss : loss passed to ``model.compile``.

    Training runs for 10 epochs with batch size 64 and the last 20% of the
    training data held out for validation; the test loss and accuracy are
    printed.

    Returns
    -------
    (model, train_history, pre_probability)
        The fitted model, the object returned by ``model.fit`` and the
        predicted distributions for the test inputs.
    """
    main_input = Input(shape=(150,), dtype='float64')
    sub_input = Input(shape=(2,))

    # Text branch: embedding (20000 indices -> 32 dims) -> LSTM -> dense
    # stack ending in a 3-way softmax.
    embed = Embedding(output_dim=32, input_dim=20000, input_length=150)(main_input)
    dropout_1 = Dropout(0.35)(embed)
    lst = LSTM(units=16)(dropout_1)
    dense_1 = Dense(units=256, activation='relu')(lst)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=128, activation='relu')(dropout_2)
    dense_3 = Dense(units=3, activation='softmax')(dense_2)

    # Side branch: two dense layers on the numeric features.
    dense_4 = Dense(units=256, activation='relu')(sub_input)
    dropout_3 = Dropout(0.35)(dense_4)
    dense_5 = Dense(units=128, activation='relu')(dropout_3)

    merge = concatenate([dense_3, dense_5])
    dense_6 = Dense(units=10, activation='relu')(merge)
    output = Dense(units=3, activation='softmax')(dense_6)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in
    # print() (that would add a stray "None" line).
    model.summary()

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=10,
                              batch_size=64, verbose=2, validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

### 6. Evaluation

#### 6.1 Subsetting training

In [402]:
# split from sender
train_c = train[train.sender=='customer']
train_h = train[train.sender=='helpdesk']
dev_c = dev[dev.sender=='customer']
dev_h = dev[dev.sender=='helpdesk']

In [403]:
# Train the customer text-CNN with the JSD loss on the customer features.
CNN_c_model, CNN_c_history, CNN_c_pred = CNN_C(c_X1_train, c_X2_train, c_X1_test, 
                                               c_X2_test, y_train_c, y_test_c, 
                                               loss = jsd_custom_loss)

Model: "model_22"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_45 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, 150, 300)     5866800     input_45[0][0]                   
__________________________________________________________________________________________________
conv1d_18 (Conv1D)              (None, 150, 2)       1202        embedding_22[0][0]               
__________________________________________________________________________________________________
input_46 (InputLayer)           (None, 2)            0                                            
___________________________________________________________________________________________

In [406]:
# Train the helpdesk text-CNN with the JSD loss on the helpdesk features.
CNN_h_model, CNN_h_history, CNN_h_pred = CNN_H(h_X1_train, h_X2_train, h_X1_test, 
                                               h_X2_test, y_train_h, y_test_h, 
                                               loss = jsd_custom_loss)

Model: "model_25"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_51 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_25 (Embedding)        (None, 150, 300)     1982100     input_51[0][0]                   
__________________________________________________________________________________________________
conv1d_19 (Conv1D)              (None, 150, 2)       1202        embedding_25[0][0]               
__________________________________________________________________________________________________
input_52 (InputLayer)           (None, 2)            0                                            
___________________________________________________________________________________________

In [405]:
# Train the customer LSTM model with the JSD loss on the customer features.
lstm_c_model, lstm_c_history, lstm_c_pred = lstm_C(c_X1_train, c_X2_train, c_X1_test, 
                                               c_X2_test, y_train_c, y_test_c, 
                                               loss = jsd_custom_loss)

Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_49 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 150, 32)      640000      input_49[0][0]                   
__________________________________________________________________________________________________
dropout_52 (Dropout)            (None, 150, 32)      0           embedding_24[0][0]               
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (None, 16)           3136        dropout_52[0][0]                 
___________________________________________________________________________________________

In [404]:
# Train the helpdesk LSTM model with the JSD loss on the helpdesk features.
lstm_h_model, lstm_h_history, lstm_h_pred = lstm_H(h_X1_train, h_X2_train, h_X1_test, 
                                               h_X2_test, y_train_h, y_test_h,  
                                               loss = jsd_custom_loss)

Model: "model_23"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_47 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_23 (Embedding)        (None, 150, 32)      640000      input_47[0][0]                   
__________________________________________________________________________________________________
dropout_49 (Dropout)            (None, 150, 32)      0           embedding_23[0][0]               
__________________________________________________________________________________________________
lstm_5 (LSTM)                   (None, 16)           3136        dropout_49[0][0]                 
___________________________________________________________________________________________

In [397]:
# Sanity check: shape of the padded helpdesk training text input.
h_X1_train.shape

(6900, 150)

#### 6.2 Output by rbind

In [339]:
# predict single row
def padding_single_c(dev):
    """Prepare one customer row for prediction.

    ``dev`` is handled as a single row (``Generate_submission`` passes
    ``dev.iloc[i]``): its ``texts`` value is wrapped in a one-element list,
    converted with the customer tokenizer ``c_token`` and padded to length
    150.

    Returns
    -------
    (X1_test, X2_test)
        The padded index sequences and the ``round`` / ``round_label``
        values of the row.
    """
    row = dev.filter(['round','round_label','texts'])
    text = row['texts']
    side_features = row[['round','round_label']].values

    # A list with a single text -> a batch with a single sequence.
    token_ids = c_token.texts_to_sequences([text])
    padded = sequence.pad_sequences(token_ids, maxlen = 150)

    return padded, side_features

def padding_single_h(dev):
    """Prepare one helpdesk row for prediction.

    ``dev`` is handled as a single row (``Generate_submission`` passes
    ``dev.iloc[i]``): its ``texts`` value is wrapped in a one-element list,
    converted with the helpdesk tokenizer ``h_token`` and padded to length
    150.

    Returns
    -------
    (X1_test, X2_test)
        The padded index sequences and the ``round`` / ``round_label``
        values of the row.
    """
    row = dev.filter(['round','round_label','texts'])
    text = row['texts']
    side_features = row[['round','round_label']].values

    # A list with a single text -> a batch with a single sequence.
    token_ids = h_token.texts_to_sequences([text])
    padded = sequence.pad_sequences(token_ids, maxlen = 150)

    return padded, side_features

In [409]:
from itertools import chain

# input the development dataframe and the method
# for current models (loss = jsd):
# model_1 = customer model (CNN_c_model, lstm_c_model)
# model_2 = helpdesk model (CNN_h_model, lstm_h_model)
def Generate_submission(dev, model_1, model_2):
    """Predict a nugget distribution for every row and group them by id.

    Parameters
    ----------
    dev : pandas.DataFrame
        Frame with at least ``id``, ``sender``, ``texts``, ``round`` and
        ``round_label``. Its index may be arbitrary (e.g. a filtered frame).
    model_1 : model used for rows whose ``sender`` is ``'customer'``.
    model_2 : model used for all other rows.

    Returns
    -------
    list of dict
        One ``{'nugget': [...], 'id': ...}`` entry per distinct id, in order
        of first appearance. ``'nugget'`` holds one dict per row of that id
        (in row order) mapping the nugget-type names to the predicted values.
    """
    C_nugget = ['CNUG','CNUG*','CNUG0','CNaN']
    H_nugget = ['HNUG','HNUG*','HNaN']

    final = []

    # groupby visits every row once (instead of rescanning the whole frame
    # for each id) and does not depend on the index labels being 0..n-1.
    # sort=False keeps the ids in order of first appearance.
    for Id, dialogue in tqdm(dev.groupby('id', sort=False)):
        result = []

        for _, row in dialogue.iterrows():
            # Select by column name rather than by column position.
            if row['sender'] == 'customer':
                text_input, side_input = padding_single_c(row)
                model, labels = model_1, C_nugget
            else:
                text_input, side_input = padding_single_h(row)
                model, labels = model_2, H_nugget

            side_input = np.array(side_input).reshape(1, 2)
            # predict() returns one row per input; a single row was passed.
            prob = model.predict(x=[text_input, side_input])[0].tolist()
            result.append(dict(zip(labels, prob)))

        # Submission form
        final.append({'nugget': result, 'id': Id})

    return final

In [410]:
# Keep the result: the JSON export cell in section 7 serialises the variable
# `final`. The bare name on the last line still displays it in the notebook.
final = Generate_submission(dev, CNN_c_model, CNN_h_model)
final

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




[{'nugget': [{'CNUG': 0.007560058496892452,
    'CNUG*': 0.00039969218778423965,
    'CNUG0': 0.7787271738052368,
    'CNaN': 0.213313028216362},
   {'HNUG': 0.680847704410553,
    'HNUG*': 0.049050670117139816,
    'HNaN': 0.2701016068458557},
   {'CNUG': 0.6138875484466553,
    'CNUG*': 0.04052926227450371,
    'CNUG0': 0.011808176524937153,
    'CNaN': 0.33377501368522644},
   {'HNUG': 0.6252146363258362,
    'HNUG*': 0.05559347942471504,
    'HNaN': 0.3191918730735779}],
  'id': '4227729258237823'},
 {'nugget': [{'CNUG': 0.005607348866760731,
    'CNUG*': 0.00021252661827020347,
    'CNUG0': 0.7731326222419739,
    'CNaN': 0.22104743123054504},
   {'HNUG': 0.5048636794090271,
    'HNUG*': 0.3999703526496887,
    'HNaN': 0.09516590088605881},
   {'CNUG': 0.6797991991043091,
    'CNUG*': 0.02557622641324997,
    'CNUG0': 0.0061413682997226715,
    'CNaN': 0.2884831726551056},
   {'HNUG': 0.6252146363258362,
    'HNUG*': 0.05559347942471504,
    'HNaN': 0.3191918730735779},
   {'CNUG'

### 7. Generate the submission estimation JSON

In [373]:
import json
import time

# Output directory for the submission file (machine-specific; adjust it).
# The working directory is switched to it, so the file name used when writing
# the JSON is resolved relative to this folder.
path = 'C:/Users/doudi/OneDrive/Documents/ntcir15/eval'
os.chdir(path)
# Timestamp (YYYYmmddHHMM) used as the file-name prefix.
timestr = time.strftime("%Y%m%d%H%M")

In [374]:
# Write the submission as pretty-printed UTF-8 JSON into the current working
# directory. NOTE(review): `final` is not assigned in any visible cell;
# presumably it is the list returned by Generate_submission — confirm.
with open((timestr + '_' + 'dev_eval.json'), 'w', encoding='utf-8') as f: 
    f.write(json.dumps(final, ensure_ascii=False, indent=2))

>> Below is the debugging zone; please ignore it.