# Dialogue Evaluation
###### Created by Weber Huang 

## Table of contents
#### 0. Embedding
#### 1. JSON to dataframe
#### 2. Dataset cleaning
#### 3. Segmentation
#### 4. Generate the output of preprocessing
#### 5. Modeling
#### 6. Evaluation
#### 7. Generate the submission estimation JSON

> How to use this note?
     
+ In this IPython notebook, the models are trained automatically. Feel free to adjust the model parameters if you want. I suggest jumping directly to section 5.
+ If you want to test the evaluation, here is the way:       
    1. Run the cells in sections 1 to 4 first, then use **generate_dataset(name, wd, stop_word_path)** for the training and development sets and **test_preprocess(name, wd, stop_word_path)** for the test set to convert the raw JSON into dataframes.
    2. Run section 5 and 6.  
    3. In section 6 there is a function **Generate_submission(dev, model_1, model_2)**; please make sure both models are of the same type (e.g. both CNN or both LSTM).
    4. Save the submission copy and test it with ground_truth through eval.py     
    

### 0.Embedding

In [22]:
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import numpy as np
from collections import Counter
from sklearn import preprocessing
import jieba
import re

### 1.JSON to dataframe

In [2]:
def get_nugget(dataframe):
    """Attach the nugget labels of every annotation as a ``nuggets`` column.

    Each entry of ``dataframe['annotations']`` is iterated as a collection of
    annotation dicts; the value stored under each dict's ``'nugget'`` key is
    collected, giving one inner list per dialogue (in annotation order).
    The list of lists is written to ``dataframe['nuggets']`` in place and the
    same dataframe is returned.

    Raises:
        KeyError: if an annotation has no ``'nugget'`` entry.
    """
    nuggets_per_dialogue = [
        [annotation['nugget'] for annotation in annotations]
        for annotations in tqdm(dataframe['annotations'])
    ]

    # One list of nugget annotations per dialogue row.
    dataframe['nuggets'] = nuggets_per_dialogue
    return dataframe

def shaping(dataframe):
    """Flatten dialogues into one record per turn.

    For every row of ``dataframe`` the dialogue id is repeated once per turn,
    and each turn becomes a flat list made of the turn dict's values (in the
    dict's own order) followed by the label at that turn's position from every
    entry of the row's ``nuggets`` list.

    Args:
        dataframe: frame with ``id``, ``turns`` and ``nuggets`` columns.

    Returns:
        tuple: ``(Fin_Id, Fin_turns_anno)`` -- two lists of equal length, one
        element per turn, in dialogue order.
    """
    Fin_Id = []
    Fin_turns_anno = []

    dialogues = zip(dataframe['id'].tolist(),
                    dataframe['turns'].tolist(),
                    dataframe['nuggets'])
    for dialogue_id, turns, annotations in tqdm(dialogues, total=len(dataframe)):
        # extend() keeps this linear; sum(list_of_lists, []) re-copies the
        # accumulated list on every addition.
        Fin_Id.extend([dialogue_id] * len(turns))
        for position, turn in enumerate(turns):
            labels = [annotator_labels[position] for annotator_labels in annotations]
            Fin_turns_anno.append(list(turn.values()) + labels)

    return Fin_Id, Fin_turns_anno

def stacking(Fin_Id, Fin_turns_anno):
    """Build the per-turn dataframe from flattened ids and turn records.

    Every record in ``Fin_turns_anno`` is spread over the columns ``sender``,
    ``utterance`` and ``n1`` .. ``n19``; the matching entry of ``Fin_Id``
    becomes the leading ``id`` column.

    Returns:
        pandas.DataFrame with columns ``id, sender, utterance, n1..n19``.
    """
    annotator_cols = [f'n{k}' for k in range(1, 20)]
    record_cols = ['sender', 'utterance', *annotator_cols]

    paired = pd.DataFrame({'id': Fin_Id, 'info': Fin_turns_anno})
    wide = pd.DataFrame(paired['info'].values.tolist(), columns=record_cols)
    wide['id'] = paired['id']

    # Move the id column to the front.
    return wide[['id', *record_cols]]

### 2.Dataset cleaning

In [3]:
def process_data(dataframe):
    """Derive per-turn features and label distributions from the stacked frame.

    The input is expected to have the layout produced by ``stacking``:
    ``id``, ``sender``, ``utterance`` followed by the 19 annotator columns at
    positions 3..21.  The following columns are added to ``dataframe`` in
    place:

    * ``id`` is converted to ``str``;
    * ``round`` is the 1-based position of the turn among the rows that share
      its ``id``;
    * one column per nugget type holding the share of the 19 annotator
      columns equal to that type;
    * ``sender_num`` encodes ``customer`` as 0 and ``helpdesk`` as 1.

    Returns:
        pandas.DataFrame: an independent copy restricted to the feature and
        label columns, so later column assignments do not trigger pandas'
        SettingWithCopyWarning.

    Raises:
        ValueError: if ``sender`` holds a value other than ``customer`` or
            ``helpdesk``.
    """
    nugget_types = ['CNUG0', 'CNUG', 'CNUG*', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']
    n_annotators = 19
    sender_codes = {'customer': 0, 'helpdesk': 1}

    # id to str
    dataframe['id'] = dataframe['id'].apply(str)

    # round: cumcount numbers the rows inside each id group and keeps the
    # result aligned with the row it belongs to, even when the rows of one
    # dialogue are not adjacent.
    dataframe['round'] = dataframe.groupby('id').cumcount() + 1

    # distribution: share of annotators that chose each nugget type.
    # All shares are computed before any column is added to the frame.
    votes = dataframe.iloc[:, 3:3 + n_annotators]
    shares = {label: (votes == label).sum(axis=1) / n_annotators
              for label in nugget_types}
    for label, share in shares.items():
        dataframe[label] = share

    # NOTE: a per-round majority label ('round_max' / 'round_label') was
    # prototyped here earlier and is intentionally not computed.

    # label encoding (sender_num)
    sender_num = dataframe['sender'].map(sender_codes)
    unknown = sender_num.isna()
    if unknown.any():
        bad_values = sorted(set(map(str, dataframe.loc[unknown, 'sender'])))
        raise ValueError("unknown sender value(s): %s" % bad_values)
    dataframe['sender_num'] = sender_num.astype(int)

    subset = dataframe[['id', 'sender', 'sender_num', 'utterance', 'round',
                        'CNUG0', 'CNUG', 'CNUG*', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']]

    return subset.copy()

### 3.Segmentation

In [4]:
def segment(dataframe, file_path):
    """Tokenise the utterances and store the cleaned text in ``texts``.

    For every utterance (cast to ``str``):

    1. segment it with ``jieba.cut`` (accurate mode) and join on spaces;
    2. replace every character that is not an ASCII letter, a digit or in the
       range U+4E00..U+9FA5 with a space and split on whitespace;
    3. drop the tokens listed in the stop-word file (one word per line);
    4. join the remaining tokens with single spaces; if nothing is left the
       text becomes ``"*"``.

    The result is written to ``dataframe['texts']`` in place.

    Args:
        dataframe: frame holding ``utterance`` plus the feature/label columns.
        file_path: path of a UTF-8 stop-word file.

    Returns:
        pandas.DataFrame: an independent copy restricted to the id, sender,
        text, round and nugget-distribution columns.
    """
    # Compiled once instead of once per line.
    non_word = re.compile("[^a-zA-Z0-9\u4e00-\u9fa5]")

    # A set gives constant-time stop-word lookups.
    with open(file_path, 'r', encoding='UTF-8') as file:
        cn_stopwords = set(file.read().splitlines())

    new_texts = []
    for line in dataframe['utterance'].astype(str):
        seg_content = ' '.join(jieba.cut(line, cut_all=False))
        tokens = non_word.sub(' ', seg_content).split()
        kept = [word for word in tokens if word not in cn_stopwords]
        # Utterances with no remaining token are marked with "*".
        new_texts.append(" ".join(kept) if kept else "*")

    dataframe['texts'] = new_texts

    subset = dataframe[['id', 'sender', 'sender_num', 'texts', 'round',
                        'CNUG0', 'CNUG', 'CNUG*', 'CNaN', 'HNUG', 'HNUG*', 'HNaN']]

    return subset.copy()

### 4.Generate the output of preprocessing

In [5]:
# from raw_json to dataframe
def generate_dataset(name, wd, stop_word_path):
    """Run the full preprocessing pipeline on an annotated raw JSON file.

    Changes the process working directory to ``wd``, reads ``name`` from
    there, then chains ``get_nugget`` -> ``shaping`` -> ``stacking`` ->
    ``process_data`` -> ``segment``.

    Args:
        name: JSON file name, resolved relative to ``wd``.
        wd: working directory to switch to (the change persists after return).
        stop_word_path: stop-word file forwarded to ``segment``.

    Returns:
        The dataframe returned by ``segment``.
    """
    os.chdir(wd)
    raw = pd.read_json(name, encoding='utf8')
    flat_ids, flat_rows = shaping(get_nugget(raw))
    stacked = stacking(flat_ids, flat_rows)
    return segment(process_data(stacked), stop_word_path)

In [6]:
# This is for test data preprocess only!
def pre_test(dataframe):
    """Flatten un-annotated (test) dialogues into one row per turn.

    Each turn dict's values become the ``sender`` and ``utterance`` columns,
    the dialogue id is repeated per turn and converted to ``str``, ``round``
    is the 1-based position of the turn among the rows sharing its id, and
    ``sender_num`` encodes ``customer`` as 0 and ``helpdesk`` as 1.

    Args:
        dataframe: frame with ``id`` and ``turns`` columns.

    Returns:
        pandas.DataFrame: independent frame with columns
        ``id, sender, sender_num, utterance, round``.

    Raises:
        ValueError: if a sender is neither ``customer`` nor ``helpdesk``.
    """
    sender_codes = {'customer': 0, 'helpdesk': 1}

    Fin_Id = []
    Fin_turns = []
    dialogues = zip(dataframe['id'].tolist(), dataframe['turns'].tolist())
    for dialogue_id, turns in tqdm(dialogues, total=len(dataframe)):
        # extend() keeps this linear; sum(list_of_lists, []) is quadratic.
        Fin_Id.extend([dialogue_id] * len(turns))
        Fin_turns.extend(list(turn.values()) for turn in turns)

    train_df = pd.DataFrame(Fin_turns, columns=['sender', 'utterance'])

    # id to str
    train_df['id'] = [str(dialogue_id) for dialogue_id in Fin_Id]

    # round: numbering within each id group, aligned with the row itself
    train_df['round'] = train_df.groupby('id').cumcount() + 1

    # label encoding (sender_num)
    sender_num = train_df['sender'].map(sender_codes)
    unknown = sender_num.isna()
    if unknown.any():
        bad_values = sorted(set(map(str, train_df.loc[unknown, 'sender'])))
        raise ValueError("unknown sender value(s): %s" % bad_values)
    train_df['sender_num'] = sender_num.astype(int)

    subset = train_df[['id', 'sender', 'sender_num', 'utterance', 'round']]
    return subset.copy()

def segment_2(dataframe, file_path):
    """Tokenise test-set utterances and store the cleaned text in ``texts``.

    Same text pipeline as ``segment`` but without label columns: segment with
    ``jieba.cut`` (accurate mode), replace every character that is not an
    ASCII letter, a digit or in U+4E00..U+9FA5 with a space, drop the tokens
    listed in the stop-word file, and join what is left with single spaces.
    Utterances with no remaining token become ``"*"``.

    The result is written to ``dataframe['texts']`` in place.

    Args:
        dataframe: frame with ``id, sender, sender_num, utterance, round``.
        file_path: path of a UTF-8 stop-word file (one word per line).

    Returns:
        pandas.DataFrame: independent copy with columns
        ``id, sender, sender_num, texts, round``.
    """
    # Compiled once instead of once per line.
    non_word = re.compile("[^a-zA-Z0-9\u4e00-\u9fa5]")

    # A set gives constant-time stop-word lookups.
    with open(file_path, 'r', encoding='UTF-8') as file:
        cn_stopwords = set(file.read().splitlines())

    new_texts = []
    for line in dataframe['utterance'].astype(str):
        seg_content = ' '.join(jieba.cut(line, cut_all=False))
        tokens = non_word.sub(' ', seg_content).split()
        kept = [word for word in tokens if word not in cn_stopwords]
        # Utterances with no remaining token are marked with "*".
        new_texts.append(" ".join(kept) if kept else "*")

    dataframe['texts'] = new_texts
    subset = dataframe[['id', 'sender', 'sender_num', 'texts', 'round']]
    return subset.copy()

def test_preprocess(name, wd, stop_word_path):
    """Preprocess an un-annotated (test) raw JSON file.

    Changes the process working directory to ``wd``, reads ``name`` from
    there and applies ``pre_test`` followed by ``segment_2``.

    Returns:
        The dataframe returned by ``segment_2``.
    """
    os.chdir(wd)
    raw = pd.read_json(name, encoding='utf8')
    return segment_2(pre_test(raw), stop_word_path)

###### Please pass the raw JSON file name, the working directory and the stop-word file path to generate_dataset()

In [78]:
# save file
# import time
# path = 'C:/Users/doudi/OneDrive/Documents/stc3-dataset/data/'
# timestr = time.strftime("%Y%m%d%H%M")
# output.to_csv((path + timestr + '_train_data_cn.csv'), index=False, encoding='utf_8_sig')

In [7]:
# Build the training frame from the raw JSON (paths are machine-specific).
train = generate_dataset(
    name=r'train_cn.json',
    wd='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
    stop_word_path='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt',
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`





HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\doudi\AppData\Local\Temp\jieba.cache
Loading model cost 0.691 seconds.
Prefix dict has been built succesfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
# Build the development frame from the raw JSON (paths are machine-specific).
dev = generate_dataset(
    name=r'dev_cn.json',
    wd='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
    stop_word_path='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt',
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
# Build the (un-annotated) test frame from the raw JSON (paths are machine-specific).
test = test_preprocess(
    name=r'test_cn.json',
    wd='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
    stop_word_path='C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt',
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




In [10]:
def import_w2v(name, wd):
    """Load a text word-vector file into a ``word / vector / simp`` frame.

    Changes the process working directory to ``wd`` and reads ``name`` with
    ``pandas.read_csv``.  The file's first line ends up as the column name, so
    the single data column is addressed through ``f.columns[0]`` instead of a
    header string that only matches one specific file.  Each line is split at
    its first space into the word and the remaining vector text, and every
    word is additionally converted with OpenCC's ``t2s`` configuration.

    Args:
        name: vector file name, resolved relative to ``wd``.
        wd: working directory to switch to (the change persists after return).

    Returns:
        pandas.DataFrame with columns ``word``, ``vector`` (space-separated
        numbers as one string) and ``simp`` (converted word).
    """
    os.chdir(wd)
    f = pd.read_csv(name, encoding='utf-8')

    header = f.columns[0]
    # ``n`` must be passed by keyword: pandas 2.x rejects it positionally.
    pairs = f[header].str.split(' ', n=1).tolist()
    df = pd.DataFrame(pairs, columns=['word', 'vector'])

    from opencc import OpenCC
    cc = OpenCC('t2s')
    df['simp'] = [cc.convert(word) for word in tqdm(df['word'])]

    return df
# Load the word-vector table once; `name` and `wd` stay bound as notebook
# globals, and import_w2v changes the working directory to `wd`.
name = 'zh_wiki_word2vec_300.txt'
wd = 'C:\\Users\\doudi\\Downloads\\zh_wiki_word2vec_300'
W2V = import_w2v(name, wd)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=614083.0), HTML(value='')))




In [69]:
# Preview the first rows of the loaded embedding table.
W2V.head()

Unnamed: 0,word,vector,simp
0,歐幾里,-0.105163544 -0.03062032 0.022532381 -0.043694...,欧几里
1,得,-0.07968008 0.3307005 -0.13564445 0.065610155 ...,得
2,西元前,0.3526686 0.1318678 -0.22770336 -0.032585304 -...,西元前
3,三世,0.20732123 0.19093925 -0.356956 0.16564949 -0....,三世
4,紀的,-0.11831374 0.6380424 -0.27801472 -0.09356173 ...,纪的


In [65]:
from itertools import chain
# transform word into vector via word2Vec
def trans(dataframe, vec):
    """Add a ``W2V`` column holding the mean word vector of every text.

    Each entry of ``dataframe['texts']`` is split on whitespace.  Tokens that
    occur in ``vec['word']`` contribute their vector (the ``vec['vector']``
    string parsed into floats); the element-wise mean over the matched tokens
    is stored.  Texts without any matched token get an empty list.

    If a word occurs in several rows of ``vec`` the vectors of all those rows
    are concatenated, and the element-wise mean is truncated to the shortest
    contributing vector (``zip`` semantics).

    NOTE(review): tokens are matched against ``vec['word']``; the ``simp``
    column built by ``import_w2v`` is not used here -- confirm this is
    intended.

    Args:
        dataframe: frame with a ``texts`` column of space-separated tokens.
        vec: frame with ``word`` and ``vector`` columns.

    Returns:
        The same ``dataframe`` object with the ``W2V`` column added in place.
    """
    # Index the vocabulary once instead of rebuilding and scanning the whole
    # word list for every token.
    positions = {}
    for pos, word in enumerate(vec['word']):
        positions.setdefault(word, []).append(pos)
    vector_column = vec['vector']

    # Vectors are parsed lazily and cached, so only words that actually
    # occur in the texts are converted to floats.
    parsed = {}

    def lookup(token):
        if token not in parsed:
            pieces = chain.from_iterable(
                vector_column.iloc[pos].split() for pos in positions[token])
            parsed[token] = [float(piece) for piece in pieces]
        return parsed[token]

    tmp_all = []
    for text in tqdm(dataframe['texts']):
        matched = [lookup(token) for token in text.split() if token in positions]
        tmp_all.append([sum(column) / len(column) for column in zip(*matched)])

    dataframe['W2V'] = tmp_all
    return dataframe

In [14]:
# Add the averaged word-vector column to the training frame.
train = trans(dataframe=train, vec=W2V)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=15400.0), HTML(value='')))




In [15]:
# Add the averaged word-vector column to the development frame.
dev = trans(dataframe=dev, vec=W2V)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=1755.0), HTML(value='')))




In [17]:
# train.to_csv('C:\\Users\\doudi\\OneDrive\\Documents\\ntcir15\\Dataset\\New_DialEval-1\\train_w2v_cn.csv', index=False, encoding='utf_8_sig')
# dev.to_csv('C:\\Users\\doudi\\OneDrive\\Documents\\ntcir15\\Dataset\\New_DialEval-1\\dev_w2v_cn.csv', index=False, encoding='utf_8_sig')

In [23]:
# Rebind `train` / `dev` to frames loaded from CSV files on disk, replacing
# the frames built above. The 'Baidu_str' column used by the following cells
# is expected to come from these files.
train = pd.read_csv('C:\\Users\\doudi\\OneDrive\\Documents\\ntcir15\\Dataset\\New_DialEval-1\\train_baidu_cn.csv', encoding='utf_8')
dev = pd.read_csv('C:\\Users\\doudi\\OneDrive\\Documents\\ntcir15\\Dataset\\New_DialEval-1\\dev_baidu_cn.csv', encoding='utf_8')

In [24]:
# Replace missing 'Baidu_str' values with the literal '[]' marker, which
# str_to_list turns into an empty vector.
train['Baidu_str'] = train['Baidu_str'].fillna('[]')
dev['Baidu_str'] = dev['Baidu_str'].fillna('[]')

In [25]:
def str_to_list(dataframe):
    """Parse the ``Baidu_str`` column into lists of floats.

    Every entry other than the literal ``'[]'`` is converted to ``str``,
    split on whitespace and parsed as floats; ``'[]'`` yields an empty list.

    Args:
        dataframe: frame (or mapping) with a ``Baidu_str`` column.

    Returns:
        list[list[float]]: one list per row, in row order.  Each empty result
        is its own list object, so mutating one never affects another row.

    Raises:
        ValueError: if a token cannot be parsed as a float.
    """
    vectors = []
    for raw in dataframe['Baidu_str']:
        if raw != '[]':
            vectors.append([float(token) for token in str(raw).split()])
        else:
            # A fresh list per row; a shared one would alias all empty rows.
            vectors.append([])
    return vectors

In [26]:
# Store the parsed float vectors next to their string form.
train["Baidu"] = str_to_list(train)
dev["Baidu"] = str_to_list(dev)

> Check whether the files are the same

In [83]:
# count = 0
# if output.equals(output_15_tr) == True:
#     print("DialEval-14 training data is as same as DialEval-15 training data")
# else:
#     print("DialEval-14 training data is not as same as DialEval-15 training data")

    
# output.iloc[0] == output_15_tr.iloc[0]

# # print("There are {0} rows in 2 datasets as same".format(count))
# # print("\n")
# # print("There are {0} rows in 2 datasets as different".format((len(output)-count)))

### 5. Modeling

#### 5.1 Define the necessary helper functions

In [27]:
from scipy import stats
import tensorflow as tf

def normalize(pred, truth):
    """Validate two distributions and rescale each so it sums to one.

    Both inputs are converted to ``np.ndarray`` and divided by their own sum.

    Raises:
        ValueError: if the inputs differ in length, are empty, or contain a
            value that is not ``>= 0``.
    """
    if len(pred) != len(truth):
        raise ValueError("pred and truth have different lengths")
    if not len(pred) or not len(truth):
        raise ValueError("pred or truth are empty")

    pred = np.asarray(pred)
    truth = np.asarray(truth)

    # Written as "not all >= 0" so that NaN entries are rejected as well.
    if not (pred >= 0).all() or not (truth >= 0).all():
        raise ValueError("probability distribution should not be negative")

    return pred / pred.sum(), truth / truth.sum()

def jensen_shannon_div(pred, truth, base=2):
    """JSD: Jensen-Shannon Divergence.

    Both inputs are passed through ``normalize``; the result is the average
    of ``stats.entropy(pred, m)`` and ``stats.entropy(truth, m)``, where ``m``
    is the midpoint of the two normalised distributions.

    Args:
        pred: predicted distribution.
        truth: reference distribution.
        base: logarithm base forwarded to ``stats.entropy``.
    """
    pred, truth = normalize(pred, truth)
    midpoint = 1. / 2 * (pred + truth)
    pred_term = stats.entropy(pred, midpoint, base=base)
    truth_term = stats.entropy(truth, midpoint, base=base)
    return (pred_term + truth_term) / 2.

def root_normalized_squared_error(pred, truth):
    """RNSS: Root Normalised Sum of Squares.

    Both inputs are passed through ``normalize``; the result is the square
    root of half the summed squared differences.
    """
    pred, truth = normalize(pred, truth)
    sum_of_squares = ((pred - truth) ** 2).sum()
    return np.sqrt(sum_of_squares / 2)

def jsd_custom_loss(y_true, y_pred):
    """JSD: Jensen-Shannon Divergence, usable as a Keras loss.

    Averages ``tf.keras.losses.KLD(y_pred, m)`` and
    ``tf.keras.losses.KLD(y_true, m)``, where ``m`` is the midpoint of the
    two inputs.  The inputs are used as given; no normalisation is applied
    here.
    """
    kld = tf.keras.losses.KLD
    midpoint = 1. / 2 * (y_pred + y_true)
    return (kld(y_pred, midpoint) + kld(y_true, midpoint)) / 2.
  
# model.compile(loss=jsd_custom_loss, optimizer='adam')

# def rnss_custom_loss(y_true, y_pred):
            
#     # calculate loss, using y_pred
#     """ RNSS: Root Normalised Sum of Squares
#     """

#     def squared_error(y_pred, y_true):
#         return ((y_pred - y_true) ** 2).sum()

# #     y_pred, y_true = normalize(y_pred, y_true)
#     loss = np.sqrt(squared_error(y_pred, y_true) / 2)
    
#     return loss
  

In [28]:
# split from sender
# Separate customer turns from helpdesk turns; each side gets its own model.
train_c = train[train['sender'] == 'customer']
train_h = train[train['sender'] == 'helpdesk']
dev_c = dev[dev['sender'] == 'customer']
dev_h = dev[dev['sender'] == 'helpdesk']

In [29]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras import optimizers
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Input
from keras.layers import Dense, Embedding
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import concatenate, Bidirectional
from keras import initializers
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint

#### 5.2 Features preparation

In [30]:
# === customer
# === customer
# Inputs: the round number and the segmented text of every customer turn.
c_X_train = train_c.filter(['round','texts'])
c_X_test = dev_c.filter(['round','texts'])
    

# Targets: the customer nugget distribution. The column order given here is
# the order of the model's output units.
y_train_c = train_c.filter(['CNUG0','CNUG','CNUG*','CNaN'])
y_test_c = dev_c.filter(['CNUG0','CNUG','CNUG*','CNaN'])

# y_train_h = train.filter(['HNUG','HNUG*','HNaN'])
# y_test_h = dev.filter(['HNUG','HNUG*','HNaN'])

c_X1_train = c_X_train['texts']
# c_X1_train = [str (item) for item in c_X1_train]
c_X1_test = c_X_test['texts']
# c_X1_test = [str (item) for item in c_X1_test]

# Round numbers as a 2-D array (one column).
c_X2_train = c_X_train[['round']].values
c_X2_test = c_X_test[['round']].values

# The tokenizer is fitted on the customer training texts only.
c_token = Tokenizer(num_words = 20000)
c_token.fit_on_texts(c_X1_train)
c_vocab = c_token.word_index
print(c_token.document_count)

# c_X1_train / c_X1_test are rebound from text Series to padded id sequences.
c_x_train_seq = c_token.texts_to_sequences(c_X1_train)
c_x_test_seq = c_token.texts_to_sequences(c_X1_test)
c_X1_train = sequence.pad_sequences(c_x_train_seq, maxlen = 350)
c_X1_test = sequence.pad_sequences(c_x_test_seq, maxlen = 350)

8500


In [31]:
# === helpdesk
# === helpdesk
# Inputs: the round number and the segmented text of every helpdesk turn.
h_X_train = train_h.filter(['round','texts'])
h_X_test = dev_h.filter(['round','texts'])
    

# y_train_c = train_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])
# y_test_c = dev_c.filter(['CNUG','CNUG*','CNUG0','CNaN'])

# Targets: the helpdesk nugget distribution. The column order given here is
# the order of the model's output units.
y_train_h = train_h.filter(['HNUG','HNUG*','HNaN'])
y_test_h = dev_h.filter(['HNUG','HNUG*','HNaN'])

h_X1_train = h_X_train['texts']
# h_X1_train = [str (item) for item in h_X1_train]
h_X1_test = h_X_test['texts']
# h_X1_test = [str (item) for item in h_X1_test]

# Round numbers as a 2-D array (one column).
h_X2_train = h_X_train[['round']].values
h_X2_test = h_X_test[['round']].values

# The tokenizer is fitted on the helpdesk training texts only.
h_token = Tokenizer(num_words = 20000)
h_token.fit_on_texts(h_X1_train)
h_vocab = h_token.word_index
print(h_token.document_count)

# h_X1_train / h_X1_test are rebound from text Series to padded id sequences.
h_x_train_seq = h_token.texts_to_sequences(h_X1_train)
h_x_test_seq = h_token.texts_to_sequences(h_X1_test)
h_X1_train = sequence.pad_sequences(h_x_train_seq, maxlen = 350)
h_X1_test = sequence.pad_sequences(h_x_test_seq, maxlen = 350)

6900


> Word2Vec vector

In [32]:
def to_array(dataframe, length, dim=256):
    """Stack the ``Baidu`` vectors of a frame into a ``(length, dim)`` array.

    Rows whose ``Baidu`` entry equals ``[]`` stay all-zero.  The shape of the
    result is printed before it is returned.

    Args:
        dataframe: frame with a ``Baidu`` column of per-row vectors.
        length: number of rows of the output array.
        dim: number of columns of the output array.
    """
    matrix = np.zeros(shape=(length, dim))
    for row_number, vector in enumerate(dataframe['Baidu']):
        if vector != []:
            matrix[row_number] = vector
    print(matrix.shape)

    return matrix

# Dense sentence-vector matrices, one per sender and split
# (each call prints the resulting shape).
train_vec_c = to_array(train_c, len(train_c))
train_vec_h = to_array(train_h, len(train_h))
dev_vec_c = to_array(dev_c, len(dev_c))
dev_vec_h = to_array(dev_h, len(dev_h))

(8500, 256)
(6900, 256)
(975, 256)
(780, 256)


In [33]:
# Insert a length-1 middle axis: (rows, dim) -> (rows, 1, dim), the input
# shape declared by the LSTM models below.
train_vec_c = train_vec_c[:, np.newaxis, :]
train_vec_h = train_vec_h[:, np.newaxis, :]
dev_vec_c = dev_vec_c[:, np.newaxis, :]
dev_vec_h = dev_vec_h[:, np.newaxis, :]


#### 5.3 textCNN for customer and helpdesk respectively

#### 5.4 LSTM for customer and helpdesk respectively

In [34]:
# === customer
def lstm_C(X1_train, X2_train, X1_test, X2_test, y_train, y_test,
           loss='categorical_crossentropy',
           checkpoint_path="C:\\Users\\doudi\\OneDrive\\Documents\\ntcir15\\Model file\\weights(c).best.hdf5"):
    """Build, train and evaluate the customer BiLSTM model.

    Architecture: a ``(1, 256)`` main input goes through two stacked
    bidirectional LSTMs (128 then 64 units); the result is concatenated with
    a 1-dimensional side input and passed through dense layers of 32, 16 and
    8 units (dropout 0.35 after the first) to a 4-way softmax.

    Args:
        X1_train, X1_test: main inputs, shaped ``(rows, 1, 256)``.
        X2_train, X2_test: side inputs, shaped ``(rows, 1)``.
        y_train, y_test: target distributions with 4 columns.
        loss: Keras loss name or callable.
        checkpoint_path: file that receives the weights of the epoch with the
            best ``val_accuracy``; defaults to the original hard-coded path.

    Returns:
        tuple: ``(model, train_history, pre_probability)`` where
        ``pre_probability`` is the prediction for the test inputs.

    NOTE(review): evaluation and prediction use the weights of the last
    epoch; the best checkpoint is saved but never reloaded here.
    """
    main_input = Input(shape=(1, 256))
    sub_input = Input(shape=(1,))

    lst = Bidirectional(LSTM(units=128, return_sequences=True))(main_input)
    lst2 = Bidirectional(LSTM(units=64))(lst)
    merge = concatenate([lst2, sub_input])
    dense_1 = Dense(units=32, activation='relu')(merge)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=16, activation='relu')(dropout_2)
    dense_3 = Dense(units=8, activation='relu')(dense_2)
    output = Dense(units=4, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself and returns None; wrapping it in print()
    # only added a stray "None" line.
    model.summary()

    # checkpoint
    checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1,
                                 save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=100,
                              batch_size=128, callbacks=callbacks_list, verbose=1,
                              validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

In [35]:
# === helpdesk
def lstm_H(X1_train, X2_train, X1_test, X2_test, y_train, y_test,
           loss='categorical_crossentropy',
           checkpoint_path="C:\\Users\\doudi\\OneDrive\\Documents\\ntcir15\\Model file\\weights(h).best.hdf5"):
    """Build, train and evaluate the helpdesk BiLSTM model.

    Architecture: a ``(1, 256)`` main input goes through two stacked
    bidirectional LSTMs (128 then 64 units); the result is concatenated with
    a 1-dimensional side input and passed through dense layers of 32, 16 and
    8 units (dropout 0.35 after the first) to a 3-way softmax.

    Args:
        X1_train, X1_test: main inputs, shaped ``(rows, 1, 256)``.
        X2_train, X2_test: side inputs, shaped ``(rows, 1)``.
        y_train, y_test: target distributions with 3 columns.
        loss: Keras loss name or callable.
        checkpoint_path: file that receives the weights of the epoch with the
            best ``val_accuracy``; defaults to the original hard-coded path.

    Returns:
        tuple: ``(model, train_history, pre_probability)`` where
        ``pre_probability`` is the prediction for the test inputs.

    NOTE(review): evaluation and prediction use the weights of the last
    epoch; the best checkpoint is saved but never reloaded here.
    """
    main_input = Input(shape=(1, 256))
    sub_input = Input(shape=(1,))

    lst = Bidirectional(LSTM(units=128, return_sequences=True))(main_input)
    lst2 = Bidirectional(LSTM(units=64))(lst)
    merge = concatenate([lst2, sub_input])
    dense_1 = Dense(units=32, activation='relu')(merge)
    dropout_2 = Dropout(0.35)(dense_1)
    dense_2 = Dense(units=16, activation='relu')(dropout_2)
    dense_3 = Dense(units=8, activation='relu')(dense_2)
    output = Dense(units=3, activation='softmax')(dense_3)

    model = Model(inputs=[main_input, sub_input], outputs=output)
    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    # summary() prints by itself and returns None; wrapping it in print()
    # only added a stray "None" line.
    model.summary()

    # checkpoint
    checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1,
                                 save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    train_history = model.fit(x=[X1_train, X2_train], y=y_train, epochs=100,
                              batch_size=128, callbacks=callbacks_list, verbose=1,
                              validation_split=0.2)

    score = model.evaluate(x=[X1_test, X2_test], y=y_test, verbose=1)

    print("Test Score:", score[0])
    print("Test Accuracy:", score[1])

    pre_probability = model.predict(x=[X1_test, X2_test])

    return model, train_history, pre_probability

### 6. Evaluation

#### 6.1 Subsetting training

In [93]:
# CNN_c_model, CNN_c_history, CNN_c_pred = CNN_C(c_X1_train, c_X2_train, c_X1_test, 
#                                                c_X2_test, y_train_c, y_test_c, 
#                                                loss = jsd_custom_loss)

# CNN_h_model, CNN_h_history, CNN_h_pred = CNN_H(h_X1_train, h_X2_train, h_X1_test, 
#                                                h_X2_test, y_train_h, y_test_h, 
#                                                loss = jsd_custom_loss)

In [36]:
# Train the customer model on the sentence vectors with the JSD loss.
lstm_c_model, lstm_c_history, lstm_c_pred = lstm_C(
    X1_train=train_vec_c, X2_train=c_X2_train,
    X1_test=dev_vec_c, X2_test=c_X2_test,
    y_train=y_train_c, y_test=y_test_c,
    loss=jsd_custom_loss,
)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1, 256)       0                                            
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 1, 256)       394240      input_5[0][0]                    
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 128)          164352      bidirectional_5[0][0]            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
____________________________________________________________________________________________

InternalError: GPU sync failed

In [77]:
# Train the helpdesk model on the sentence vectors with the JSD loss.
lstm_h_model, lstm_h_history, lstm_h_pred = lstm_H(
    X1_train=train_vec_h, X2_train=h_X2_train,
    X1_test=dev_vec_h, X2_test=h_X2_test,
    y_train=y_train_h, y_test=y_test_h,
    loss=jsd_custom_loss,
)

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 1, 256)       0                                            
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional (None, 1, 256)       394240      input_11[0][0]                   
__________________________________________________________________________________________________
bidirectional_12 (Bidirectional (None, 128)          164352      bidirectional_11[0][0]           
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 1)            0                                            
____________________________________________________________________________________________


Epoch 00026: val_accuracy did not improve from 0.88478
Epoch 27/100

Epoch 00027: val_accuracy did not improve from 0.88478
Epoch 28/100

Epoch 00028: val_accuracy did not improve from 0.88478
Epoch 29/100

Epoch 00029: val_accuracy did not improve from 0.88478
Epoch 30/100

Epoch 00030: val_accuracy did not improve from 0.88478
Epoch 31/100

Epoch 00031: val_accuracy did not improve from 0.88478
Epoch 32/100

Epoch 00032: val_accuracy did not improve from 0.88478
Epoch 33/100

Epoch 00033: val_accuracy did not improve from 0.88478
Epoch 34/100

Epoch 00034: val_accuracy did not improve from 0.88478
Epoch 35/100

Epoch 00035: val_accuracy did not improve from 0.88478
Epoch 36/100

Epoch 00036: val_accuracy did not improve from 0.88478
Epoch 37/100

Epoch 00037: val_accuracy did not improve from 0.88478
Epoch 38/100

Epoch 00038: val_accuracy did not improve from 0.88478
Epoch 39/100

Epoch 00039: val_accuracy did not improve from 0.88478
Epoch 40/100

Epoch 00040: val_accuracy did not

#### 6.2 Output by rbind

In [30]:
# predict single row
def padding_single_c(dev):
    """Turn one customer row into model inputs via the customer tokenizer.

    The row's ``texts`` entry is converted with the module-level ``c_token``
    and padded to length 350; the ``round`` entry is returned as an array.

    Returns:
        tuple: ``(padded_token_ids, round_values)``.
    """
    selected = dev.filter(['round', 'texts'])
    text = selected['texts']
    round_values = selected[['round']].values

    token_ids = c_token.texts_to_sequences([text])
    padded = sequence.pad_sequences(token_ids, maxlen=350)

    return padded, round_values

def padding_single_h(dev):
    """Turn one helpdesk row into model inputs via the helpdesk tokenizer.

    The row's ``texts`` entry is converted with the module-level ``h_token``
    and padded to length 350; the ``round`` entry is returned as an array.

    Returns:
        tuple: ``(padded_token_ids, round_values)``.
    """
    selected = dev.filter(['round', 'texts'])
    text = selected['texts']
    round_values = selected[['round']].values

    token_ids = h_token.texts_to_sequences([text])
    padded = sequence.pad_sequences(token_ids, maxlen=350)

    return padded, round_values

> word2Vec

In [42]:
# Inspect a single development row (position 23).
dev.iloc[23]

id                           4210023888164273
sender                               customer
sender_num                                  0
texts         990007189146746 990007189146753
round                                       3
CNUG0                                       0
CNUG                                 0.947368
CNUG*                                       0
CNaN                                0.0526316
HNUG                                        0
HNUG*                                       0
HNaN                                        0
W2V                                        []
W2V_str                                    []
Baidu                                   [nan]
Baidu_str                                 NaN
Name: 23, dtype: object

In [65]:
def to_array_2(dataframe, length=1, dim=256):
    """Return the ``Baidu`` vector of one row as a ``(length, dim)`` array.

    If the ``Baidu`` entry equals ``[]`` an all-zero array is returned.
    The output shape follows ``length`` and ``dim`` instead of a fixed
    ``(1, 256)``, so the function also works for other vector sizes; with
    the defaults the result is unchanged.

    Args:
        dataframe: a single row (any mapping with a ``Baidu`` entry).
        length: number of rows of the output.
        dim: vector dimensionality.

    Raises:
        ValueError: if the vector cannot be reshaped to ``(length, dim)``.
    """
    train_vec = np.zeros(shape=(length, dim))
    if dataframe['Baidu'] != []:
        train_vec = np.array(dataframe['Baidu'])

    train_vec = train_vec.reshape(length, dim)

    return np.array(train_vec)

def padding_single_w2v(dev):
    """Build the embedding-model inputs for a single row.

    Parameters
    ----------
    dev : one row (e.g. ``dataframe.iloc[i]``) that carries at least the
        ``'round'`` and ``'Baidu'`` entries.

    Returns
    -------
    tuple
        ``(embedding, rounds)`` where ``embedding`` is the result of
        ``to_array_2`` with a length-1 axis inserted in the middle, and
        ``rounds`` is the ``.values`` of the ``['round']`` selection.
    """
    row = dev.filter(['round', 'Baidu'])
    flat = to_array_2(row)

    # (n, width) -> (n, 1, width)
    n_rows, width = flat.shape[:2]
    embedding = np.reshape(flat, (n_rows, 1, width))

    rounds = row[['round']].values
    return embedding, rounds

In [33]:
from itertools import chain

# Input: the development dataframe and the two trained models.
# For the current models (loss = jsd):
# model_1 = customer model (CNN_c_model, lstm_c_model)
# model_2 = helpdesk model (CNN_h_model, lstm_h_model)
def Generate_submission(dev, model_1, model_2):
    """Predict a nugget distribution for every row and group them by id.

    Rows are visited once, by position, so the result does not depend on the
    dataframe having a default 0..n-1 index. The progress bar counts rows.

    Parameters
    ----------
    dev : dataframe with an ``'id'`` column, the sender in its second column
        (read positionally), and the entries needed by ``padding_single_c``
        and ``padding_single_h``.
    model_1 : model used for rows whose sender equals ``'customer'``; its
        ``predict`` output is zipped with the four customer labels.
    model_2 : model used for every other row; its ``predict`` output is
        zipped with the three helpdesk labels.

    Returns
    -------
    list of dict
        One ``{'nugget': [...], 'id': Id}`` entry per unique id, in order of
        first appearance; ``'nugget'`` holds one label -> probability dict per
        row of that id, in row order.
    """
    C_nugget = ['CNUG', 'CNUG*', 'CNUG0', 'CNaN']
    H_nugget = ['HNUG', 'HNUG*', 'HNaN']

    # One bucket per id, created in order of first appearance.
    Id_list = list(dev['id'].unique())
    buckets = {Id: [] for Id in Id_list}

    ids = dev['id'].values
    senders = dev.iloc[:, 1].values  # second column, read by position

    for pos in tqdm(range(len(dev))):
        bucket = buckets.get(ids[pos])
        if bucket is None:
            # Only an id that is not equal to itself (NaN) ends up here; such
            # a row matches no id and is left out.
            continue

        row = dev.iloc[pos]
        if senders[pos] == 'customer':
            features, rounds = padding_single_c(row)
            model, labels = model_1, C_nugget
        else:
            features, rounds = padding_single_h(row)
            model, labels = model_2, H_nugget

        rounds = np.array(rounds).reshape(1, 1)
        probs = model.predict(x=[features, rounds]).tolist()
        # Flatten the nested prediction list and pair it with the labels.
        bucket.append(dict(zip(labels, chain.from_iterable(probs))))

    # Submission form
    return [{'nugget': buckets[Id], 'id': Id} for Id in Id_list]

> Word2Vec

In [66]:
def Generate_submission_w2v(dev, model_1, model_2):
    """Predict a nugget distribution for every row from its embedding.

    Rows are visited once, by position, so the result does not depend on the
    dataframe having a default 0..n-1 index. The progress bar counts rows.

    Parameters
    ----------
    dev : dataframe with an ``'id'`` column, the sender in its second column
        (read positionally), and the entries needed by
        ``padding_single_w2v``. NOTE: its ``'id'`` column is converted to
        strings in place.
    model_1 : model used for rows whose sender equals ``'customer'``; its
        ``predict`` output is zipped with the four customer labels.
    model_2 : model used for every other row; its ``predict`` output is
        zipped with the three helpdesk labels.

    Returns
    -------
    list of dict
        One ``{'nugget': [...], 'id': Id}`` entry per unique id (as a string),
        in order of first appearance; ``'nugget'`` holds one
        label -> probability dict per row of that id, in row order.
    """
    # Kept as an in-place conversion: the string ids end up in the result.
    dev['id'] = dev['id'].apply(str)

    C_nugget = ['CNUG', 'CNUG*', 'CNUG0', 'CNaN']
    H_nugget = ['HNUG', 'HNUG*', 'HNaN']

    # One bucket per id, created in order of first appearance.
    Id_list = list(dev['id'].unique())
    buckets = {Id: [] for Id in Id_list}

    ids = dev['id'].values
    senders = dev.iloc[:, 1].values  # second column, read by position

    for pos in tqdm(range(len(dev))):
        # Both senders share the same input preparation; only the model and
        # the label set differ.
        if senders[pos] == 'customer':
            model, labels = model_1, C_nugget
        else:
            model, labels = model_2, H_nugget

        features, rounds = padding_single_w2v(dev.iloc[pos])
        rounds = np.array(rounds).reshape(1, 1)
        probs = model.predict(x=[features, rounds]).tolist()
        # Flatten the nested prediction list and pair it with the labels.
        buckets[ids[pos]].append(dict(zip(labels, chain.from_iterable(probs))))

    # Submission form
    return [{'nugget': buckets[Id], 'id': Id} for Id in Id_list]

In [78]:
# Build the submission entries for the dev set. The text-based call is kept
# commented out for reference; the Word2Vec variant is the one that runs.
# final = Generate_submission(dev, lstm_c_model, lstm_h_model)
final = Generate_submission_w2v(dev, lstm_c_model, lstm_h_model)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




In [68]:
# Show the first entry to check the structure of the submission.
final[0]

{'nugget': [{'CNUG': 0.04307155683636665,
   'CNUG*': 0.0002533532679080963,
   'CNUG0': 0.2612592279911041,
   'CNaN': 0.6954159140586853},
  {'HNUG': 0.18841741979122162,
   'HNUG*': 0.5632498264312744,
   'HNaN': 0.24833270907402039},
  {'CNUG': 0.3754049837589264,
   'CNUG*': 0.0036986295599490404,
   'CNUG0': 1.2190609595563728e-05,
   'CNaN': 0.6208842396736145},
  {'HNUG': 0.2653927803039551,
   'HNUG*': 0.0014027048600837588,
   'HNaN': 0.7332044839859009}],
 'id': '4227729258237823'}

### 7. Generate the submission estimation JSON

In [79]:
import json
import time
import os

# Switch the working directory to the evaluation folder, so files opened with
# a relative name are created there. NOTE: the path is machine-specific.
path = 'C:/Users/doudi/OneDrive/Documents/ntcir15/eval'
os.chdir(path)
# Minute-resolution timestamp (YYYYmmddHHMM), used as the output file prefix.
timestr = time.strftime("%Y%m%d%H%M")

In [80]:
# Write the submission as pretty-printed UTF-8 JSON, named "<timestr>_dev_eval.json".
with open('_'.join((timestr, 'dev_eval.json')), 'w', encoding='utf-8') as f:
    json.dump(final, f, ensure_ascii=False, indent=2)

In [81]:
import os
# Run the evaluation script from the evaluation folder.
os.chdir('C:\\Users\\doudi\\OneDrive\\Documents\\ntcir15\\eval')
# NOTE(review): the submission file name below is a fixed literal, while the
# file written earlier is named from `timestr`; update it to match the file
# that was just written before running this cell.
!python eval.py 202007101244_dev_eval.json dev_cn.json

{'quality': None, 'nugget': {'jsd': 4.663028387268745, 'rnss': 2.9577512461942694}}


In [82]:
# 2 ** (-x) for the 'jsd' value printed by eval.py above; presumably undoes a
# -log2 transform applied by the script -- TODO confirm against eval.py.
2**(-4.663028387268745)

0.03947195025302833

In [83]:
# 2 ** (-x) for the 'rnss' value printed by eval.py above; presumably undoes a
# -log2 transform applied by the script -- TODO confirm against eval.py.
2**(-2.9577512461942694)

0.1287147018909632