# Preprocessing file

In [1]:
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import numpy as np
from collections import Counter
from sklearn import preprocessing
import jieba
import re

### 1.JSON to dataframe function

In [2]:
def get_nugget(dataframe):   
    target = {'nugget'}
    dicts = []
    for item in tqdm(dataframe['annotations']):
        sub_dicts = []
        for element in item:
            sub_dicts.append({key:value for key,value in element.items() if key in target}['nugget'])
        dicts.append(sub_dicts)

    dataframe['nuggets'] = dicts # dis is for anno nugget list
    
    return dataframe

def shaping(dataframe):
    length = []
    for i in tqdm(dataframe['turns']):
        length.append(len(i))
    Id = dataframe['id'].tolist()

    Fin_Id = sum([[s] * n for s, n in zip(Id, length)], [])

    turns_list = dataframe['turns'].tolist()
    
    Fin_turns_anno = []
    for x,y in tqdm(zip(turns_list,dataframe['nuggets'])):
        for q in range(len(x)):
            Fin_turns_anno.append(list(x[q].values())+[i[q] for i in y])
    
    return Fin_Id, Fin_turns_anno

def stacking(Fin_Id, Fin_turns_anno):    
    train_clean = pd.DataFrame({'id': Fin_Id,'info': Fin_turns_anno})
    # train_clean.head()
    train_df = pd.DataFrame(train_clean['info'].values.tolist(), columns=['sender','utterance','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14','n15','n16','n17','n18','n19'])
    train_df['id'] = train_clean['id']
    train_df = train_df[['id','sender','utterance','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14','n15','n16','n17','n18','n19']]
    
    return train_df

### 2.Dataset cleaning

In [3]:
def process_data(dataframe):
    
    # id to str
    dataframe['id'] = dataframe['id'].apply(str)
    
    
    # round
    uni = dataframe.id.unique()
    num = []
    for i in uni:
        count = 1
        for j in dataframe['id']:
            if i == j:
                num.append(count)
                count += 1
            else:
                continue

    dataframe['round'] = num
    
    
    # distribution
    Nugget_types = ['CNUG0', 'CNUG', 'CNUG*', 'CNaN','HNUG', 'HNUG*', 'HNaN']
    arr = np.array(dataframe.iloc[:,3:22]) 
    dicts = []
    tmp = []
    
    for i in arr:
        c = Counter(i)
        dicts.append(c)
        
    for i in dicts:
        test = []
        for n in Nugget_types:
            test.append(i.get(n,0)/19)
        tmp.append(test)
        
    tmp = np.array(tmp)
    for i in range(len(Nugget_types)):
        dataframe[Nugget_types[i]] = tmp[:,i]
        
        
    # round_max (round_label)
    f = dataframe.groupby('round').sum()
    out = list(f.idxmax(axis=1))

    round_max = []
    for i in dataframe['round']:
        for j in range(1,8):
            if i == j:
                round_max.append(out[j-1])
            else:
                continue
    dataframe['round_max'] = round_max
    
    
    # label encoding (round_label)
    
    le = preprocessing.LabelEncoder()
    le.fit(Nugget_types);
    round_label = le.transform(list(dataframe['round_max']))
    dataframe['round_label'] = round_label
    
    
    # label encoding (sender_num)
    sender = ['customer','helpdesk']
    l = preprocessing.LabelEncoder()
    l.fit(sender);
    sender_num = l.transform(list(dataframe['sender']))
    dataframe['sender_num'] = sender_num
    
    subset = dataframe[['id','sender','sender_num','utterance','round','round_max','round_label',
                        'CNUG0', 'CNUG', 'CNUG*', 'CNaN','HNUG', 'HNUG*', 'HNaN']]
    
    return subset

### 3.Segmentation

In [4]:
def segment(dataframe, file_path):
    
    texts = dataframe['utterance'].astype(str)
    
    seg_texts = []
    for line in texts:
        seg_content = ' '.join(jieba.cut(line, cut_all = False))
        seg_texts.append(seg_content)
        
    def remove_punctuation(line):
        rule = re.compile("[^a-zA-Z0-9\u4e00-\u9fa5]")
        line = rule.sub(' ',line)
        return line
    
    texts = []
    for line in seg_texts:
        new_line = remove_punctuation(line).split()
        texts.append(new_line)
        
    cn_stopwords = []
    with open(file_path, 'r', encoding='UTF-8') as file:
        for data in file.read().splitlines():
            cn_stopwords.append(data)
            
    # remove punctuation
    pp_texts = []
    for line in texts:
        line_noSW = []
        for word in line:
            if word not in cn_stopwords:
                line_noSW.append(word)
        pp_texts.append(line_noSW)
    
    # change emoji in pp_texts to *
    for line in pp_texts:
        if line == []:
            line.append("*")
            
    # concatenate the sentences by whitespace
    new_texts = []
    for sentence in pp_texts:
        series_sentence = " ".join(word for word in sentence)
        new_texts.append(series_sentence)
    
    dataframe['texts'] = new_texts
    
    subset = dataframe[['id','sender','sender_num','texts','round','round_max','round_label',
                        'CNUG0', 'CNUG', 'CNUG*', 'CNaN','HNUG', 'HNUG*', 'HNaN']]
    
    return subset

### 4.Combine all function

In [5]:
def generate_dataset(name, wd, stop_word_path):
    os.chdir(wd)
    file = pd.read_json(name, encoding='utf8')
    nu = get_nugget(file)
    Id, anno = shaping(nu)
    output = stacking(Id, anno)
    fin = process_data(output)
    seg = segment(fin, stop_word_path)
    
    return seg

### 5.Generate the result
###### Plz feed the raw_json, working directory and stop_word file in the generate_dataset()

In [7]:
output = generate_dataset(r'train_data_cn.json',
                          'C:/Users/doudi/OneDrive/Documents/stc3-dataset/data',
                         'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt')
output.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,sender,sender_num,texts,round,round_max,round_label,CNUG0,CNUG,CNUG*,CNaN,HNUG,HNUG*,HNaN
0,3830401296796826,customer,0,中国电信 控制箱 没有 维护 信息安全 人身安全 保障 好好 修修 应该 差钱 位于 济南市...,1,CNUG0,2,0.736842,0.052632,0.0,0.210526,0.0,0.0,0.0
1,3830401296796826,helpdesk,1,您好 反映 情况 认真 记录 会 及时 相关 部门 反馈 敬请 等待,2,HNUG,4,0.0,0.0,0.0,0.0,0.473684,0.0,0.526316
2,3830772740080373,customer,0,电信服务 广州市 白云区 太和 镇 谢家 庄 二队 电信 信号 差 投诉 几年 没人 处理 ...,1,CNUG0,2,0.736842,0.0,0.0,0.263158,0.0,0.0,0.0
3,3830772740080373,helpdesk,1,您好 中国电信 广东 客服 关注 反映 问题 请 详细描述 一下 问题 处理 谢谢,2,HNUG,4,0.0,0.0,0.0,0.0,0.526316,0.052632,0.421053
4,3830772740080373,customer,0,图片 知道 起码 通信 信号 没有 更 3G 上网 信号 一年 前 两年 前 第一次 电话 ...,3,CNUG,0,0.052632,0.736842,0.0,0.210526,0.0,0.0,0.0


In [11]:
# save file
# output.to_csv('C:/Users/doudi/OneDrive/Documents/stc3-dataset/data/train_data_cn.csv', 
#               index=False, encoding='utf_8_sig')

In [8]:
output_15_tr = generate_dataset(r'train_cn.json',
                          'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
                         'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt')
output_15_tr.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=3700.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,sender,sender_num,texts,round,round_max,round_label,CNUG0,CNUG,CNUG*,CNaN,HNUG,HNUG*,HNaN
0,3830401296796826,customer,0,中国电信 控制箱 没有 维护 信息安全 人身安全 保障 好好 修修 应该 差钱 位于 济南市...,1,CNUG0,2,0.736842,0.052632,0.0,0.210526,0.0,0.0,0.0
1,3830401296796826,helpdesk,1,您好 反映 情况 认真 记录 会 及时 相关 部门 反馈 敬请 等待,2,HNUG,4,0.0,0.0,0.0,0.0,0.473684,0.0,0.526316
2,3830772740080373,customer,0,电信服务 广州市 白云区 太和 镇 谢家 庄 二队 电信 信号 差 投诉 几年 没人 处理 ...,1,CNUG0,2,0.736842,0.0,0.0,0.263158,0.0,0.0,0.0
3,3830772740080373,helpdesk,1,您好 中国电信 广东 客服 关注 反映 问题 请 详细描述 一下 问题 处理 谢谢,2,HNUG,4,0.0,0.0,0.0,0.0,0.526316,0.052632,0.421053
4,3830772740080373,customer,0,图片 知道 起码 通信 信号 没有 更 3G 上网 信号 一年 前 两年 前 第一次 电话 ...,3,CNUG,0,0.052632,0.736842,0.0,0.210526,0.0,0.0,0.0


In [9]:
output_15_te = generate_dataset(r'dev_cn.json',
                          'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/New_DialEval-1',
                         'C:/Users/doudi/OneDrive/Documents/ntcir15/Dataset/DialEval-1/cn_stopwords.txt')
output_15_te.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=390.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,sender,sender_num,texts,round,round_max,round_label,CNUG0,CNUG,CNUG*,CNaN,HNUG,HNUG*,HNaN
0,4227729258237823,customer,0,内涵 段子 联通 皮 点赞 中国联通 中国联通 客服 掌上 营业厅 内涵 段子 话题 封 郑...,1,CNUG0,2,0.157895,0.052632,0.0,0.789474,0.0,0.0,0.0
1,4227729258237823,helpdesk,1,u,2,HNUG,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105
2,4227729258237823,customer,0,夸夸,3,CNUG,0,0.0,0.157895,0.0,0.842105,0.0,0.0,0.0
3,4227729258237823,helpdesk,1,*,4,HNUG,4,0.0,0.0,0.0,0.0,0.157895,0.0,0.842105
4,4121001149457182,customer,0,距离 反映 问题 已经 一个 星期 花粉 助手 D 荣耀 honor 荣耀 手机 华为 终端...,1,CNUG0,2,0.789474,0.052632,0.0,0.157895,0.0,0.0,0.0


> See if the file are same 

In [25]:
# count = 0
# if output.equals(output_15_tr) == True:
#     print("DialEval-14 training data is as same as DialEval-15 training data")
# else:
#     print("DialEval-14 training data is not as same as DialEval-15 training data")

    
# output.iloc[0] == output_15_tr.iloc[0]

# # print("There are {0} rows in 2 datasets as same".format(count))
# # print("\n")
# # print("There are {0} rows in 2 datasets as different".format((len(output)-count)))

DialEval-14 training data is as same as DialEval-15 training data


id             True
sender         True
sender_num     True
texts          True
round          True
round_max      True
round_label    True
CNUG0          True
CNUG           True
CNUG*          True
CNaN           True
HNUG           True
HNUG*          True
HNaN           True
Name: 0, dtype: bool