In [12]:
import numpy as np
import pandas as pd
import collections
import jieba
import keras

In [13]:
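# Download locations of the datasets, kept for reference only.
# Presumably these are the sources of the local train.json / dev.json files
# that the notebook loads from disk - TODO confirm.
# train_data_url = "https://worksheets.codalab.org/rest/bundles/0x0161fd2fb40d4dd48541c2643d04b0b8/contents/blob/"
# test_data_url = "https://worksheets.codalab.org/rest/bundles/0x1f96bc12222641209ad057e762910252/contents/blob/"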

In [13]:
def get_json_data(path):
    """Load a JSON dataset and return it as a two-column DataFrame.

    The file is parsed with ``pd.read_json`` and the result is transposed,
    so what ``read_json`` produced as columns becomes the rows.  Only the
    ``query`` and ``label`` columns are kept, in that order.

    Args:
        path: Location of the JSON file; passed straight to ``pd.read_json``.

    Returns:
        A DataFrame whose columns are exactly ``['query', 'label']``.

    Raises:
        KeyError: If ``query`` or ``label`` is missing after the transpose.
    """
    # Swap the axes so that each sample is a row and each field a column.
    records = pd.read_json(path).T
    # Keep just the two fields of interest: the text first, its label second.
    return records[['query', 'label']]

In [14]:
# Load the training set and the dev set from local JSON files.
# The dev set is what the rest of the notebook refers to as the test data.
train_data_df = get_json_data(path="train.json")
test_data_df = get_json_data(path="dev.json")

# Peek at the first training rows.
train_data_df.head()

Unnamed: 0,query,label
0,今天东莞天气如何,weather
1,从观音桥到重庆市图书馆怎么走,map
2,鸭蛋怎么腌？,cookbook
3,怎么治疗牛皮癣,health
4,唠什么,chat


In [15]:
# Peek at the first rows of the dev/test frame.
test_data_df.head()

Unnamed: 0,query,label
0,毛泽东的诗哦。,poetry
1,有房有车吗微笑,chat
2,2013年亚洲冠军联赛恒广州恒大比赛时间。,match
3,若相惜不弃下一句是什么？,poetry
4,苹果翻译成英语,translation


In [16]:
# Summary of the training frame: row count, number of distinct values,
# most frequent value and its frequency for each column.
train_data_df.describe()

Unnamed: 0,query,label
count,2299,2299
unique,2299,31
top,翻译慷慨激昂,chat
freq,1,455


In [17]:
# Same summary for the dev/test frame: row count, number of distinct values,
# most frequent value and its frequency for each column.
test_data_df.describe()

Unnamed: 0,query,label
count,770,770
unique,770,31
top,帮我链接到新浪网,chat
freq,1,154


In [21]:
# All labels, i.e. the categories the classifier has to choose between.
# They could also be collected from the data with
#     list(set(train_data_df['label'].tolist()))
# but a set has no guaranteed order, so the list is spelled out here
# (presumably to keep each label's position stable between runs).
labels = [
    'website', 'tvchannel', 'lottery', 'chat',
    'match', 'datetime', 'weather', 'bus',
    'novel', 'video', 'riddle', 'calc',
    'telephone', 'health', 'contacts', 'epg',
    'app', 'music', 'cookbook', 'stock',
    'map', 'message', 'poetry', 'cinemas',
    'news', 'flight', 'translation', 'train',
    'schedule', 'radio', 'email',
]

# Number of categories.
label_numbers = len(labels)
label_numbers

31

In [None]:
# Lookup tables between label names and their integer IDs
# (an ID is the label's position in `labels`).
label_index_dict = {name: idx for idx, name in enumerate(labels)}
index_label_dict = dict(enumerate(labels))

In [23]:
# Jieba word segmentation, used to pre-process the raw query text.
# This cell is a quick check of the tokenizer on a sample sentence.

seg_list = jieba.cut("他来到了网易杭研大厦")  # default (accurate) mode
# Materialise the result with list() so the tokens are shown as the cell output.
list(seg_list)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\10926\AppData\Local\Temp\jieba.cache
Loading model cost 0.820 seconds.
Prefix dict has been built successfully.


['他', '来到', '了', '网易', '杭研', '大厦']

In [26]:
# Tokenisation: turn each query string into a sequence of words.

def use_jieba_cut(one_sentence):
    """Segment one sentence into a list of word tokens using jieba.

    Args:
        one_sentence: The text to segment.

    Returns:
        A list of the tokens, in the order they occur in the sentence.
    """
    # lcut is jieba's list-returning counterpart of cut.
    return jieba.lcut(one_sentence)

# Add a `cut_query` column holding the token list of every query.
# Note: this writes the new column into the existing frames in place.
train_data_df['cut_query'] = train_data_df['query'].apply(use_jieba_cut)
test_data_df['cut_query'] = test_data_df['query'].apply(use_jieba_cut)

# Show the first ten training rows together with their tokens.
train_data_df.head(10)

Unnamed: 0,query,label,cut_query
0,今天东莞天气如何,weather,"[今天, 东莞, 天气, 如何]"
1,从观音桥到重庆市图书馆怎么走,map,"[从, 观音桥, 到, 重庆市, 图书馆, 怎么, 走]"
2,鸭蛋怎么腌？,cookbook,"[鸭蛋, 怎么, 腌, ？]"
3,怎么治疗牛皮癣,health,"[怎么, 治疗, 牛皮癣]"
4,唠什么,chat,"[唠, 什么]"
5,阳澄湖大闸蟹的做法。,cookbook,"[阳澄湖, 大闸蟹, 的, 做法, 。]"
6,昆山大润发在哪里,map,"[昆山, 大润发, 在, 哪里]"
7,红烧肉怎么做？嗯？,cookbook,"[红烧肉, 怎么, 做, ？, 嗯, ？]"
8,南京到厦门的火车票,train,"[南京, 到, 厦门, 的, 火车票]"
9,6的平方,calc,"[6, 的, 平方]"


In [27]:
# Show the first ten dev/test rows together with their tokens.
test_data_df.head(10)

Unnamed: 0,query,label,cut_query
0,毛泽东的诗哦。,poetry,"[毛泽东, 的, 诗, 哦, 。]"
1,有房有车吗微笑,chat,"[有房, 有车, 吗, 微笑]"
2,2013年亚洲冠军联赛恒广州恒大比赛时间。,match,"[2013, 年, 亚洲, 冠军联赛, 恒, 广州, 恒大, 比赛, 时间, 。]"
3,若相惜不弃下一句是什么？,poetry,"[若, 相惜, 不弃, 下, 一句, 是, 什么, ？]"
4,苹果翻译成英语,translation,"[苹果, 翻译成, 英语]"
5,翻译光大银行,translation,"[翻译, 光大银行]"
6,哪天的？,chat,"[哪天, 的, ？]"
7,无锡到阜阳怎么坐汽车？,bus,"[无锡, 到, 阜阳, 怎么, 坐, 汽车, ？]"
8,孜然排骨怎么做？,cookbook,"[孜然, 排骨, 怎么, 做, ？]"
9,娃咋么杨你因为呵呵演完呀不到你,chat,"[娃, 咋, 么, 杨, 你, 因为, 呵呵, 演完, 呀, 不到, 你]"


In [31]:
# Collect every word that occurs in the data.

def get_all_vocabulary(data, colunm_name):
    """Flatten a column of token lists and measure the longest one.

    Args:
        data: Container indexable by ``colunm_name`` (for example a
            DataFrame) whose selected column iterates over token lists.
        colunm_name: Key of the column that holds the token lists.

    Returns:
        A ``(tokens, longest)`` tuple.  ``tokens`` is a single list with the
        tokens of every row concatenated in row order; repeated tokens are
        kept, not de-duplicated.  ``longest`` is the number of tokens in the
        longest row, or 0 when there are no rows.
    """
    token_rows = list(data[colunm_name])
    # Concatenate the per-row token lists into one flat list.
    all_tokens = [token for row in token_rows for token in row]
    # default=0 covers the case of an empty column.
    longest_row = max((len(row) for row in token_rows), default=0)
    return all_tokens, longest_row

In [32]:
# Collect every token of the training set and the token count of its longest query.
# The printed figure is the total number of tokens: repeated words are counted each time.
train_vocabulary_list, max_cut_query = get_all_vocabulary(train_data_df, 'cut_query')
print('Number of words:',len(train_vocabulary_list))

Number of words: 11498
