In [273]:
import numpy as np
import pandas as pd
import collections
import jieba
from keras.preprocessing.sequence import pad_sequences  #序列预处理 序列填充
from keras.utils import to_categorical,plot_model   # 将类别向量转换为二进制（只有0和1）的矩阵类型表示
from keras.models import Sequential   # 序贯模型是函数式模型的简略版，为最简单的线性、从头到尾的结构顺序，不分叉，是多个网络层的线性堆叠
from keras.layers import Embedding, LSTM, Dense
from keras import backend as K
from keras.callbacks import TensorBoard, Callback   # TensorBoard是一个可视化工具，它可以用来展示网络图、张量的指标变化、张量的分布情况等
import time
from sklearn.metrics import classification_report


In [274]:
# train_data_url = "https://worksheets.codalab.org/rest/bundles/0x0161fd2fb40d4dd48541c2643d04b0b8/contents/blob/"
# test_data_url = "https://worksheets.codalab.org/rest/bundles/0x1f96bc12222641209ad057e762910252/contents/blob/"

In [275]:
def get_json_data(path):
    # read data
    data_df = pd.read_json(path)
    # change row and colunm
    data_df = data_df.transpose()
    # change colunm order
    data_df = data_df[['query', 'label']]
    return data_df

In [276]:
train_data_df = get_json_data(path="train.json")
test_data_df = get_json_data(path="dev.json")

train_data_df.head()

Unnamed: 0,query,label
0,今天东莞天气如何,weather
1,从观音桥到重庆市图书馆怎么走,map
2,鸭蛋怎么腌？,cookbook
3,怎么治疗牛皮癣,health
4,唠什么,chat


In [277]:
test_data_df.head()

Unnamed: 0,query,label
0,毛泽东的诗哦。,poetry
1,有房有车吗微笑,chat
2,2013年亚洲冠军联赛恒广州恒大比赛时间。,match
3,若相惜不弃下一句是什么？,poetry
4,苹果翻译成英语,translation


In [278]:
train_data_df.describe()

Unnamed: 0,query,label
count,2299,2299
unique,2299,31
top,翻译慷慨激昂,chat
freq,1,455


In [279]:
test_data_df.describe()

Unnamed: 0,query,label
count,770,770
unique,770,31
top,帮我链接到新浪网,chat
freq,1,154


In [280]:
# 获取所有标签，也就是分类的类别
# lables = list(set(train_data_df['label'].tolist()))
# 所有标签
labels = ['website', 'tvchannel', 'lottery', 'chat', 'match',
          'datetime', 'weather', 'bus', 'novel', 'video', 'riddle',
          'calc', 'telephone', 'health', 'contacts', 'epg', 'app', 'music',
          'cookbook', 'stock', 'map', 'message', 'poetry', 'cinemas', 'news',
          'flight', 'translation', 'train', 'schedule', 'radio', 'email']

label_numbers = len(labels)
label_numbers

31

In [281]:
# 标签和对应ID的映射字典
label_index_dict = dict([(label, index) for index, label in enumerate(labels)])
index_label_dict = dict([(index, label) for index, label in enumerate(labels)])

In [282]:
# 结巴分词 对元数据进行处理

seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认精确模式
list(seg_list)

['他', '来到', '了', '网易', '杭研', '大厦']

In [283]:
# 序列化

def use_jieba_cut(one_sentence):
    return list(jieba.cut(one_sentence))

train_data_df['cut_query'] = train_data_df['query'].apply(use_jieba_cut)
test_data_df['cut_query'] = test_data_df['query'].apply(use_jieba_cut)

train_data_df.head(10)

Unnamed: 0,query,label,cut_query
0,今天东莞天气如何,weather,"[今天, 东莞, 天气, 如何]"
1,从观音桥到重庆市图书馆怎么走,map,"[从, 观音桥, 到, 重庆市, 图书馆, 怎么, 走]"
2,鸭蛋怎么腌？,cookbook,"[鸭蛋, 怎么, 腌, ？]"
3,怎么治疗牛皮癣,health,"[怎么, 治疗, 牛皮癣]"
4,唠什么,chat,"[唠, 什么]"
5,阳澄湖大闸蟹的做法。,cookbook,"[阳澄湖, 大闸蟹, 的, 做法, 。]"
6,昆山大润发在哪里,map,"[昆山, 大润发, 在, 哪里]"
7,红烧肉怎么做？嗯？,cookbook,"[红烧肉, 怎么, 做, ？, 嗯, ？]"
8,南京到厦门的火车票,train,"[南京, 到, 厦门, 的, 火车票]"
9,6的平方,calc,"[6, 的, 平方]"


In [284]:
test_data_df.head(10)

Unnamed: 0,query,label,cut_query
0,毛泽东的诗哦。,poetry,"[毛泽东, 的, 诗, 哦, 。]"
1,有房有车吗微笑,chat,"[有房, 有车, 吗, 微笑]"
2,2013年亚洲冠军联赛恒广州恒大比赛时间。,match,"[2013, 年, 亚洲, 冠军联赛, 恒, 广州, 恒大, 比赛, 时间, 。]"
3,若相惜不弃下一句是什么？,poetry,"[若, 相惜, 不弃, 下, 一句, 是, 什么, ？]"
4,苹果翻译成英语,translation,"[苹果, 翻译成, 英语]"
5,翻译光大银行,translation,"[翻译, 光大银行]"
6,哪天的？,chat,"[哪天, 的, ？]"
7,无锡到阜阳怎么坐汽车？,bus,"[无锡, 到, 阜阳, 怎么, 坐, 汽车, ？]"
8,孜然排骨怎么做？,cookbook,"[孜然, 排骨, 怎么, 做, ？]"
9,娃咋么杨你因为呵呵演完呀不到你,chat,"[娃, 咋, 么, 杨, 你, 因为, 呵呵, 演完, 呀, 不到, 你]"


In [285]:
# 获取数据的所有词汇

def get_all_vocabulary(data, colunm_name):
    train_vocabulary_list = []
    max_cut_query_lenth = 0
    
    for cut_query in data[colunm_name]:
        if len(cut_query) > max_cut_query_lenth:
            max_cut_query_lenth = len(cut_query)
        train_vocabulary_list += cut_query
    return train_vocabulary_list, max_cut_query_lenth

train_vocabulary_list, max_cut_query_lenth = get_all_vocabulary(train_data_df, 'cut_query')
print('Number of words:', len(train_vocabulary_list))

Number of words: 11498


In [286]:
print('Max_cut_query_lenth:', max_cut_query_lenth)

Max_cut_query_lenth: 26


In [287]:
test_vocabulary_list, test_max_cut_query_lenth = get_all_vocabulary(test_data_df, 'cut_query')
print('Test_max_cut_query_lenth:', test_max_cut_query_lenth)

Test_max_cut_query_lenth: 46


In [288]:
train_vocabulary_list[:10]

['今天', '东莞', '天气', '如何', '从', '观音桥', '到', '重庆市', '图书馆', '怎么']

In [289]:
train_vocabulary_counter = collections.Counter(train_vocabulary_list)
print('Number of different words:', len(train_vocabulary_counter.keys()))

Number of different words: 2887


In [290]:
# 不同种类的词汇个数，预留一个位置给不存在的词汇（不存在的词汇标记为0）

max_features = len(train_vocabulary_counter.keys()) + 1
max_features

2888

In [291]:
# 频率最高的10个字
train_vocabulary_counter.most_common(10)

[('的', 605),
 ('。', 341),
 ('我', 320),
 ('你', 297),
 ('怎么', 273),
 ('？', 251),
 ('什么', 210),
 ('到', 165),
 ('给', 154),
 ('做', 148)]

In [292]:
# 统计低频词语
words_times_zero = 0
for word, words_times in train_vocabulary_counter.items():
    if words_times <= 1:
        words_times_zero += 1
print('Word_times_zero:', words_times_zero)
print('Wors_times_zero/all:', words_times_zero / len(train_vocabulary_counter))

Word_times_zero: 1978
Wors_times_zero/all: 0.685140284031867


In [293]:
# 制作词汇字典

def create_train_vocabulary_dict(train_vocabulary_counter):
    word_index, index_word = {}, {}
    index_number = 1
    for word, words_time in train_vocabulary_counter.most_common():
        word_index[word] = index_number
        index_word[index_number] = word
        index_number += 1
    return word_index, index_word

word_index_dict, index_word_dict = create_train_vocabulary_dict(train_vocabulary_counter)

print(word_index_dict['我'], word_index_dict['。'])
        

3 2


In [294]:
print(index_word_dict[55], index_word_dict[1])

播放 的


In [295]:
pq = 0
for index, row in train_data_df.iteritems():
    print(row[0], row[1], row[2])
    pq += 1
    if pq == 10:
        break

今天东莞天气如何 从观音桥到重庆市图书馆怎么走 鸭蛋怎么腌？
weather map cookbook
['今天', '东莞', '天气', '如何'] ['从', '观音桥', '到', '重庆市', '图书馆', '怎么', '走'] ['鸭蛋', '怎么', '腌', '？']


In [296]:
word_index_dict.get('我们', 0)

371

In [297]:
# 向量化数据
def vectorize_data(data, label_index_dict, word_index_dict, max_cut_query_lenth):
    x_train = []
    y_train = []
    for index, row in data.iterrows():
        query_sentence = row[2]
        label = row[1]
        # 字典找不到的情况下用 0 填充
        x = [word_index_dict.get(w, 0) for w in query_sentence]
        y = [label_index_dict[label]]
        x_train.append(x)
        y_train.append(y)
    return (pad_sequences(x_train, maxlen=max_cut_query_lenth), pad_sequences(y_train, maxlen=1))

x_train, y_train = vectorize_data(train_data_df, label_index_dict, word_index_dict, max_cut_query_lenth)
x_test, y_test = vectorize_data(test_data_df, label_index_dict, word_index_dict, test_max_cut_query_lenth)
print(x_train[0], y_train[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0  33 318  27  90] [6]


In [298]:
y_train = to_categorical(y_train, label_numbers)
y_test = to_categorical(y_test, label_numbers)
print(x_train.shape, y_train.shape)


(2299, 26) (2299, 31)


In [299]:
print(x_test.shape, y_test.shape)

(770, 46) (770, 31)


In [300]:
# 存储预处理过的数据
print(type(x_test))
np.savez("preprocessed_data", x_train, y_train, x_test, y_test)

<class 'numpy.ndarray'>


In [301]:
# 直接加载预处理的数据

use_preprocessed_data = True

if use_preprocessed_data == True:
    preprocessed_data = np.load('preprocessed_data.npz')
    x_train, y_train, x_test, y_test = preprocessed_data['arr_0'], preprocessed_data['arr_1'], preprocessed_data['arr_2'], preprocessed_data['arr_3'],

print(x_train.shape, y_train.shape)

(2299, 26) (2299, 31)


In [302]:
# 计算 F1 值的函数
def f1(y_true, y_pred):
    def recall(y_true, y_pred):       # 召回指标：仅计算召回的批量平均。计算召回率，这是一种多标签分类的指标选择了多少个相关项目
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):        # 精确度指标:仅计算精度的批量平均值。，这是用于多标签分类的指标有多少个相关的选定项目。
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision+recall + K.epsilon()))

# 设计模型

def creat_lstm_model(max_features, max_cut_query_lenth, label_numbers):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=32, input_length=max_cut_query_lenth))
    model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(label_numbers, activation='softmax'))
    # 尝试使用不同的优化器和不同的优化器配置
    model.compile(loss='categorical_crossentropy',  # categorical_crossentropy：亦称作多类的对数损失，注意使用该目标函数时，需要将标签转化为形如(nb_samples, nb_classes)的二值序列
                  optimizer='adam',
                  metrics=[f1])   # 指标
    plot_model(model, to_file='LSTM_model.png', show_shapes=True)
    
    return model

In [303]:
# 获取自定义时间格式的字符串
def get_customization_time():
    # return '2020_02_20_20_20_20' 年月日时分秒
    time_tuple = time.localtime(time.time())
    customization_time = "{}_{}_{}_{}_{}_{}".format(time_tuple[0], time_tuple[1], time_tuple[2], time_tuple[3], time_tuple[4], time_tuple[5])
    return customization_time

# 训练模型

if 'max_features'  not in  dir():
    max_features = 2888
    print('Not find max_features variable, use default max_features values:\t{}'.format(max_features))
if 'max_cut_query_lenth'  not in  dir():
    max_cut_query_lenth = 26
    print('Not find max_cut_query_lenth, use default max_features values:\t{}'.format(max_cut_query_lenth))
if 'label_numbers'  not in  dir():
    label_numbers = 31
    print('Not find label_numbers, use default max_features values:\t{}'.format(label_numbers))
    
model = creat_lstm_model(max_features, max_cut_query_lenth, label_numbers)
batch_size = 20  # 批次大小
epochs = 30   # 周期 

print('Train...')
model.fit(x_train, y_train, 
          batch_size=batch_size, 
          epochs=epochs, 
          callbacks=[TensorBoard(log_dir='../logs/{}'.format("lstm_{}".format(get_customization_time())))],
          validation_split=0.2)

Train...
Train on 1839 samples, validate on 460 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x14618139308>

In [305]:
# 评估模型

#score = model.evaluate(x_test, y_test,
                        #batch_size=batch_size, verbose=1)

#print('Test score:', score[0])
#print('Test f1:', score[1])



ValueError: Error when checking input: expected embedding_5_input to have shape (26,) but got array with shape (46,)

In [306]:
# 将 one-hot 张量转换成对应的整数

y_true = np.argmax(y_test, axis=1).tolist()  # 目标值
y_pred = np.argmax(model.predict(x_test), axis=1).tolist()   # 估计值

#查看多分类的 准确率、召回率、F1 值
print(classification_report(y_true, y_pred))


ValueError: Error when checking input: expected embedding_5_input to have shape (26,) but got array with shape (46,)