In [93]:
import numpy as np
import pandas as pd
import collections
import pickle
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences  #序列预处理 序列填充
from keras.utils import to_categorical,plot_model   # 将类别向量转换为二进制（只有0和1）的矩阵类型表示
from keras.models import Sequential   # 序贯模型是函数式模型的简略版，为最简单的线性、从头到尾的结构顺序，不分叉，是多个网络层的线性堆叠
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import TensorBoard, Callback

from keras import backend as K

import time
from sklearn.metrics import classification_report

In [94]:
def get_json_data(path):
    """Load a JSON dataset and return its ``query`` and ``label`` columns.

    The file is read with ``pd.read_json`` and transposed before the two
    columns are selected, so after transposition each row is one sample.
    (Presumably the file maps a sample id to a record holding ``query`` and
    ``label`` -- confirm against the data files.)

    Args:
        path: Location of the JSON file, passed straight to ``pd.read_json``.

    Returns:
        A DataFrame containing only the ``query`` and ``label`` columns.
    """
    wanted_columns = ['query', 'label']
    # Read, flip rows/columns, then keep only the two columns of interest.
    return pd.read_json(path).transpose()[wanted_columns]

# Load the training split and the dev split (the dev split is used as test data below).
train_data_df, test_data_df = (
    get_json_data(path=json_path) for json_path in ("train.json", "dev.json")
)

train_data_df.head()

Unnamed: 0,query,label
0,今天东莞天气如何,weather
1,从观音桥到重庆市图书馆怎么走,map
2,鸭蛋怎么腌？,cookbook
3,怎么治疗牛皮癣,health
4,唠什么,chat


In [95]:
# jieba word segmentation: quick demonstration on one sample sentence

demo_sentence = "他来到了网易杭研大厦"
seg_list = jieba.cut(demo_sentence)  # precise mode is the default
list(seg_list)

['他', '来到', '了', '网易', '杭研', '大厦']

In [96]:
# Word segmentation helper

def use_jieba_cut(one_sentence):
    """Segment one sentence with ``jieba.cut`` and return the tokens as a list."""
    return [token for token in jieba.cut(one_sentence)]

# Add a `cut_query` column (list of segmented tokens) to both splits.
for frame in (train_data_df, test_data_df):
    frame['cut_query'] = frame['query'].apply(use_jieba_cut)

train_data_df.head(10)


Unnamed: 0,query,label,cut_query
0,今天东莞天气如何,weather,"[今天, 东莞, 天气, 如何]"
1,从观音桥到重庆市图书馆怎么走,map,"[从, 观音桥, 到, 重庆市, 图书馆, 怎么, 走]"
2,鸭蛋怎么腌？,cookbook,"[鸭蛋, 怎么, 腌, ？]"
3,怎么治疗牛皮癣,health,"[怎么, 治疗, 牛皮癣]"
4,唠什么,chat,"[唠, 什么]"
5,阳澄湖大闸蟹的做法。,cookbook,"[阳澄湖, 大闸蟹, 的, 做法, 。]"
6,昆山大润发在哪里,map,"[昆山, 大润发, 在, 哪里]"
7,红烧肉怎么做？嗯？,cookbook,"[红烧肉, 怎么, 做, ？, 嗯, ？]"
8,南京到厦门的火车票,train,"[南京, 到, 厦门, 的, 火车票]"
9,6的平方,calc,"[6, 的, 平方]"


In [97]:
# Feature processing: build the word index from the segmented training queries only

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_df['cut_query'])
# Vocabulary size; used later to size the embedding layer.
max_features = len(tokenizer.index_word)
max_features

2883

In [98]:
# Fixed sequence length fed to the model; every query is padded/truncated to it.
max_cut_query_lenth = 26

# Convert token lists to integer id sequences, then pad to the fixed length.
x_train = pad_sequences(
    tokenizer.texts_to_sequences(train_data_df['cut_query']),
    maxlen=max_cut_query_lenth,
)
x_test = pad_sequences(
    tokenizer.texts_to_sequences(test_data_df['cut_query']),
    maxlen=max_cut_query_lenth,
)

In [99]:
# Label processing: a second Tokenizer maps each label string to an integer id

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(train_data_df['label'])
# Number of distinct labels seen in the training split (kept under both names).
NUM_CLASSES = label_numbers = len(label_tokenizer.word_counts)
label_tokenizer.word_counts

OrderedDict([('weather', 66),
             ('map', 68),
             ('cookbook', 269),
             ('health', 55),
             ('chat', 455),
             ('train', 70),
             ('calc', 24),
             ('translation', 61),
             ('music', 66),
             ('tvchannel', 71),
             ('poetry', 102),
             ('telephone', 63),
             ('stock', 71),
             ('radio', 24),
             ('contacts', 30),
             ('lottery', 24),
             ('website', 54),
             ('video', 182),
             ('news', 58),
             ('bus', 24),
             ('app', 53),
             ('flight', 62),
             ('epg', 107),
             ('message', 63),
             ('match', 24),
             ('schedule', 29),
             ('novel', 24),
             ('riddle', 34),
             ('email', 24),
             ('datetime', 18),
             ('cinemas', 24)])

In [100]:
# Map every training label string to its tokenizer id; the result is a list
# of one-element lists, one per sample.
y_train = label_tokenizer.texts_to_sequences(train_data_df['label'])
y_train[:10]

[[10], [9], [2], [17], [1], [2], [9], [2], [8], [23]]

In [101]:
# Shift the label ids down by one so that they start at 0.
# NOTE: this rebinds y_train from itself, so re-running only this cell shifts
# the ids a second time; re-run the previous cell first.
y_train = [[label_ids[0] - 1] for label_ids in y_train]
y_train[:10]

[[9], [8], [1], [16], [0], [1], [8], [1], [7], [22]]

In [102]:
# One-hot encode the 0-based training label ids.
y_train = to_categorical(y_train, num_classes=label_numbers)
y_train.shape

(2299, 31)

In [103]:
# Encode the test labels like the training labels: tokenizer id, shifted to
# start at 0, then one-hot encoded.
test_label_ids = label_tokenizer.texts_to_sequences(test_data_df['label'])
y_test = to_categorical([ids[0] - 1 for ids in test_label_ids], label_numbers)
y_test.shape

(770, 31)

In [104]:
# Inspect the one-hot vector of the first test label.
y_test[0]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [105]:
# Model design

# Custom F1 metric for Keras
def f1(y_true, y_pred):
    """Compute an F1 score from the given target and prediction tensors.

    Both tensors are clipped to [0, 1] and rounded, so every entry counts as a
    hard 0/1 decision. Precision and recall are then formed from the summed
    counts and combined as their harmonic mean. ``K.epsilon()`` is added to
    each denominator to avoid division by zero.

    Note: when used as a Keras metric this is presumably evaluated per batch
    and averaged by Keras, so it is an approximation of the dataset-level F1.

    Args:
        y_true: Ground-truth tensor (one-hot targets in this notebook).
        y_pred: Predicted tensor (softmax outputs in this notebook).

    Returns:
        A scalar tensor holding the F1 score.
    """
    # Entries that are 1 in both the target and the rounded prediction.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    # Entries that are 1 in the target.
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # Entries that round to 1 in the prediction.
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = hits / (predicted_positives + K.epsilon())
    recall_value = hits / (actual_positives + K.epsilon())
    return 2 * ((precision_value * recall_value) / (precision_value + recall_value + K.epsilon()))

# Model definition

def create_lstm_model(max_features, max_cut_query_lenth, label_numbers):
    """Build and compile the Embedding -> LSTM -> softmax classifier.

    As a side effect, a diagram of the model is written to
    ``lstm_model_2.png`` via ``plot_model``.

    Args:
        max_features: Vocabulary size; the embedding gets ``max_features + 1``
            rows (presumably to leave room for index 0, which padding uses).
        max_cut_query_lenth: Length of each padded input sequence.
        label_numbers: Number of output classes.

    Returns:
        The compiled ``Sequential`` model, tracked with the custom ``f1`` metric.
    """
    layers = [
        Embedding(input_dim=max_features + 1, output_dim=32, input_length=max_cut_query_lenth),
        LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
        Dense(label_numbers, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=[f1])

    # Save an architecture diagram next to the notebook.
    plot_model(model, show_shapes=True, to_file='lstm_model_2.png')

    return model

In [106]:
# Build a timestamp string in a custom format
def get_customization_time():
    """Return the current local time as ``YYYY_MM_DD_HH_MM_SS``.

    Every field is zero-padded (e.g. ``'2020_02_20_20_20_20'``), so the
    strings have a fixed width and sort chronologically. The previous
    implementation formatted the raw integers and produced unpadded values
    such as ``'2020_2_20_...'``, which did not match the documented format.

    Returns:
        The formatted timestamp string: year, month, day, hour, minute, second.
    """
    return time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())

# Train the model

# Fallback values in case this cell is run without the preprocessing cells
# that normally define these variables. Each message now names the variable
# it is actually about (the last two used to say "max_features").
if 'max_features' not in dir():
    max_features = 2888
    print('Not find max_features variable, use default max_features values:\t{}'.format(max_features))
if 'max_cut_query_lenth' not in dir():
    max_cut_query_lenth = 26
    print('Not find max_cut_query_lenth, use default max_cut_query_lenth values:\t{}'.format(max_cut_query_lenth))
if 'label_numbers' not in dir():
    label_numbers = 31
    print('Not find label_numbers, use default label_numbers values:\t{}'.format(label_numbers))

model = create_lstm_model(max_features, max_cut_query_lenth, label_numbers)

batch_size = 20  # samples per gradient update
epochs = 30      # full passes over the training data

print('Train...')
# Kept as the last expression so the returned History object is displayed.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs)

Train...
Train...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x14275302e88>

In [107]:
# Persist the model in two parts: the architecture (a JSON string, stored
# pickled) and the trained weights (HDF5).
json_string = model.to_json()
with open("model_json_2.pkl", 'wb') as fo:
    fo.write(pickle.dumps(json_string))

model.save_weights('my_model_2.h5')

In [108]:
# Model evaluation

# model.evaluate returns the loss followed by the compiled metrics. This model
# is compiled with metrics=[f1], so the second value is the custom F1 metric,
# not classification accuracy.
loss, test_f1 = model.evaluate(x_test, y_test,
                               batch_size=batch_size, verbose=1)
# Legacy name kept for any later cell that reads it; NOTE it holds F1.
accuracy = test_f1

print('Test loss：', loss)
print('F1:', test_f1)

Test loss： 0.8105083657549573
Accuracy: 0.846634566783905


In [109]:
# Predict class probabilities for the test set
y_pred_test = model.predict(x_test)

# Collapse one-hot targets and probability rows to class indices

y_true = y_test.argmax(axis=1).tolist()
y_pred = y_pred_test.argmax(axis=1).tolist()

# Per-class precision, recall and F1

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.91      0.83       154
           1       0.95      0.93      0.94        89
           2       0.60      0.63      0.62        60
           3       0.85      0.81      0.83        36
           4       0.85      1.00      0.92        34
           5       0.80      0.70      0.74        23
           6       0.91      0.83      0.87        24
           7       0.89      1.00      0.94        24
           8       0.81      0.57      0.67        23
           9       0.95      0.91      0.93        22
          10       0.93      0.59      0.72        22
          11       0.83      0.95      0.89        21
          12       1.00      0.90      0.95        21
          13       0.95      0.95      0.95        21
          14       1.00      0.95      0.98        21
          15       0.86      0.95      0.90        20
          16       0.79      0.58      0.67        19
          17       0.65    