In [3]:
from collections import Counter
import keras as kr
import numpy as np
import tensorflow as tf
import os

train_dir = 'cnews/cnews.train.txt'
val_dir = 'cnews/cnews.val.txt'
test_dir = 'cnews/cnews.test.txt'
vocab_dir = 'cnews/cnews.vocab.txt'

def read_file(filename):
    #读取文本数据
    contents = []
    labels = []
    with open(filename,mode = 'r',encoding = 'utf-8',errors = 'ignore') as f:
        for line in f:
            try:
                label,content = line.strip().split('\t')
                if content:
                    contents.append(list(content))
                    labels.append(label)
            except:
                pass
    #print(contents)
    return contents,labels

def build_vocab(vocab_size):
    #构建词汇表
    data_train,data_trainlabel = read_file(train_dir)
    data_val,data_vallabel = read_file(val_dir)
    data_test,data_testlabel = read_file(test_dir)
    
    all_data = []
    for content in data_train:
        all_data.extend(content)
        
    for content in data_val:
        all_data.extend(content)
        
    for content in data_test:
        all_data.extend(content)
    
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size-1)
    words,_ = list(zip(*count_pairs))
    words =['<PAD>'] + list(words)
    open(vocab_dir,mode = 'w',encoding = 'utf-8',errors = 'ignore').write('\n'.join(words)+'\n')
    
#build_vocab()
def read_vocab():
    with open(vocab_dir,mode = 'r',encoding = 'utf-8', errors = 'ignore') as f:
        words = [x.strip() for x in f.readlines()]
        word_to_id = dict(zip(words,range(len(words))))
    return words,word_to_id

def read_category():
    categories = ['体育','财经','房产','家居','教育','科技','时尚','时政','游戏','娱乐']
    cat_to_id = dict(zip(categories,range(len(categories))))
    return categories,cat_to_id

def process_file(filename,word_to_id,cat_to_id,max_length):
    #将文件转成id表示
    contents,labels = read_file(filename)
    
    data_id,label_id = [],[]
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id,max_length)
    y_pad = kr.utils.to_categorical(label_id,num_classes = len(cat_to_id))
    
    return x_pad,y_pad

def batch_iter(x,y,batch_size):
    #生成批次数据
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i+1) * batch_size,data_len)
        yield x_shuffle[start_id:end_id],y_shuffle[start_id:end_id]

#获取输入数据
if not os.path.exists(vocab_dir):
    build_vocab(5000)

categories,cat_to_id = read_category()
words,word_to_id = read_vocab()
        
#CNN
embedding_dim = 64
seq_length = 600
num_classes = 10
vocab_size = len(words)
kernel_size = 5
num_filters = 128
filter_sizes = [3,4,5]
drop_keep_prob = 0.5
num_epochs = 200

x_train,y_train = process_file(train_dir,word_to_id,cat_to_id,seq_length)
x_val,y_val = process_file(val_dir,word_to_id,cat_to_id,seq_length)

#输入输出
input_x = tf.placeholder(shape = [None,seq_length],dtype = tf.int32,name = 'input_x')
input_y = tf.placeholder(shape = [None,num_classes],dtype = tf.float32,name = 'input_y')
keep_prob = tf.placeholder(tf.float32,name = 'keep_prob')

embedding = tf.Variable(tf.random_uniform([vocab_size,embedding_dim],-1.0,1.0),name='embedding')
embedding_inputs_chars = tf.nn.embedding_lookup(embedding,input_x)
embedding_inputs = tf.expand_dims(embedding_inputs_chars,-1)
    
pooled_outputs = []
for i,filter_size in enumerate(filter_sizes):
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        filter_shape = [filter_size,embedding_dim,1,num_filters]
        W = tf.Variable(tf.truncated_normal(filter_shape,stddev=0.1),name='W')
        b = tf.Variable(tf.constant(0.1,shape=[num_filters]),name='b')
        conv = tf.nn.conv2d(embedding_inputs,W,strides=[1,1,1,1],padding='VALID',name='conv')
        h = tf.nn.relu(tf.nn.bias_add(conv,b),name='relu')
        pooled = tf.nn.max_pool(h,ksize=[1,seq_length - filter_size + 1,1,1],strides=[1,1,1,1],padding='VALID',name='pool')
        pooled_outputs.append(pooled)
        
#获取全连接层的输入
num_filters_total = num_filters * len(filter_sizes)        
h_pool = tf.concat(pooled_outputs,3)
h_pool_flat = tf.reshape(h_pool,[-1,num_filters_total])

#连接层
with tf.name_scope('dropout'):
    h_drop = tf.nn.dropout(h_pool_flat,drop_keep_prob)
    
with tf.name_scope('output'):
    W = tf.Variable(tf.truncated_normal([num_filters_total,num_classes],stddev=0.1),name='W')
    b = tf.Variable(tf.constant(0.1,shape=[num_classes]),name='b')
    scores = tf.nn.xw_plus_b(h_drop,W,b,name='scores')
    predictions = tf.argmax(scores,1,name='predictions')
    
with tf.name_scope('loss'):
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = scores,labels = input_y))
    
with tf.name_scope('accuracy'):
    correct_predictions = tf.equal(predictions,tf.argmax(input_y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions,tf.float32),name='accuracy')

with tf.name_scope('optimize'):
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)

#gpu配置信息
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.7)
config = tf.ConfigProto(gpu_options=gpu_options,allow_soft_placement=True,log_device_placement=True)    

with tf.Session(config = config) as sess:                     
    tf.global_variables_initializer().run()
    for epoch in range(num_epochs):
        batch_train = batch_iter(x_train,y_train,embedding_dim)
        for x_batch,y_batch in batch_train:
            sess.run(train_op,feed_dict = {input_x:x_batch,input_y:y_batch,keep_prob:drop_keep_prob})
            #losses,acc = sess.run([loss,accuracy],feed_dict = {input_x:x_batch,input_y:y_batch,keep_prob:1.0})
        if epoch % 5 == 0:
            print (epoch)
            losses,acc = sess.run([loss,accuracy],feed_dict = {input_x:x_val,input_y:y_val,keep_prob:1.0})
            print('losses: ' + str(losses) + ' accuracy: ' + str(acc))

0
losses: 5.080528 accuracy: 0.1656
5
losses: 1.3990083 accuracy: 0.6506
10
losses: 0.83316475 accuracy: 0.766
15
losses: 0.6164927 accuracy: 0.814
20
losses: 0.45609835 accuracy: 0.8618
25
losses: 0.39389247 accuracy: 0.8808
30
losses: 0.3220664 accuracy: 0.9006
35
losses: 0.28851238 accuracy: 0.9158
40
losses: 0.26678145 accuracy: 0.9226
45
losses: 0.24778128 accuracy: 0.9252
50
losses: 0.23514155 accuracy: 0.9262
55
losses: 0.24281958 accuracy: 0.9288
60
losses: 0.2364804 accuracy: 0.9314
65
losses: 0.23303199 accuracy: 0.9344
70
losses: 0.24749583 accuracy: 0.93
75
losses: 0.23692307 accuracy: 0.934
80
losses: 0.25376824 accuracy: 0.9318
85
losses: 0.25586948 accuracy: 0.9332
90
losses: 0.23818973 accuracy: 0.9334
95
losses: 0.26053026 accuracy: 0.9314
100
losses: 0.24245317 accuracy: 0.9364
105
losses: 0.24811655 accuracy: 0.9418
110
losses: 0.2506592 accuracy: 0.9376
115
losses: 0.26194772 accuracy: 0.9376
120
losses: 0.25457016 accuracy: 0.9368
125
losses: 0.26645458 accuracy: 0