## CNN用于文本分类


![原理图](textcnn.png)
上图中，输入是一个长度为7、词向量维度d=5的句子，表示为7x5的矩阵；1D卷积核的长度分别为(2,3,4)，每种长度各两个（宽度均为d）。经过卷积和激活函数后，分别产生(6x1, 5x1, 4x1)的feature map各两个；每个feature map经过一次1D max pooling（即取该feature map的最大值）后，再concatenate为一个6x1的1D向量，最后经过一个全连接层和softmax激活，即可进行情感分类预测。



- 先把原始的文本处理成长度为2000的词id序列，太长的截断，不够的补0
- 生成300维的词嵌入
- CNN：长度为3、4、5的卷积核各256个，宽度为300，分别产生1998x1、1997x1、1996x1的feature map各256个
- 每个feature map经过1D max pooling以后，再concatenate成一个768x1的1D向量，flatten后输入给softmax
- 输出与分类one hot编码对应的各类别概率


- 结论
    - 有多少个卷积核，第二层的输出维度就是多少
    - concatenate以后的向量维度，与卷积核的总数一致

In [1]:
# 3 kernel sizes x 256 filters each: the width of the concatenated feature vector.
256 * 3

768

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding,Input
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.layers.merge import concatenate
from keras.models import Sequential, Model



### Raw input
# Data file locations noted by the author:
#   ~/Downloads/train_set.csv
#   ~/workspace/sublime/daguan/train_sample.csv

# Both paths currently point at the same sample file.
train_path = '~/workspace/sublime/daguan/train_sample.csv'
test_path = '~/workspace/sublime/daguan/train_sample.csv'

# Sequence length used for padding and width of the embedding vectors.
doc_len = 2000
embedding_dim = 300


print('read data')
df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

# Remove the columns that are not used below; df_test keeps 'id' because the
# result file is written with it.
df_train = df_train.drop(columns=['article', 'id'])
df_test = df_test.drop(columns=['article'])


# Model input is the 'word_seg' column; the target is the 'class' column
# shifted down by one.
word_seg = df_train['word_seg']
label = df_train['class'] - 1
X_train, X_test, y_train, y_test = train_test_split(word_seg, label, test_size=0.1, random_state=42)

### Label encoding
# Fit the encoder on every label rather than on the training split only:
# a class that happens to land exclusively in the held-out split would
# otherwise make le.transform() raise on y_test.
le = preprocessing.LabelEncoder()
le.fit(label)
y_labels = list(le.classes_)
num_labels = len(y_labels)

# LabelEncoder.transform is vectorised, so encode each split in one call
# instead of once per row, then expand to one-hot rows of width num_labels.
y_train = to_categorical(le.transform(y_train), num_labels)
y_test = to_categorical(le.transform(y_test), num_labels)


# Fit the tokenizer on `word_seg`, splitting on single spaces, and keep its
# word -> index mapping as the vocabulary.
tokenizer = Tokenizer(split=' ')
tokenizer.fit_on_texts(word_seg)
vocab = tokenizer.word_index

# Input conversion: texts -> sequences of word ids.
x_train_word_ids, x_test_word_ids = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Padding fills with zeros at the front (pad_sequences default), so every
# sequence ends up with length doc_len.
x_train_padded_seqs, x_test_padded_seqs = (
    pad_sequences(word_ids, maxlen=doc_len)
    for word_ids in (x_train_word_ids, x_test_word_ids)
)




### TextCNN model
# The input holds integer word ids, so declare it as int32 (the Embedding
# layer looks rows up by integer index) instead of float64.
main_input = Input(shape=(doc_len,), dtype='int32')

# Index 0 is the padding value, hence len(vocab) + 1 rows in the table.
embedder = Embedding(len(vocab) + 1, embedding_dim, input_length = doc_len)
embed = embedder(main_input)

# Three parallel branches over the same embedding, one per kernel size,
# each with 256 filters followed by max pooling over windows of 4 steps.
# Layers are created in the same order as the unrolled version
# (conv, pool, conv, pool, ...).
# NOTE(review): with padding='same' and pool_size=4 each branch yields
# (doc_len // 4, 256), so the flattened vector has (doc_len // 4) * 768
# entries, not 768. A 768-wide vector would need global max pooling --
# confirm which architecture is intended.
kernel_sizes = (3, 4, 5)
branches = []
for kernel_size in kernel_sizes:
    conv = Convolution1D(256, kernel_size, padding='same', strides = 1, activation='relu')(embed)
    branches.append(MaxPool1D(pool_size=4)(conv))
cnn1, cnn2, cnn3 = branches

# Stack the branches along the channel axis.
cnn = concatenate(branches, axis=-1)


flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)

# One softmax unit per class.
main_output = Dense(num_labels, activation='softmax')(drop)

model = Model(inputs = main_input, outputs = main_output)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train, reporting loss/accuracy on the held-out split after every epoch.
model.fit(x_train_padded_seqs, y_train,
          batch_size=32,
          epochs=12,
          validation_data=(x_test_padded_seqs, y_test))

#model.save('textcnn.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

# Evaluation: score holds [loss, accuracy], matching the compiled metrics.
score = model.evaluate(x_test_padded_seqs, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])



## 特征转换
xx_test_word_ids = tokenizer.texts_to_sequences(df_test['word_seg'])
xx_test_padded_seqs = pad_sequences(xx_test_word_ids, maxlen=doc_len)

## 预测
pred_prob = model.predict(xx_test_padded_seqs)
pred = pred_prob.argmax(axis=1)


## 结果保存
df_test['class'] = pred.tolist()
df_test['class'] = df_test['class'] + 1
df_result = df_test.loc[:, ['id','class']]
df_result.to_csv('./textcnn.csv',index=False)


In [5]:
import numpy as np
import cv2
import keras.backend as K
import tensorflow as tf


# Two 1x3 backend variables used to compare concatenation along different axes.
a = K.variable(np.array([[1, 2, 3]]))
b = K.variable(np.array([[3, 2, 1]]))

pair = [a, b]
c1 = K.concatenate(pair, axis=0)  # join along axis 0
c2 = K.concatenate(pair, axis=1)  # join along axis 1
c3 = K.concatenate(pair)          # no axis given: backend default

# Variables must be initialised before they can be evaluated in a session.
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for joined in (c1, c2, c3):
        print(sess.run(joined))

[[1. 2. 3.]
 [3. 2. 1.]]
[[1. 2. 3. 3. 2. 1.]]
[[1. 2. 3. 3. 2. 1.]]
