In [1]:
import numpy as np
import re
import pandas as pd
import string as s

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"[!#$%&'()*+,-./:;<=>?@[\]^_`{|}~]",' ',string)
    return string.strip().lower()

In [3]:
def load_data(filename):
    data=list(open(filename,'r',encoding='utf-8').readlines())
    labels=[]
    x_text=[]
    for t in data:
        t=t.split('+++$+++')
        labels.append(float(t[0].strip()))
        x_text.append(clean_str(t[1]))
    return labels,x_text

In [4]:
labels,x_text=load_data('data/training_label.csv')
print(len(labels),len(x_text))

200000 200000


In [5]:
from nltk.corpus import stopwords
def remove_stopwords(string):
    words=[word for word in string.split(' ') if word not in stopwords.words('english')]
    return " ".join(words)

In [6]:
print(x_text[230])

thanks so much for your donation for chance ur da best     such a hard blow after working so hard to save him


In [7]:
print(remove_stopwords(x_text[230]))

thanks much donation chance ur da best     hard blow working hard save


In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [9]:
#输出最大的文本有多少词
max_length=max([len(x.split(' ')) for x in x_text])
print(max_length)

190


In [10]:
maxlen=120
validation_samples=5000
training_samples=len(labels)-validation_samples
max_words=20000
embedding_dim=200     #词向量的维度
num_filters=200      #每种卷积的数量，3种卷积，共600个

tokenizer=Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_text)
sequences=tokenizer.texts_to_sequences(x_text)

In [11]:
word_index=tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 81088 unique tokens.


In [12]:
data=pad_sequences(sequences,maxlen=maxlen)
labels=np.array(labels)

In [13]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (200000, 120)
Shape of label tensor: (200000,)


In [14]:
#划分训练集和验证集
x_train=data[:training_samples]
y_train=labels[:training_samples]

x_val=data[training_samples:]
y_val=labels[training_samples:]

In [15]:
print(x_train.shape,y_train.shape,x_val.shape,y_val.shape)

(195000, 120) (195000,) (5000, 120) (5000,)


In [16]:
#模型定义
from keras.models import Model
from keras import layers
from keras import Input

text_input=Input(shape=(None,),dtype='int32',name='text')
embedded_text=layers.Embedding(max_words,embedding_dim,input_length=maxlen)(text_input)
#三种卷积[3,4,5]
conv1_3=layers.Conv1D(num_filters,3,activation='relu')(embedded_text)
conv1_4=layers.Conv1D(num_filters,4,activation='relu')(embedded_text)
conv1_5=layers.Conv1D(num_filters,5,activation='relu')(embedded_text)
#最大池化
maxpool_3=layers.MaxPool1D(maxlen-3+1)(conv1_3)
maxpool_4=layers.MaxPool1D(maxlen-4+1)(conv1_4)
maxpool_5=layers.MaxPool1D(maxlen-5+1)(conv1_5)
#拼接
concatenated=layers.concatenate([maxpool_3,maxpool_4,maxpool_5],axis=-1)
#全连接层
x = layers.Dense(30, activation='relu')(concatenated)
# #dropout
x=layers.Dropout(0.5)(x)
# #平铺
x=layers.Flatten()(x)
#分类
output=layers.Dense(1,activation='sigmoid')(x)
model=Model(text_input,output)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 120, 100)     2000000     text[0][0]                       
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 118, 200)     60200       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 117, 200)     80200       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [16]:
#模型定义
from keras.models import Model
from keras import layers
from keras import Input

embedding_dim=200     #词向量的维度
num_filters=200      #每种卷积的数量，3种卷积，共600个
max_words=10000
maxlen=120
text_input=Input(shape=(None,),dtype='int32',name='text')
embedded_text=layers.Embedding(max_words,embedding_dim,input_length=maxlen)(text_input)
x=layers.Bidirectional(layers.LSTM(100,activation='tanh',return_sequences=False, dropout=0.5, recurrent_dropout=0.1))(embedded_text)
x=layers.Dropout(0.5)(x)
x=layers.Dense(30,activation='relu')(x)
output=layers.Dense(1,activation='sigmoid')(x)
model=Model(text_input,output)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 120, 200)          2000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               240800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                6030      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 2,246,861
Trainable params: 2,246,861
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])

In [18]:
history=model.fit(x_train,y_train,epochs=3,batch_size=128,validation_data=(x_val,y_val))

Train on 195000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [18]:
history=model.fit(x_train,y_train,epochs=3,batch_size=128,validation_data=(x_val,y_val))

Train on 195000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:
from keras import backend as K
class ConvInputLayer(layers.Layer):
    """
    Distribute word vectors into chunks - input for the convolution operation
    Input dim: [batch_size x sentence_len x word_vec_dim]
    Output dim: [batch_size x (sentence_len - filter_width + 1) x filter_width x word_vec_dim]
    """
    def __init__(self, filter_width, sent_len, **kwargs):
        super(ConvInputLayer, self).__init__(**kwargs)
        self.filter_width = filter_width
        self.sent_len = sent_len

    def call(self, x):
        chunks = []
        for i in range(self.sent_len - self.filter_width + 1):
            chunk = x[:, i:i+self.filter_width, :]
            chunk = K.expand_dims(chunk, 1)
            chunks.append(chunk)
        return K.concatenate(chunks, 1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.sent_len - self.filter_width + 1, self.filter_width, input_shape[-1])


In [26]:
filter_width=5

#以LSTM为卷积的filter
input_lstm_filter=Input(shape=(None,),dtype='int32',name='text')
embedded_text=layers.Embedding(max_words,embedding_dim,input_length=maxlen)(input_lstm_filter)
x=layers.Dropout(0.5)(embedded_text)
emb_layer=ConvInputLayer(filter_width, maxlen)(x)
conv_layer = layers.TimeDistributed(layers.LSTM(300, dropout=0.4, recurrent_dropout=0.4))(emb_layer)
text_layer = layers.GlobalMaxPooling1D()(conv_layer)
output=layers.Dense(1,activation='sigmoid')(text_layer)
model=Model(input_lstm_filter,output)
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            (None, None)              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 120, 200)          2000000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 120, 200)          0         
_________________________________________________________________
conv_input_layer_3 (ConvInpu (None, 116, 5, 200)       0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 116, 300)          601200    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 301       
Total para

In [27]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
history=model.fit(x_train,y_train,epochs=3,batch_size=128,validation_data=(x_val,y_val))

Train on 195000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
