In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.layers import BatchNormalization
from keras.layers.merge import concatenate

Using TensorFlow backend.


In [2]:
df = pd.read_csv('./dataset/PosNeg.csv')
df.head()

Unnamed: 0,Question,Label
0,who was the american general in the pacific du...,Positive
1,what years the steelers won the super bowl,Positive
2,what was the name of the first label elvis rec...,Positive
3,what was the first space shuttle to fly,Positive
4,who was the first governor in connecticut,Positive


In [3]:
title = df.Question
label = df.Label
X_train, X_test, y_train, y_test = train_test_split(title, label, test_size=0.1, random_state=42)

In [4]:
# MultinomialNB Classifier
vect = TfidfVectorizer(stop_words='english', token_pattern=r'\b\w{2,}\b', min_df=1, max_df=0.1, ngram_range=(1,2))                          # r: Raw String 字符串不会转义
mnb = MultinomialNB(alpha=2)              # alpha 平滑参数
svm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)         #random_state参数的作用是为了保证每次运行程序时都以同样的方式进行分割
mnb_pipeline = make_pipeline(vect, mnb)
svm_pipeline = make_pipeline(vect, svm)
mnb_cv = cross_val_score(mnb_pipeline, title, label, scoring='accuracy', cv=10, n_jobs=-1)
svm_cv = cross_val_score(svm_pipeline, title, label, scoring='accuracy', cv=10, n_jobs=-1)
print('\nMultinomialNB Classifier\'s Accuracy: %0.5f\n' % mnb_cv.mean())
print('\nSVM Classificer\'s Accuracy: %0.5f\n' % svm_cv.mean())


MultinomialNB Classifier's Accuracy: 0.83542


SVM Classificer's Accuracy: 0.75599



In [5]:
y_labels = list(y_train.value_counts().index)
le = preprocessing.LabelEncoder()
le.fit(y_labels)
num_labels = len(y_labels)
y_train = to_categorical(y_train.map(lambda x: le.transform([x])[0]), num_labels)
y_test = to_categorical(y_test.map(lambda x: le.transform([x])[0]), num_labels)

In [6]:
# load glove word embedding data
GLOVE_DIR = "./glove.6B"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [7]:
# take tokens and build word-in dictionary
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")
tokenizer.fit_on_texts(title)
vocab = tokenizer.word_index

In [8]:
# Match the word vector for each word in the data set from Glove
embedding_matrix = np.zeros((len(vocab)+1, 300))
for word, i in vocab.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [9]:
# Match the input format of the model
x_train_word_ids = tokenizer.texts_to_sequences(X_train)               #序列的列表，列表中每个序列对应于一段输入文本
x_test_word_ids = tokenizer.texts_to_sequences(X_test)
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=20)                #将序列转化为经过填充以后的一个长度相同的新序列
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=20)

In [10]:
# one-hot mlp
x_train = tokenizer.sequences_to_matrix(x_train_word_ids, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test_word_ids, mode='binary')

In [11]:
model = Sequential()
model.add(Dense(512, input_shape=(len(vocab)+1,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_labels, activation='softmax'))

model.compile(loss='categorical_crossentropy',
                                 optimizer='adam',
                                 metrics=['accuracy'])

model.fit(x_train, y_train,
                     batch_size=32,
                     epochs=15,
                     validation_data=(x_test, y_test))

Train on 17154 samples, validate on 1907 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7fee1b411f60>

In [12]:
# X_predict = ["who was the american general in the pacific during world war ii","where do guyanese people live","what is magic johnsons dads name"]
# model = load_model('./model/DenseModel.h5')
# x_predict_word_ids = tokenizer.texts_to_sequences(X_predict)
# x_predict = tokenizer.sequences_to_matrix(x_predict_word_ids, mode='binary')
# predict_test = model.predict(x_predict)
# print(np.argmax(predict_test,axis=1))

In [13]:
loss, accuracy = model.evaluate(x_test, y_test)
print('\ntest loss: ', loss)
print('\ntest accuracy: ', accuracy)


test loss:  0.1839279934978059

test accuracy:  0.9669638276100159


In [None]:
loss, accuracy = model.evaluate(x_train, y_train)
print('\ntest loss: ', loss)
print('\ntest accuracy: ', accuracy)

In [14]:
model.save('./model/DenseModel.h5')

In [15]:
# RNN model
model = Sequential()
model.add(Embedding(len(vocab)+1, 256, input_length=20))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(num_labels, activation='softmax'))

model.compile(loss='categorical_crossentropy',
                                 optimizer='adam',
                                 metrics=['accuracy'])

model.fit(x_train_padded_seqs, y_train,
                     batch_size=32,
                     epochs=12,
                     validation_data=(x_test_padded_seqs, y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 17154 samples, validate on 1907 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.callbacks.History at 0x7fed74044f98>

In [16]:
loss, accuracy = model.evaluate(x_test_padded_seqs, y_test)
print('\ntest loss: ', loss)
print('\ntest accuracy: ', accuracy)


test loss:  0.17920124676290022

test accuracy:  0.9727320671081543


In [17]:
model.save('./model/RNNModel.h5')

In [18]:
# CNN model
model = Sequential()
model.add(Embedding(len(vocab)+1, 256, input_length=20))

# Convolutional moedl (3x conv, flatten, 2x dense)
model.add(Convolution1D(256, 3, padding='same'))
model.add(MaxPool1D(3, 3, padding='same'))
model.add(Convolution1D(128, 3, padding='same'))
model.add(MaxPool1D(3, 3, padding='same'))
model.add(Convolution1D(64, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(num_labels, activation='softmax'))

model.compile(loss = 'categorical_crossentropy',
                                 optimizer='adam',
                                 metrics=['accuracy'])

model.fit(x_train_padded_seqs, y_train,
                     batch_size=32,
                     epochs=12,
                     validation_data=(x_test_padded_seqs, y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 17154 samples, validate on 1907 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.callbacks.History at 0x7fed308f6048>

In [19]:
loss, accuracy = model.evaluate(x_test_padded_seqs, y_test)
print('\ntest loss: ', loss)
print('\ntest accuracy: ', accuracy)


test loss:  0.1512668517655798

test accuracy:  0.9758783578872681


In [20]:
model.save('./model/CNNModel.h5')

In [21]:
# TextCNN
main_input = Input(shape=(20,), dtype='float64')
embedder = Embedding(len(vocab) + 1, 300, input_length = 20)
embed = embedder(main_input)
cnn1 = Convolution1D(256, 3, padding='same', strides = 1, activation='relu')(embed)
cnn1 = MaxPool1D(pool_size=4)(cnn1)
cnn2 = Convolution1D(256, 4, padding='same', strides = 1, activation='relu')(embed)
cnn2 = MaxPool1D(pool_size=4)(cnn2)
cnn3 = Convolution1D(256, 5, padding='same', strides = 1, activation='relu')(embed)
cnn3 = MaxPool1D(pool_size=4)(cnn3)
cnn = concatenate([cnn1,cnn2,cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
main_output = Dense(num_labels, activation='softmax')(drop)
model = Model(inputs = main_input, outputs = main_output)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train_padded_seqs, y_train,
          batch_size=32,
          epochs=12,
          validation_data=(x_test_padded_seqs, y_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 17154 samples, validate on 1907 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.callbacks.History at 0x7fed2bafa208>

In [22]:
loss, accuracy = model.evaluate(x_test_padded_seqs, y_test)
print('\ntest loss: ', loss)
print('\ntest accuracy: ', accuracy)


test loss:  0.3062174772522096

test accuracy:  0.966439425945282


In [23]:
model.save('./model/TextCNNModel.h5')