## This file is about how to use convolutional neural network to build a classifier

In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re

In [2]:
from bs4 import BeautifulSoup

In [3]:
import sys
import os

In [4]:
import keras

CNTK is the backend of Keras. It can also be switched to tensorflow.

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense,Input,Flatten
from keras.layers import Conv1D, MaxPooling1D,Dropout,Concatenate
from keras.models import Model,Sequential

Using CNTK backend


## (1)Set global parameters

In [6]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

## (2) Data Input
Data are from [IMDB Dataset](https://www.kaggle.com/c/word2vec-nlp-tutorial/data)

In [7]:
data_train = pd.read_csv('data/labeledTrainData.tsv',sep='\t')
data_train.shape

(25000, 3)

In [8]:
data_train[0:3]

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


## (3) Data preprocess -- remove some characters
Use BeautifulSoup to remove some html tags and remove some unwanted characters.

In [9]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\","",string)
    string = re.sub(r"\'","",string)
    string = re.sub(r"\"","",string)
    return string.strip().lower()

texts=[]
labels=[]
for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx],'lxml')
    texts.append(clean_str(text.get_text()))
    labels.append(data_train.sentiment[idx])

## (3) Data preprocess -- data and label
Use Keras function to process the data

In [10]:
tokenizer=Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 81501 unique tokens.


Sequences is a list of list, which contains 25000 reviews. Each review is a list of its words.
Then pads each sequence to the same length.

In [11]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [12]:
labels = to_categorical(np.asarray(labels))

In [13]:
print('Shape of data tensor: ', data.shape)
print('Shape of label tensor: ',labels.shape)

Shape of data tensor:  (25000, 1000)
Shape of label tensor:  (25000, 2)


## (3) Data preprocess -- train data and validation data

In [14]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [15]:
print ('Number of negative and positive reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of negative and positive reviews in training and validation set
[  9946.  10054.]
[ 2554.  2446.]


## (4) Create CNN model

In [16]:
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(35))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 100)         8150200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1, 128)            0         
__________

## (5) Training

In [17]:
print("--------------model fitting - convolutional 1D neural network---------------")
print("--the process of fitting is ignored here")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=128)

--------------model fitting - convolutional 1D neural network---------------
--the process of fitting is ignored here


## (6) A more complex CNN model
In Yoon Kim’s paper, multiple filters have been applied.

In [18]:
#-----------------------Complex CNN -------------------------
print ('---Start to run Complex CNN model--------------:')

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

convs = []
filter_sizes = [3,4,5]

for fsz in filter_sizes:
    l_conv = Conv1D(filters=128,kernel_size=fsz,activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(5)(l_conv)
    convs.append(l_pool)

l_merge = Concatenate(axis=1)(convs)
l_cov1 = Conv1D(128,5,activation='relu')(l_merge)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128,5,activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128,activation='relu')(l_flat)
out = Dense(2,activation='softmax')(l_dense)

model2 = Model(sequence_input,out)
model2.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - complex CNN network")
model2.summary()

---Start to run Complex CNN model--------------:
model fitting - complex CNN network
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1000, 100)     8150200     input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_4 (Conv1D)                (None, 998, 128)      38528       embedding_2[0][0]                
____________________________________________________________________________________________________
conv1d_5 (Conv1D)                (None, 997, 128)      51328       embedding_2[0][0]                
______

In [19]:
print("---------------model fitting - complex CNN network--------------")
print("----------The fitting process is ignored here.")
model2.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=50)

---------------model fitting - complex CNN network--------------
----------The fitting process is ignored here.
