# Introduction: Document Classification with a CNN

In this notebook, we will use a one-dimensional convolutional neural network (CNN) to classify questions. This is an older approach that was once popular for text classification but has since largely given way to recurrent networks with LSTM cells.

In [1]:
# Restrict this kernel to a single GPU. Both variables are set here,
# before the TensorFlow import further down in this cell.
import os

os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # number the GPUs the same way nvidia-smi does
    "CUDA_VISIBLE_DEVICES": "1",        # e.g. "0,1" to expose several GPUs
})

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of every local device whose device type is 'GPU'.

    Devices are taken from `device_lib.list_local_devices()`; the result is
    an empty list when none of them is a GPU.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Show which GPU devices are visible after the CUDA_* environment settings above.
print(get_available_gpus())

from utils import load_data

# Load the word-level arrays and vocabulary maps, using the 'glove' embedding matrix.
(seq_arr, test_seq_arr, labels,
 word_index, index_word, vs, embedding_matrix) = load_data('word', 'glove')

# Last expression of the cell: display the shapes of the loaded arrays.
(seq_arr.shape, test_seq_arr.shape, embedding_matrix.shape)

['/device:GPU:0']


Using TensorFlow backend.


((1099063, 30), (56370, 30), (59728, 300))

In [4]:
from utils import f1
import re

from timeit import default_timer as timer
from keras.utils import multi_gpu_model
from keras import optimizers
from keras.models import *
from keras.layers import *
from keras.callbacks import *

import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


import numpy as np
import pandas as pd

## Word-Level Model

In [5]:
# Input: one row of integer token ids per question, as wide as the padded sequences.
sequence_input = Input(shape=(seq_arr.shape[1],), dtype='int32')

# Embedding lookup sized to, and initialised from, the loaded embedding matrix.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            name='embedding')

embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D -> MaxPooling1D stages.
l_cov1 = Conv1D(128, 2, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(2)(l_cov1)

l_cov2 = Conv1D(128, 2, activation='relu')(l_pool1)
# Pool size 1 leaves the sequence length unchanged (13 -> 13 in the summary below).
l_pool2 = MaxPooling1D(1)(l_cov2)

l_cov3 = Conv1D(128, 3, activation='relu')(l_pool2)
# With 30-token inputs the third convolution emits 11 steps, so this pools over all of them.
l_pool3 = MaxPooling1D(11)(l_cov3)

# Dense classification head with dropout and a single sigmoid output.
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
l_dense = Dropout(0.5)(l_dense)
preds = Dense(1, activation='sigmoid')(l_dense)

# `outputs` is the Keras 2 keyword; the original `output` spelling is the legacy Keras 1 name.
model = Model(inputs=[sequence_input], outputs=[preds])

model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy',
              metrics=['binary_crossentropy', 'accuracy', f1])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding (Embedding)        (None, 30, 300)           17918400  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 29, 128)           76928     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 14, 128)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 13, 128)           32896     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 13, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 11, 128)           49280     
__________



In [6]:
model_name = 'cnn_word'

# Callbacks: stop once val_loss has gone 4 epochs without improving, and
# save the model to models/<model_name>.h5 only when val_loss improves.
callback_list = [
    EarlyStopping(monitor='val_loss', patience=4),
    ModelCheckpoint(f'models/{model_name}.h5', monitor='val_loss',
                    save_best_only=True),
]

# Pass the list defined above; the original referenced the undefined name
# `callbacks_list`, which raises NameError on a fresh kernel.
history = model.fit(seq_arr, labels, validation_split=0.4,
                    epochs=25, batch_size=1024,
                    callbacks=callback_list)

Train on 659437 samples, validate on 439626 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [11]:
def _read_first_json_line(path):
    """Parse and return the JSON document stored on the first line of `path`."""
    # Imported locally because the notebook's import cell never imports json explicitly.
    import json

    with open(path, 'r') as f:
        return json.loads(f.readline())


def load_data(data_name, embedding_name):
    """Load sequences, labels, vocabulary maps and an embedding matrix from the working directory.

    NOTE: this definition replaces the `load_data` imported from `utils`
    in the first cell of the notebook.

    Parameters
    ----------
    data_name : str
        'word' or 'char'; selects the `<data_name>_*.npy` / `<data_name>_*.json` files.
    embedding_name : str or None
        'wiki' or 'glove' when `data_name` is 'word' (selects
        `word_<embedding_name>_embeddings.npy`). Ignored for 'char'.

    Returns
    -------
    tuple
        (seq_arr, test_seq_arr, labels, word_index, index_word, vs, embedding_matrix)
        where `word_index` maps token -> int, `index_word` maps int -> token and
        `vs` is `len(word_index)`. For 'char' the embedding matrix is an all-zero
        array of shape (vs, 100).

    Raises
    ------
    ValueError
        If `data_name` is not 'word'/'char', or if `data_name` is 'word' and
        `embedding_name` is not 'wiki'/'glove'. (The original fell through to the
        return statement and failed there with an unbound-variable error.)
    """
    if data_name not in ('word', 'char'):
        raise ValueError(f"data_name must be 'word' or 'char', got {data_name!r}")
    if data_name == 'word' and embedding_name not in ('wiki', 'glove'):
        raise ValueError(
            f"embedding_name must be 'wiki' or 'glove' for word data, got {embedding_name!r}")

    seq_arr = np.load(f'{data_name}_sequences.npy')
    test_seq_arr = np.load(f'test_{data_name}_sequences.npy')
    labels = np.load(f'{data_name}_labels.npy')

    # JSON object keys are always strings, so the integer side of each map is cast back.
    raw_index_word = _read_first_json_line(f'{data_name}_index_word.json')
    index_word = {int(key): word for key, word in raw_index_word.items()}

    raw_word_index = _read_first_json_line(f'{data_name}_word_index.json')
    word_index = {word: int(index) for word, index in raw_word_index.items()}

    vs = len(word_index)
    # NOTE(review): if the indices are 1-based, a matrix with `vs` rows has no row
    # for index `vs` -- confirm against how the index files were produced.

    if data_name == 'word':
        embedding_matrix = np.load(f'word_{embedding_name}_embeddings.npy')
    else:
        # Character data gets an all-zero (vs, 100) matrix instead of a loaded one.
        embedding_matrix = np.zeros((vs, 100))

    return seq_arr, test_seq_arr, labels, word_index, index_word, vs, embedding_matrix

In [12]:
# Switch to the character-level data; this rebinds every name the word-level run used.
(seq_arr, test_seq_arr, labels,
 word_index, index_word, vs, embedding_matrix) = load_data('char', embedding_name=None)

In [13]:
# Input: one row of integer character ids per question, as wide as the padded sequences.
sequence_input = Input(shape=(seq_arr.shape[1],), dtype='int32')

# The character embedding_matrix returned by load_data is all zeros, so it is
# used only for its shape. Passing it as initial weights makes every ReLU
# pre-activation exactly 0 (biases start at zero too), which blocks the gradient
# for everything except the final bias. Keras' default random initialiser is
# used instead.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            name='embedding')

embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D -> MaxPooling1D stages.
l_cov1 = Conv1D(128, 2, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(2)(l_cov1)

l_cov2 = Conv1D(128, 2, activation='relu')(l_pool1)
# Pool size 1 leaves the sequence length unchanged (73 -> 73 in the summary below).
l_pool2 = MaxPooling1D(1)(l_cov2)

l_cov3 = Conv1D(128, 3, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(11)(l_cov3)

# Dense classification head with dropout and a single sigmoid output.
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)

l_dense = Dropout(0.5)(l_dense)
preds = Dense(1, activation='sigmoid')(l_dense)

# `outputs` is the Keras 2 keyword; the original `output` spelling is the legacy Keras 1 name.
model = Model(inputs=[sequence_input], outputs=[preds])

model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy',
              metrics=['binary_crossentropy', 'accuracy', f1])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 100)          32300     
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 149, 128)          25728     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 74, 128)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 73, 128)           32896     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 73, 128)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 71, 128)           49280     
__________



In [15]:
model_name = 'cnn_char'

# Callbacks: stop once val_loss has gone 4 epochs without improving, and
# save the model to models/<model_name>.h5 only when val_loss improves.
callback_list = [
    EarlyStopping(monitor='val_loss', patience=4),
    ModelCheckpoint(f'models/{model_name}.h5', monitor='val_loss', save_best_only=True),
]

# Train for up to 25 epochs with 40% of the data held out for validation.
history = model.fit(
    seq_arr,
    labels,
    validation_split=0.4,
    epochs=25,
    batch_size=1024,
    callbacks=callback_list,
)

Train on 708671 samples, validate on 472448 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25


In [18]:
# Input: one row of integer character ids per question, as wide as the padded sequences.
sequence_input = Input(shape=(seq_arr.shape[1],), dtype='int32')

# The character embedding_matrix returned by load_data is all zeros, so it is
# used only for its shape. Passing it as initial weights makes every ReLU
# pre-activation exactly 0 (biases start at zero too), which blocks the gradient
# for everything except the final bias. Keras' default random initialiser is
# used instead.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            name='embedding')

embedded_sequences = embedding_layer(sequence_input)

# Wider convolution stack than the previous character model: 256 -> 512 -> 256 filters.
l_cov1 = Conv1D(256, 2, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(2)(l_cov1)

l_cov2 = Conv1D(512, 2, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(2)(l_cov2)

l_cov3 = Conv1D(256, 3, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(5)(l_cov3)

# Two dense layers, each followed by dropout, then a single sigmoid output.
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
l_dense = Dropout(0.5)(l_dense)

l_dense = Dense(64, activation='relu')(l_dense)
l_dense = Dropout(0.5)(l_dense)

preds = Dense(1, activation='sigmoid')(l_dense)

# `outputs` is the Keras 2 keyword; the original `output` spelling is the legacy Keras 1 name.
model = Model(inputs=[sequence_input], outputs=[preds])

model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy',
              metrics=['binary_crossentropy', 'accuracy', f1])

model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 100)          32300     
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 149, 256)          51456     
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 74, 256)           0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 73, 512)           262656    
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 36, 512)           0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 34, 256)           393472    
__________

In [None]:
# NOTE(review): this is the same model_name, and therefore the same checkpoint
# path, as the earlier character-model training cell. The checkpoint written by
# that run will be overwritten -- confirm this is intended.
model_name = 'cnn_char'

# Callbacks: stop once val_loss has gone 5 epochs without improving, and
# save the model to models/<model_name>.h5 only when val_loss improves.
callback_list = [
    EarlyStopping(monitor='val_loss', patience=5),
    ModelCheckpoint(f'models/{model_name}.h5', monitor='val_loss', save_best_only=True),
]

# Train for up to 25 epochs with 40% of the data held out for validation.
history = model.fit(
    seq_arr,
    labels,
    validation_split=0.4,
    epochs=25,
    batch_size=1024,
    callbacks=callback_list,
)

Train on 708671 samples, validate on 472448 samples
Epoch 1/25
  7168/708671 [..............................] - ETA: 2:34 - loss: 0.6920 - binary_crossentropy: 0.6920 - acc: 0.9478 - f1: 0.0000e+00