In [1]:
import mxnet as mx
import gluonnlp as nlp

# Local Libraries
# Load Model locally
import textcnn
# Load helper functions
import utils

ctx = mx.gpu()

[nltk_data] Downloading package punkt to /home/andres/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/andres/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
bert_model, vocab = nlp.model.get_model(
    'bert_12_768_12',
    dataset_name='book_corpus_wiki_en_uncased',
    use_classifier=False,
    use_decoder=False,
    ctx=ctx)



In [3]:
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False)
sample = transform(["man"])
words, valid_len, segments = mx.nd.array([sample[0]], ctx=ctx), mx.nd.array([sample[1]], ctx=ctx), mx.nd.array([sample[2]], ctx=ctx)
seq_encoding, _ = bert_model(words, segments, valid_len)

In [4]:
embed_size = seq_encoding.shape[2]
print(embed_size)

768


In [5]:
# Loading the dataset
full_train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
                               for segment in ('train', 'test')]

# Dataset Sizes
print("Size of Train Set:", len(full_train_dataset), " (no valid split yet)")
print("Size of Test  Set:", len(test_dataset))

Size of Train Set: 25000  (no valid split yet)
Size of Test  Set: 25000


In [6]:
print(test_dataset[0][0])

I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge.


In [7]:
# Filtering for size
# Starting data samples are positive reviews, and ending data samples are negative reviews
# Retrieving a balanced sample of both sentiments
full_train_dataset = full_train_dataset[:3125] + full_train_dataset[-3125:]
test_dataset = test_dataset[:625] + test_dataset[-625:]
print("Size of Train Set (Post-filter):", len(full_train_dataset), " (no valid split yet)")
print("Size of Test  Set (Post-filter):", len(test_dataset))

Size of Train Set (Post-filter): 6250  (no valid split yet)
Size of Test  Set (Post-filter): 1250


In [8]:
# Dataset Split 40/10/50
# Test dataset at 50% is given by library, validation dataset is 20% of train dataset
from sklearn.model_selection import train_test_split

train_dataset, val_dataset = train_test_split(full_train_dataset, test_size=int(0.2*len(full_train_dataset)), random_state=42)

In [9]:
# Final Dataset Sizes
print("Size of Train Set:", len(train_dataset))
print("Size of Val   Set:", len(val_dataset))
print("Size of Test  Set:", len(test_dataset))

Size of Train Set: 5000
Size of Val   Set: 1250
Size of Test  Set: 1250


In [10]:
# Each input in a batch, needs to have the same length
# We need to apply a batchify function to pad with zeros (if necessary)
# On the output, we just stack them together to get a batch-size array
batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=1, pad_val=0),
                                      nlp.data.batchify.Stack())

In [14]:
from importlib import reload
reload(textcnn)

# CNN with 3 parallel filters
text_cnn = textcnn.TextCNN([3, 4, 5], [embed_size, embed_size, embed_size])
text_cnn.initialize(mx.init.MSRAPrelu(), ctx)

# Formatting single input as expected for the network
direct_embeddings_test = transform([test_dataset[0][0]])
words = mx.nd.array([direct_embeddings_test[0]], ctx=ctx)
valid_len = mx.nd.array([direct_embeddings_test[1]], ctx=ctx)
segments = mx.nd.array([direct_embeddings_test[2]], ctx=ctx)
seq_output, _ = bert_model(words, segments, valid_len)
seq_output_reshaped = seq_output.transpose(axes=[0, 2, 1])

print(seq_output_reshaped.shape)

text_cnn.summary(seq_output_reshaped)

(1, 768, 179)
--------------------------------------------------------------------------------
        Layer (type)                                Output Shape         Param #
               Input                               (1, 768, 179)               0
        Activation-1                     <Symbol conv3_relu_fwd>               0
        Activation-2                               (1, 768, 177)               0
            Conv1D-3                               (1, 768, 177)         1770240
   GlobalMaxPool1D-4                                 (1, 768, 1)               0
        Activation-5                     <Symbol conv4_relu_fwd>               0
        Activation-6                               (1, 768, 176)               0
            Conv1D-7                               (1, 768, 176)         2360064
   GlobalMaxPool1D-8                                 (1, 768, 1)               0
        Activation-9                     <Symbol conv5_relu_fwd>               0
       Activat

In [16]:
review_sentiment = text_cnn(seq_output_reshaped)
# We can omit sigmoid processing, outputs of the network
# with positive values are positive reviews
if review_sentiment >= 0:
    print(review_sentiment, "The review is positive")
else:
    print(review_sentiment, "The review is negative")


[[3.7410083]]
<NDArray 1x1 @gpu(0)> The review is positive


In [17]:
# Dataset processing
train_set = []
for review, score in train_dataset:
    # Processing inputs & applying embeddings
    direct_embeddings_test = transform([review])
    words = mx.nd.array([direct_embeddings_test[0]], ctx=ctx)
    valid_len = mx.nd.array([direct_embeddings_test[1]], ctx=ctx)
    segments = mx.nd.array([direct_embeddings_test[2]], ctx=ctx)
    seq_output, _ = bert_model(words, segments, valid_len)
    embeddings = seq_output.transpose(axes=[0, 2, 1])

    # A negative review has a score <= 4
    # A positive review has a score >= 7 out of 10
    sentiment = int(score > 5)
    train_set.append((embeddings, sentiment))

In [18]:
val_set = []
for review, score in val_dataset:
    # Processing inputs & applying embeddings
    direct_embeddings_test = transform([review])
    words = mx.nd.array([direct_embeddings_test[0]], ctx=ctx)
    valid_len = mx.nd.array([direct_embeddings_test[1]], ctx=ctx)
    segments = mx.nd.array([direct_embeddings_test[2]], ctx=ctx)
    seq_output, _ = bert_model(words, segments, valid_len)
    embeddings = seq_output.transpose(axes=[0, 2, 1])
    
    # A negative review has a score <= 4
    # A positive review has a score >= 7 out of 10
    sentiment = int(score > 5)
    val_set.append((embeddings, sentiment))

In [19]:
epochs = 5
batch_size = 4

model_file_name = "bert_textcnn.params"

loss_fn = mx.gluon.loss.SigmoidBCELoss()
trainer = mx.gluon.Trainer(text_cnn.collect_params(), "adam", {"learning_rate": 0.001})

training_loss, validation_loss, validation_acc = text_cnn.train(
    loss_fn,
    trainer,
    epochs,
    batch_size,
    train_set,
    val_set,  
    batchify_fn,
    ctx,
    model_file_name)

MXNetError: Traceback (most recent call last):
  File "../src/storage/./pooled_storage_manager.h", line 161
MXNetError: cudaMalloc retry failed: out of memory

In [None]:
review_sentiment = text_cnn(seq_output_reshaped)
# We can omit sigmoid processing, outputs of the network
# with positive values are positive reviews
if review_sentiment >= 0:
    print(review_sentiment, "The review is positive")
else:
    print(review_sentiment, "The review is negative")

In [None]:
# plot losses and validation accuracy
epochs = 5

plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0, epochs), validation_loss[:epochs], label="Validation Loss")
plt.plot(np.arange(0, epochs), training_loss[:epochs], label="Training Loss")
plt.plot(np.arange(0, epochs), validation_acc[:epochs], label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(loc="upper right")
plt.title("Losses")
plt.show()

In [None]:
# Quantitative Evaluation on Test Set
test_set = []

# Limiting test set size to 5000 samples for memory constrains
# It is encouraged to increase this value up to 25000 (actual size of test set)
# if your memory permits
for review, score in test_dataset:
    # Processing inputs & applying embeddings
    direct_embeddings_test = transform([review])
    words = mx.nd.array([direct_embeddings_test[0]], ctx=ctx)
    valid_len = mx.nd.array([direct_embeddings_test[1]], ctx=ctx)
    segments = mx.nd.array([direct_embeddings_test[2]], ctx=ctx)
    seq_output, _ = bert_model(words, segments, valid_len)
    embeddings = seq_output.transpose(axes=[0, 2, 1])
    
    # A negative review has a score <= 4
    # A positive review has a score >= 7 out of 10
    sentiment = int(score > 5)
    test_set.append((embeddings, sentiment))

In [None]:
# Processing Test dataset for confussion matrix
num_test_batches = len(test_set) / batch_size

# Iterator on Test dataset
test_data_iterator = mx.gluon.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    batchify_fn=batchify_fn)

In [None]:
# Confussion Matrix Computation (Test set)
class_outputs = mx.nd.empty(shape=(len(test_set),))
labels = mx.nd.empty(shape=(len(test_set),))

test_acc = mx.metric.Accuracy()
cumulative_test_loss = 0

for index, (data, label) in enumerate(tqdm(test_data_iterator)):
    
    # Add labels for Confussion Matrix
    labels[index * batch_size:(index + 1) * batch_size] = label
                
    # Processing data from data iterator
    data_np = data.as_np_ndarray().as_in_context(ctx)
    label_np = label.as_np_ndarray().as_in_context(ctx)

    output_np = text_cnn(data_np)
    test_loss = loss_fn(output_np, label_np)
    current_test_loss = mx.np.mean(test_loss)
    cumulative_test_loss += current_test_loss / num_test_batches

    # Accuracy
    # Comparison between  labels and values output
    # Applying threshold for binary classification
    # No sigmoid necessary as outputs of the network
    # with positive values are positive reviews
    class_output = (output_np.as_nd_ndarray() >= 0).astype("uint8").transpose()
    class_outputs[index * batch_size:(index + 1) * batch_size] = class_output
    test_acc.update(label.as_in_context(ctx), class_output[0])

test_acc_value = test_acc.get()[1]
print("Final Test Accuracy:", test_acc_value)

In [None]:
# Plot the CM
confusion_matrix = metrics.confusion_matrix(labels.asnumpy(), class_outputs.asnumpy())
disp = metrics.ConfusionMatrixDisplay(confusion_matrix)
disp.plot()