### Imports

In [1]:
import os
import collections
import re
import math

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import nltk
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

Instructions for updating:
non-resource variables are not supported in the long term


### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [2]:
TRAIN_PATH = './train_enc.tsv'
DEV_PATH = './dev_enc.tsv'
TEST_PATH = './test_enc_unlabeled.tsv'

In [3]:
df_train = pd.read_csv(TRAIN_PATH, sep='\t', names=['label', 'content'])
df_valid = pd.read_csv(DEV_PATH, sep='\t', names=['label', 'content'])
df_test = pd.read_csv(TEST_PATH, sep='\t', names=['content'])

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

#### Word RNN Model

In [4]:
class WordCNN(object):
    def __init__(self, vocabulary_size, document_max_len, num_class):
        self.embedding_size = 128
        self.learning_rate = 1e-3
        self.filter_sizes = [3, 4, 5]
        self.num_filters = 100

        self.x = tf.placeholder(tf.int32, [None, document_max_len], name="x")
        self.y = tf.placeholder(tf.int32, [None], name="y")
        self.is_training = tf.placeholder(tf.bool, [], name="is_training")
        self.global_step = tf.Variable(0, trainable=False)
        self.keep_prob = tf.where(self.is_training, 0.5, 1.0)

        with tf.name_scope("embedding"):
            init_embeddings = tf.random_uniform([vocabulary_size, self.embedding_size])
            self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings)
            self.x_emb = tf.nn.embedding_lookup(self.embeddings, self.x)
            self.x_emb = tf.expand_dims(self.x_emb, -1)

        pooled_outputs = []
        for filter_size in self.filter_sizes:
            conv = tf.layers.conv2d(
                self.x_emb,
                filters=self.num_filters,
                kernel_size=[filter_size, self.embedding_size],
                strides=(1, 1),
                padding="VALID",
                activation=tf.nn.relu)
            pool = tf.layers.max_pooling2d(
                conv,
                pool_size=[document_max_len - filter_size + 1, 1],
                strides=(1, 1),
                padding="VALID")
            pooled_outputs.append(pool)

        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, self.num_filters * len(self.filter_sizes)])

        with tf.name_scope("dropout"):
            h_drop = tf.nn.dropout(h_pool_flat, self.keep_prob)

        with tf.name_scope("output"):
            self.logits = tf.layers.dense(h_drop, num_class, activation=None)
            self.predictions = tf.argmax(self.logits, -1, output_type=tf.int32)
            self.yhat = self.predictions

        with tf.name_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y))
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

#### Dataset Utility Functions

In [5]:
def clean_str(text):
    text = re.sub(r"\s{2,}", " ", text)
    text = text[:-1].strip() if text[-1] == "." else text
    return text

In [6]:
def build_word_dict():
    
    contents = df_train['content']
    # contents.extend(df_valid['content'])
    
    words = list()
    for content in contents:
        if type(content)==str:
            for word in word_tokenize(clean_str(content)):
                words.append(word)
                
    word_counter = collections.Counter(words)########.most_common() ##removed because parameter n not mentioned
    word_dict = dict()
    word_dict['<pad>'] = 0
    word_dict['<unk>'] = 1
    word_dict['<eos>'] = 2
    
    for word in word_counter.keys():
        word_dict[word] = len(word_dict)
    
    return word_dict

In [13]:
def build_word_dataset(step, word_dict, word_max_len):
    if step == "test":
        df = df_test.copy()
        df = df.sample(frac=1)
        
        x = list(map(lambda d: word_tokenize(clean_str(d)), df['content']))
        x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict['<unk>']), d)), x))
        x = list(map(lambda d: d + [word_dict['<eos>']], x))
        x = list(map(lambda d: d[:word_max_len], x))
        x = list(map(lambda d: d + (word_max_len - len(d)) * [word_dict['<pad>']], x))
        
        y = [0] * len(x)
        
        return x, y
        
    else:
        if step == "train":
            df = df_train.copy()
        elif step == "valid":
            df = df_valid.copy()
        df = df.sample(frac=1)
        
        x = list(map(lambda d: word_tokenize(clean_str(d)), df['content']))
        x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict['<unk>']), d)), x))
        x = list(map(lambda d: d + [word_dict['<eos>']], x))
        x = list(map(lambda d: d[:word_max_len], x))
        x = list(map(lambda d: d + (word_max_len - len(d)) * [word_dict['<pad>']], x))

        y = list(df['label'])

        return x, y

In [8]:
def batch_iter(inputs, outputs, batch_size, num_epochs):
    inputs = np.array(inputs)
    outputs = np.array(outputs)
    
    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]
            
def batch_iter_testing(inputs, batch_size, num_epochs):
    inputs = np.array(inputs)
    
    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index]

#### Training and Validation

In [9]:
NUM_CLASS = 14
BATCH_SIZE = 64
NUM_EPOCHS = 20
WORD_MAX_LEN = 100
# CHAR_MAX_LEN = 1014

In [10]:
word_dict = build_word_dict()
vocabulary_size = len(word_dict)
# print(vocabulary_size)

train_x, train_y = build_word_dataset("train", word_dict, WORD_MAX_LEN)
valid_x, valid_y = build_word_dataset("valid", word_dict, WORD_MAX_LEN)

######## Should we create word_dict with both train and dev? Can combine dev and train to train on both together and then cross validation

In [11]:
with tf.Session() as sess:
    model = WordCNN(vocabulary_size, WORD_MAX_LEN, NUM_CLASS)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())

    train_batches = batch_iter(train_x, train_y, BATCH_SIZE, NUM_EPOCHS)
    num_batches_per_epoch = (len(train_x) - 1) // BATCH_SIZE + 1
    max_accuracy = 0

    for x_batch, y_batch in train_batches:
        train_feed_dict = {
            model.x: x_batch,
            model.y: y_batch,
            model.is_training: True
        }

        _, step, loss = sess.run([model.optimizer, model.global_step, model.loss], feed_dict=train_feed_dict)

        if step % 100 == 0:
            print("step {0}: loss = {1}".format(step, loss))

        if step % 2000 == 0:
            # Test accuracy with validation data for each epoch.
            valid_batches = batch_iter(valid_x, valid_y, BATCH_SIZE, 1)
            sum_accuracy, cnt = 0, 0

            for valid_x_batch, valid_y_batch in valid_batches:
                valid_feed_dict = {
                    model.x: valid_x_batch,
                    model.y: valid_y_batch,
                    model.is_training: False
                }

                accuracy = sess.run(model.accuracy, feed_dict=valid_feed_dict)
                sum_accuracy += accuracy
                cnt += 1
            valid_accuracy = sum_accuracy / cnt

            print("\nValidation Accuracy = {1}\n".format(step // num_batches_per_epoch, sum_accuracy / cnt))

            # Save model
            if valid_accuracy > max_accuracy:
                max_accuracy = valid_accuracy
                saver.save(sess, "{0}/{1}.ckpt".format("word_cnn", "word_cnn"), global_step=step)
                print("Model is saved.\n")

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
step 100: loss = 0.6865599155426025
step 200: loss = 0.6825381517410278
step 300: loss = 0.7103176116943359
step 400: loss = 0.7036516070365906
step 500: loss = 0.6873151063919067
step 600: loss = 0.6337301731109619
step 700: loss = 0.6281212568283081
step 800: loss = 0.44779330492019653
step 900: loss = 0.4462973475456238
step 1000: loss = 0.256725937128067
step 1100: loss = 0.29837557673454285
step 1200: loss = 0.19972418248653412
step 1300: loss = 0.22813834249973297
step 1400: loss = 0.34471723437309265
step 1500: loss = 0.16021543741226196
step 1600: loss = 0.20096100866794586
step 1700: loss = 0.1795511245727539
step 1800: loss = 0.21925139427185059
step 1900: loss = 0.1650944948196411
step 2000: loss = 0.13141098618507385

Validation Accuracy = 0.8530386984348297

Model is saved.

step 2100: loss = 0.16051077842712402
step 2200: loss = 0.10236914455890656
step 2300:

#### Prediction

In [14]:
BATCH_SIZE = 128

test_x, test_y = build_word_dataset("test", word_dict, WORD_MAX_LEN)
checkpoint_file = tf.train.latest_checkpoint("word_cnn")

graph = tf.Graph()
with graph.as_default():
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        x = graph.get_operation_by_name("x").outputs[0]
        y = graph.get_operation_by_name("y").outputs[0]

        is_training = graph.get_operation_by_name("is_training").outputs[0]
        accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]
        
        results = []
        
        batches = batch_iter(test_x, test_y, BATCH_SIZE, 1)
        for batch_x, batch_y in batches:
            feed_dict = {
                x: batch_x,
                y: batch_y,
                is_training: False
            }
            
            # accuracy_out = sess.run(accuracy, feed_dict=feed_dict)
            prediction = graph.get_tensor_by_name("output/ArgMax:0")
            pred = sess.run(prediction, feed_dict=feed_dict)
            results.extend(pred)
            
print(len(results))
print(results)

INFO:tensorflow:Restoring parameters from word_cnn/word_cnn.ckpt-4000
2028
[0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [15]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
# those results are in the list called 'results'
assert (len(results) == 2028)

In [16]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [17]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions_word_cnn.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')