### Imports

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

from torchtext.vocab import Vocab

from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 

import pandas as pd
import numpy as np
from numpy import log, sqrt

from tqdm import tqdm
from matplotlib import pyplot as plt

import time
from collections import Counter

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

### Read training, dev and unlabeled test data

The following starter code (Python 3) shows how to read the labeled training and dev cipher text, and the unlabeled test cipher text, into lists.

In [3]:
# Tab-separated input files, given as relative paths.
TRAIN_PATH = 'data/train_enc.tsv'           # training split: label + content columns
DEV_PATH = 'data/dev_enc.tsv'               # dev split: label + content columns
TEST_PATH = 'data/test_enc_unlabeled.tsv'   # test split: content column only

In [4]:
def _read_tsv(path, columns):
    """Read a tab-separated file and return ``(frame, records)``.

    ``columns`` supplies the column names; ``records`` is the frame converted
    to a list of NumPy records (one per row, index dropped).
    """
    frame = pd.read_csv(path, sep='\t', names=columns)
    records = list(frame.to_records(index=False))
    return frame, records


# Labelled splits: each record is (label, content).
train_df, train_result = _read_tsv(TRAIN_PATH, ['label', 'content'])
num_train = len(train_result)

val_df, val_result = _read_tsv(DEV_PATH, ['label', 'content'])
num_val = len(val_result)

# Unlabelled split: each record is (content,).
test_df, test_result = _read_tsv(TEST_PATH, ['content'])

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes here. We will use your predictions on the unlabeled test data for grading, and we will check this part to understand how your method produced the predictions.

#### Positional Encoding, Transformer Model, and Recurrent (GRU) Model

In [5]:
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    Math:
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        where pos is the word position and i is the embed idx
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i/d_model), computed in log space. The exponent must be
        # negative so that `position * div_term` equals pos / 10000^(2i/d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns;
        # for an even d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # Buffer: saved and moved with the module, but not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
        # Add the encoding of the first `sequence length` positions, then apply dropout.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Transformer-encoder classifier over token-id sequences.

    Token ids are embedded (initialised from ``embed`` and fine-tuned), scaled by
    sqrt(embed_dim), combined with sinusoidal positional encodings, passed through
    a stack of ``nlayers`` transformer encoder layers, flattened, and projected to
    ``n_class`` scores by a linear layer.

    Args:
        n_class: number of output classes.
        embed: 2-D tensor of pretrained embeddings; ``embed.shape[1]`` is the model dim.
        nhead: number of attention heads.
        nhid: feed-forward dimension inside each encoder layer.
        nlayers: number of encoder layers.
        max_seq: sequence length the linear decoder is sized for (default=15).
            Inputs must have exactly this many tokens, since the decoder takes
            ``embed_dim * max_seq`` input features.
        dropout: dropout used by the positional encoder and encoder layers (default=0.5).
    """

    def __init__(self, n_class, embed, nhead, nhid, nlayers, max_seq=15, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError:
            # Only a failed import is translated; any other error propagates unchanged.
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.src_mask = None
        self.ninp = embed.shape[1]

        self.pos_encoder = PositionalEncoding(self.ninp, dropout)
        encoder_layers = TransformerEncoderLayer(self.ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = torch.nn.Embedding.from_pretrained(embed, freeze=False)
        self.decoder = nn.Linear(self.ninp*max_seq, n_class)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) additive causal mask: 0.0 on and below the diagonal, -inf above."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        """Initialise the decoder: zero bias, uniform(-0.1, 0.1) weights.

        The embedding is left untouched so the pretrained vectors are kept.
        """
        initrange = 0.1
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=False):
        """Classify a batch of token-id sequences.

        Args:
            src: token ids, batch-first ``(batch_size, seq_length)`` as produced
                by ``pad_sequence(..., batch_first=True)``.
            has_mask: when True, apply a causal (subsequent-position) attention mask.
        Returns:
            scores: ``(batch_size, n_classes)``
            word_att: encoder output, ``(batch_size, seq_length, embed_dim)``
        """
        seq_len = src.size(1)
        if has_mask:
            device = src.device
            # The mask covers sequence positions, so it is sized by seq_length;
            # rebuild it when the length or the device changes.
            if (self.src_mask is None
                    or self.src_mask.size(0) != seq_len
                    or self.src_mask.device != device):
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(device)
        else:
            self.src_mask = None
        src = self.encoder(src.long()) * sqrt(self.ninp)        # (batch_size, seq_length, embed_dim)
        # The positional encoder and the encoder stack (built without batch_first)
        # expect (seq_length, batch_size, embed_dim).
        src = src.transpose(0, 1)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = output.transpose(0, 1)                         # back to (batch_size, seq_length, embed_dim)
        word_att = output
        # reshape (not view): the transposed tensor is not contiguous.
        scores = self.decoder(output.reshape(output.size(0), -1))  # (batch_size, seq_length * embed_dim) -> (batch_size, n_classes)
        return scores, word_att  # (batch_size, n_classes) & (batch_size, seq_length, emb_dim)
    
class RNN(nn.Module):
    """Recurrent text classifier: embedding -> single-layer GRU -> dropout -> linear -> ReLU.

    Args:
        embed: 2-D tensor of pretrained embeddings ``(vocab_size, embed_dim)``;
            used to initialise the embedding layer, which is fine-tuned.
        hidden_dim1: GRU hidden size.
        n_class: number of output classes.
        dropout: dropout probability applied to the last GRU output (default=0).
    """
    def __init__(self, embed, hidden_dim1,  n_class, dropout=0):
        super(RNN, self).__init__()
        embed_dim = embed.shape[1]
        self.embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False)
        # Only the GRU was ever used: the previous code built an nn.LSTM first and
        # immediately overwrote it with this GRU.
        self.rnn = nn.GRU(embed_dim, hidden_dim1, batch_first=True, num_layers=1)
        self.fc1 = nn.Linear(hidden_dim1, n_class)
        self.drop = nn.Dropout(dropout)
        self.act = nn.ReLU()

    def forward(self, text):
        """Return class scores of shape ``(batch_size, n_class)`` for token ids ``(batch_size, seq_length)``."""
        x = self.embedding(text)
        out, ht = self.rnn(x)
        # Read out the final time step of the (batch-first) GRU output.
        # NOTE(review): for end-padded batches this position is padding for the
        # shorter sequences -- confirm this is intended.
        out = self.drop(out[:, -1, :])
        out = self.fc1(out)
        # NOTE(review): ReLU clamps the scores to be non-negative before they reach
        # the loss; kept as in the original so trained behaviour is unchanged.
        return self.act(out)

#### Utility Functions

In [6]:
def collate_fn1(batch): # Train
    """Collate labelled ``(label, text)`` records into batch tensors.

    Each text is converted to token ids with the module-level ``text_pipeline``;
    the id sequences are zero-padded to a common length (batch first).

    Returns:
        ``(labels, texts)``: int64 tensors, both moved to the module-level ``device``.
    """
    labels = [label_ for (label_, text_) in batch]
    encoded = [
        torch.tensor(text_pipeline(text_), dtype=torch.int64)
        for (label_, text_) in batch
    ]
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    padded = torch.nn.utils.rnn.pad_sequence(encoded, batch_first=True)
    return label_tensor.to(device), padded.to(device)

def collate_fn2(batch):# Test
    """Collate unlabelled records into one padded int64 tensor of token ids.

    The text is the first field of each record. It is converted with the
    module-level ``text_pipeline``, zero-padded (batch first) and moved to the
    module-level ``device``.
    """
    encoded = [
        torch.tensor(text_pipeline(record[0]), dtype=torch.int64)
        for record in batch
    ]
    padded = torch.nn.utils.rnn.pad_sequence(encoded, batch_first=True)
    return padded.to(device)

def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

#### Pre-processing and hyperparameter tuning

In [7]:
# Build the vocabulary from the training text (tokens seen at least twice) and
# attach pre-trained GloVe vectors to it.
tokenizer = RegexpTokenizer(r'\w+')

counter = Counter()

l = []  # tokenised training texts, one token list per row
for title in train_df['content']:
    tokens = tokenizer.tokenize(title)
    counter.update(tokens)
    l.append(tokens)
vocab = Vocab(counter, min_freq=2, vectors='glove.6B.300d')
embed = vocab.vectors

# hyper-parameter
# The collate functions move every batch to `device`, so the model must live there
# too. It is moved before the optimizer is built so the optimizer tracks the
# parameters that are actually trained.
model = RNN(embed, hidden_dim1=64, n_class=5, dropout = 0.5).to(device)
lr = 0.001
epoch = 20
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.1 after epochs 3, 6 and 10.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, gamma=0.1, milestones=[3,6,10])


def text_pipeline(x):
    """Tokenise ``x`` and map every token to its vocabulary index."""
    return [vocab[token] for token in tokenizer.tokenize(x)]

#### Training and Validation

In [8]:
# Batch up the training and dev data: 4 examples per batch, both collated with
# the labelled collate function. Only the training loader shuffles.
_loader_kwargs = dict(batch_size=4, collate_fn=collate_fn1)
train_loader = torch.utils.data.DataLoader(train_result, shuffle=True, **_loader_kwargs)
val_loader = torch.utils.data.DataLoader(val_result, shuffle=False, **_loader_kwargs)

In [9]:
loss_fn = nn.CrossEntropyLoss()
# Per-epoch history; one value is appended to each list at the end of every epoch.
epoch_train_loss = []
epoch_train_acc = []
epoch_val_loss = []
epoch_val_acc = []

for epoch_idx in range(1, epoch+1):
    print('==== epoch {} ===='.format(epoch_idx))
    epoch_start = time.time()
    accurate_count = 0
    batch_loss = 0.0

    model.train()
    for label, text in tqdm(train_loader):
        optimizer.zero_grad()

        # forward
        out = model(text)

        # cal loss
        loss = loss_fn(out, label)
        # .item() yields a plain float: the running sum does not hold on to the
        # autograd graph and works regardless of the device the loss lives on.
        batch_loss += loss.item()

        # Backpropagation (BP)
        loss.backward()
        optimizer.step()

        # calculate accuracy
        _, predicted = torch.max(out, 1)
        accurate_count += (predicted == label).sum().item()
    # Mean loss per batch and accuracy in percent. The multiplication by 100
    # happens before rounding so no floating-point noise is re-introduced.
    epoch_train_loss.append(batch_loss / len(train_loader))
    epoch_train_acc.append(round(100 * accurate_count / num_train, 5))

    # validation
    val_loss = 0.0
    val_acc_count = 0
    model.eval()
    with torch.no_grad():
        for label, text in tqdm(val_loader):
            # forward
            out = model(text)

            # cal loss
            val_loss += loss_fn(out, label).item()

            # calculate accuracy
            _, predicted = torch.max(out, 1)
            val_acc_count += (predicted == label).sum().item()

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()
    epoch_val_loss.append(val_loss / len(val_loader))
    epoch_val_acc.append(round(100 * val_acc_count / num_val, 5))
    print('================================')
    print('training loss: {}'.format(epoch_train_loss[-1]))
    print('training acc: {}%'.format(epoch_train_acc[-1]))
    print('validation loss: {}'.format((epoch_val_loss[-1])))
    print('validation acc: {}%'.format(epoch_val_acc[-1]))

    per_epoch_time = time.time() - epoch_start
    print('train and test cost {} seconds'.format(per_epoch_time))
print('\nFinished.')

  0%|          | 3/4055 [00:00<02:55, 23.04it/s]

==== epoch 1 ====


100%|██████████| 4055/4055 [01:59<00:00, 34.00it/s]
100%|██████████| 507/507 [00:00<00:00, 526.01it/s]
  0%|          | 4/4055 [00:00<02:00, 33.72it/s]

training loss: 0.5602561339203145
training acc: 69.92602%
validation loss: 0.3787185097116926
validation acc: 83.86778%
train and test cost 120.22848081588745 seconds
==== epoch 2 ====


100%|██████████| 4055/4055 [02:12<00:00, 30.55it/s]
100%|██████████| 507/507 [00:00<00:00, 525.74it/s]
  0%|          | 4/4055 [00:00<02:07, 31.69it/s]

training loss: 0.26362572610010787
training acc: 90.25277%
validation loss: 0.33943113067446373
validation acc: 86.28515%
train and test cost 135.31686305999756 seconds
==== epoch 3 ====


100%|██████████| 4055/4055 [02:06<00:00, 32.04it/s]
100%|██████████| 507/507 [00:00<00:00, 523.13it/s]
  0%|          | 4/4055 [00:00<02:04, 32.47it/s]

training loss: 0.16659718241850338
training acc: 94.18619%
validation loss: 0.3473000912036181
validation acc: 88.01183999999999%
train and test cost 129.2291579246521 seconds
==== epoch 4 ====


100%|██████████| 4055/4055 [02:17<00:00, 29.42it/s]
100%|██████████| 507/507 [00:01<00:00, 499.00it/s]
  0%|          | 4/4055 [00:00<02:04, 32.65it/s]

training loss: 0.09028265373333462
training acc: 96.9852%
validation loss: 0.4106502796301005
validation acc: 88.20918%
train and test cost 140.55921506881714 seconds
==== epoch 5 ====


100%|██████████| 4055/4055 [02:37<00:00, 25.77it/s]
100%|██████████| 507/507 [00:01<00:00, 446.09it/s]


training loss: 0.07987382655961005
training acc: 97.21948%
validation loss: 0.42200122359236314
validation acc: 88.40651000000001%
train and test cost 160.27647590637207 seconds
==== epoch 6 ====


100%|██████████| 4055/4055 [02:16<00:00, 29.77it/s]
100%|██████████| 507/507 [00:00<00:00, 518.67it/s]
  0%|          | 4/4055 [00:00<02:03, 32.80it/s]

training loss: 0.07059125229991908
training acc: 97.43527%
validation loss: 0.43676971495739336
validation acc: 88.55451%
train and test cost 139.18735790252686 seconds
==== epoch 7 ====


100%|██████████| 4055/4055 [02:10<00:00, 30.98it/s]
100%|██████████| 507/507 [00:00<00:00, 511.29it/s]
  0%|          | 3/4055 [00:00<02:16, 29.65it/s]

training loss: 0.06057963153730926
training acc: 97.78052000000001%
validation loss: 0.4424288597332655
validation acc: 88.55451%
train and test cost 133.557137966156 seconds
==== epoch 8 ====


100%|██████████| 4055/4055 [02:19<00:00, 29.09it/s]
100%|██████████| 507/507 [00:01<00:00, 491.34it/s]
  0%|          | 4/4055 [00:00<02:08, 31.43it/s]

training loss: 0.06041657886434572
training acc: 97.78052000000001%
validation loss: 0.447921060716377
validation acc: 88.60385%
train and test cost 142.33232283592224 seconds
==== epoch 9 ====


100%|██████████| 4055/4055 [02:17<00:00, 29.44it/s]
100%|██████████| 507/507 [00:01<00:00, 492.93it/s]
  0%|          | 3/4055 [00:00<02:21, 28.61it/s]

training loss: 0.05861143965903399
training acc: 97.78052000000001%
validation loss: 0.45335216635077663
validation acc: 88.65317999999999%
train and test cost 140.59218788146973 seconds
==== epoch 10 ====


100%|██████████| 4055/4055 [02:14<00:00, 30.23it/s]
100%|██████████| 507/507 [00:01<00:00, 490.08it/s]
  0%|          | 4/4055 [00:00<02:04, 32.65it/s]

training loss: 0.05694825799191777
training acc: 97.84834000000001%
validation loss: 0.45840749044625245
validation acc: 88.60385%
train and test cost 137.03507614135742 seconds
==== epoch 11 ====


100%|██████████| 4055/4055 [02:14<00:00, 30.19it/s]
100%|██████████| 507/507 [00:00<00:00, 519.97it/s]
  0%|          | 3/4055 [00:00<02:19, 29.07it/s]

training loss: 0.056858556632195974
training acc: 97.89149%
validation loss: 0.45886931710929796
validation acc: 88.65317999999999%
train and test cost 137.09126591682434 seconds
==== epoch 12 ====


100%|██████████| 4055/4055 [02:13<00:00, 30.33it/s]
100%|██████████| 507/507 [00:01<00:00, 497.40it/s]
  0%|          | 3/4055 [00:00<03:21, 20.12it/s]

training loss: 0.05649235698356581
training acc: 97.9963%
validation loss: 0.4593448676536304
validation acc: 88.60385%
train and test cost 136.50937414169312 seconds
==== epoch 13 ====


100%|██████████| 4055/4055 [02:19<00:00, 29.06it/s]
100%|██████████| 507/507 [00:00<00:00, 511.02it/s]
  0%|          | 3/4055 [00:00<02:19, 29.05it/s]

training loss: 0.05568555650816893
training acc: 97.89766%
validation loss: 0.4597881249422152
validation acc: 88.60385%
train and test cost 142.40285897254944 seconds
==== epoch 14 ====


100%|██████████| 4055/4055 [02:11<00:00, 30.86it/s]
100%|██████████| 507/507 [00:00<00:00, 525.83it/s]
  0%|          | 3/4055 [00:00<02:16, 29.60it/s]

training loss: 0.057272944703202065
training acc: 97.92232%
validation loss: 0.46013498400325137
validation acc: 88.65317999999999%
train and test cost 134.20865511894226 seconds
==== epoch 15 ====


100%|██████████| 4055/4055 [02:21<00:00, 28.64it/s]
100%|██████████| 507/507 [00:00<00:00, 510.39it/s]
  0%|          | 3/4055 [00:00<02:18, 29.34it/s]

training loss: 0.058204840908215164
training acc: 97.82366999999999%
validation loss: 0.46054775380994206
validation acc: 88.65317999999999%
train and test cost 144.2464029788971 seconds
==== epoch 16 ====


100%|██████████| 4055/4055 [02:18<00:00, 29.34it/s]
100%|██████████| 507/507 [00:00<00:00, 518.75it/s]
  0%|          | 4/4055 [00:00<02:02, 33.19it/s]

training loss: 0.057144172564975726
training acc: 97.873%
validation loss: 0.46094132222132334
validation acc: 88.65317999999999%
train and test cost 141.12920808792114 seconds
==== epoch 17 ====


100%|██████████| 4055/4055 [02:12<00:00, 30.51it/s]
100%|██████████| 507/507 [00:01<00:00, 494.28it/s]
  0%|          | 3/4055 [00:00<02:38, 25.56it/s]

training loss: 0.057279842202672245
training acc: 97.836%
validation loss: 0.4613965277135725
validation acc: 88.65317999999999%
train and test cost 135.63008213043213 seconds
==== epoch 18 ====


100%|██████████| 4055/4055 [02:20<00:00, 28.76it/s]
100%|██████████| 507/507 [00:01<00:00, 489.15it/s]
  0%|          | 4/4055 [00:00<02:13, 30.30it/s]

training loss: 0.05668388018625732
training acc: 97.99014%
validation loss: 0.46173396665433925
validation acc: 88.65317999999999%
train and test cost 143.87544798851013 seconds
==== epoch 19 ====


100%|██████████| 4055/4055 [02:26<00:00, 27.77it/s]
100%|██████████| 507/507 [00:01<00:00, 453.64it/s]
  0%|          | 3/4055 [00:00<02:17, 29.44it/s]

training loss: 0.056708689359201606
training acc: 97.89766%
validation loss: 0.46220211352587215
validation acc: 88.70251999999999%
train and test cost 149.07496786117554 seconds
==== epoch 20 ====


100%|██████████| 4055/4055 [04:32<00:00, 14.89it/s]
100%|██████████| 507/507 [00:03<00:00, 165.63it/s]


training loss: 0.05569202503058531
training acc: 97.92232%
validation loss: 0.4625662779196715
validation acc: 88.70251999999999%
train and test cost 277.3453950881958 seconds

Finished.





#### Prediction on Test Data

In [10]:
# Batch up test data (one example per batch, so no padding is added)
test_loader = torch.utils.data.DataLoader(test_result, batch_size=1, shuffle=False, collate_fn=collate_fn2)

test_output = []
# Put the model in inference mode explicitly instead of relying on the state the
# training cell happened to leave it in.
model.eval()
with torch.no_grad():
    for input_tensor in tqdm(test_loader):
        # forward
        out = model(input_tensor)
        _, predicted = torch.max(out, 1)
        # tolist() gives plain Python ints, one per example in the batch
        test_output.extend(predicted.tolist())

results = test_output

100%|██████████| 2028/2028 [00:02<00:00, 883.43it/s]


### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines; each line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [11]:
# `results` is expected to hold the model's predictions on the 2028 test cases
# read from test_enc_unlabeled.tsv, one entry per test case.
n_results = len(results)
assert n_results == 2028

In [12]:
# make sure the results are not float numbers, but integers 0 and 1
results = list(map(int, results))

In [13]:
# write the prediction results, one per line, to
# 'upload_predictions_transformer_pytorch.txt' and upload that later
with open('upload_predictions_transformer_pytorch.txt', 'w', encoding = 'utf-8') as fp:
    fp.writelines(str(x) + '\n' for x in results)

In [14]:
# REFERENCES:
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
# https://atheros.ai/blog/text-classification-with-transformers-in-tensorflow-2