# Importing packages

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
TRAIN_ENC_PATH = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/3_CiphertextClassification/train_enc.tsv'
DEV_ENC_PATH = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/3_CiphertextClassification/dev_enc.tsv'
TEST_ENC_PATH = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/3_CiphertextClassification/test_enc_unlabeled.tsv'

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')

torch.manual_seed(1)

Device: cuda


<torch._C.Generator at 0x7f9807de9510>

In [4]:
from tqdm.notebook import tqdm
import time

# Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [5]:
train, dev, test = [], [], []

In [6]:
for x in open(TRAIN_ENC_PATH, encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print(len(train))
print(train[:3])

16220
[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .'], [0, 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .']]


In [7]:
for x in open(DEV_ENC_PATH, encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0])
    dev.append(x)
print(len(dev))
print(dev[:3])

2027
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .'], [1, 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .']]


In [8]:
for x in open(TEST_ENC_PATH, encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print(len(test))
print(test[:3])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê .']


## Create word to id dictionary

In [9]:
word_to_ix = {'<OOV>': 0}

In [10]:
for tags, sent in train:
    for word in sent.split(' '):
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

{'<OOV>': 0, 'lkêcê': 1, 'yoúc': 2, 'cêêö': 3, 'y#êjl': 4, 'lw': 5, 'mówám': 6, 'Újám': 7, 'j': 8, 'Úêê#': 9, 'ütlk': 10, 'Úol': 11, 'lkêú': 12, 'z#ê': 13, 'ctöé8ú': 14, 'ówl': 15, 'xoóóú': 16, 'éê#xw#öê#c': 17, '.': 18, '6êcétlê': 19, 'jolêot8': 20, 'zc': 21, 'éê#xw#öjóáê': 22, ',': 23, 'tl': 24, '#jlkê#': 25, '8tcl8êcc': 26, 'jöÚ8ê': 27, '6wüó': 28, 'lkê': 29, 'öt668ê': 30, 'wx': 31, '#wj6': 32, 'ükê#ê': 33, 'lkêöjltá': 34, 't#wótêc': 35, 'j#ê': 36, 'lww': 37, 'wÚ2twoc': 38, 'jó6': 39, 'cê+oj8': 40, 'éw8tltác': 41, 'cöoy': 42, 'tx': 43, 'lktc': 44, 'kw8t6jú': 45, 'öw2tê': 46, 'tc': 47, 'coééwcê6': 48, 'Úê': 49, 'ytxl': 50, 'cwöêÚw6ú': 51, 'oóü#jééê6': 52, 'êj#8ú': 53, 'lwwm': 54, 'wol': 55, 'j88': 56, 'yww6': 57, 'cloxx': 58, '8êxl': 59, 'Úêktó6': 60, 'á#jé': 61, '(': 62, '8tlê#j88ú': 63, ')': 64, 'vocl': 65, 'ükêó': 66, 'úwo': 67, 'lktóm': 68, 'lkjl': 69, 'ê2ê#ú': 70, 'éwcctÚ8ê': 71, 'jóy8ê': 72, 'kjc': 73, 'Úêêó': 74, 'ê+kjoclê6': 75, 'Úú': 76, '6wáoöêólj#tjóc': 77, 'jówlkê#': 78, 

In [11]:
def train_prepare_sequence(seq, to_ix):
  idxs = [to_ix[w] for w in seq]
  return torch.tensor(idxs, dtype=torch.long, device=device)

In [12]:
def prepare_sequence(seq, to_ix):
  idxs = [to_ix[w] if w in to_ix else 0 for w in seq]
  return torch.tensor(idxs, dtype=torch.long, device=device)

## Creating Dataset and Dataloader

In [13]:
BATCH_SIZE = 64

In [14]:
class CipherTextDataset(Dataset):
  def __init__(self, cipher_corpus):
    self.corpus = cipher_corpus
  
  def __len__(self):
    return len(self.corpus)

  def __getitem__(self, idx):
    text = self.corpus[idx][1]
    label = self.corpus[idx][0]
    return text, label

class CipherTextTestDataset(Dataset):
  def __init__(self, cipher_corpus):
    self.corpus = cipher_corpus
  
  def __len__(self):
    return len(self.corpus)

  def __getitem__(self, idx):
    text = self.corpus[idx]
    return text

In [15]:
def train_collate_batch(batch):
  label_list, text_list, offsets = [], [], [0]
  for (_text, _label) in batch:
    label_list.append(int(_label))
    processed_text = train_prepare_sequence(_text.split(' '), word_to_ix)
    text_list.append(processed_text)
    offsets.append(processed_text.size(0))
  label_list = torch.tensor(label_list, dtype=torch.int64, device=device)
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return text_list.to(device), label_list.to(device), offsets.to(device)

def dev_collate_batch(batch):
  label_list, text_list, offsets = [], [], [0]
  for (_text, _label) in batch:
    label_list.append(int(_label))
    processed_text = prepare_sequence(_text.split(' '), word_to_ix)
    text_list.append(processed_text)
    offsets.append(processed_text.size(0))
  label_list = torch.tensor(label_list, dtype=torch.int64, device=device)
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return text_list.to(device), label_list.to(device), offsets.to(device)

def test_collate_batch(batch):
  text_list, offsets = [], [0]
  for _text in batch:
    processed_text = prepare_sequence(_text.split(' '), word_to_ix)
    text_list.append(processed_text)
    offsets.append(processed_text.size(0))
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return text_list.to(device), offsets.to(device)

In [16]:
train_dataset = CipherTextDataset(train)
dev_dataset = CipherTextDataset(dev)
test_dataset = CipherTextTestDataset(test)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_collate_batch)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=dev_collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=test_collate_batch)

### Test dataloaders

In [17]:
train_texts, train_labels, train_offsets = next(iter(train_dataloader))
print(train_texts)
print(train_labels)
print(train_texts.shape)
print(train_labels.shape)
print(train_offsets)

tensor([8532,   23, 1187,  ..., 1741, 2058,   18], device='cuda:0')
tensor([1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1], device='cuda:0')
torch.Size([1291])
torch.Size([64])
tensor([   0,   34,   38,   63,   69,   96,  106,  119,  145,  155,  187,  193,
         203,  233,  255,  268,  283,  309,  340,  362,  378,  393,  404,  410,
         419,  451,  472,  511,  533,  561,  597,  638,  656,  691,  709,  726,
         763,  786,  809,  814,  821,  854,  875,  916,  937,  951,  973,  987,
        1009, 1027, 1040, 1074, 1095, 1115, 1129, 1142, 1147, 1187, 1209, 1221,
        1237, 1252, 1270, 1286], device='cuda:0')


In [18]:
dev_texts, dev_labels, dev_offsets = next(iter(dev_dataloader))
print(dev_texts)
print(dev_labels)
print(dev_texts.shape)
print(dev_labels.shape)
print(dev_offsets)

tensor([  37, 2313,  285,  ..., 7367, 2150,   18], device='cuda:0')
tensor([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1], device='cuda:0')
torch.Size([1210])
torch.Size([64])
tensor([   0,    7,   22,   48,   76,  101,  135,  159,  182,  196,  222,  241,
         285,  306,  324,  353,  369,  374,  383,  402,  409,  418,  423,  436,
         465,  490,  524,  532,  536,  541,  562,  589,  601,  616,  644,  667,
         683,  694,  720,  731,  750,  764,  781,  800,  821,  859,  866,  875,
         884,  904,  926,  947,  951,  960,  982,  995,  999, 1016, 1028, 1074,
        1097, 1122, 1145, 1177], device='cuda:0')


In [19]:
test_texts, test_offsets = next(iter(test_dataloader))
print(test_texts)
print(test_texts.shape)
print(test_offsets)

tensor([   8, 8587,   39,  ..., 8134, 1257,   18], device='cuda:0')
torch.Size([1165])
tensor([   0,    9,   39,   58,  103,  110,  138,  160,  181,  206,  216,  220,
         236,  249,  270,  284,  304,  309,  337,  370,  379,  384,  404,  410,
         417,  444,  468,  495,  518,  534,  548,  571,  595,  602,  607,  631,
         666,  679,  709,  735,  748,  770,  781,  798,  830,  833,  844,  853,
         865,  868,  873,  907,  924,  941,  950,  978,  989,  999, 1020, 1039,
        1095, 1116, 1127, 1147], device='cuda:0')


# Creating the model

## Simple FFNN Model

In [20]:
vocab_size = len(word_to_ix)
num_class = 2
EMBEDDING_DIM = 64

EPOCHS = 75
LR = 0.5

In [21]:
class SimpleFFNNClassifier(nn.Module):
  def __init__(self, vocab_size, embed_dim, num_class):
    super(SimpleFFNNClassifier, self).__init__()
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
    self.fc = nn.Linear(embed_dim, num_class)
    self.init_weights()

  def init_weights(self):
    initrange = 0.5
    self.embedding.weight.data.uniform_(-initrange, initrange)
    self.fc.weight.data.uniform_(-initrange, initrange)
    self.fc.bias.data.zero_()

  def forward(self, text, offsets):
    embedded = self.embedding(text, offsets)
    return self.fc(embedded)

In [22]:
model = SimpleFFNNClassifier(vocab_size, EMBEDDING_DIM, num_class).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

In [23]:
with torch.no_grad():
  input_texts, input_labels, input_offsets = next(iter(train_dataloader))
  class_scores = model(input_texts, input_offsets)
  print(class_scores.shape)
  print(input_labels.shape)
  print(input_labels[0])
  print(class_scores[0].argmax(0) == input_labels[0])
  print(loss_function(class_scores, input_labels))

torch.Size([64, 2])
torch.Size([64])
tensor(0, device='cuda:0')
tensor(True, device='cuda:0')
tensor(0.6872, device='cuda:0')


### Training model

In [24]:
def train_model(dataloader):
  model.train()
  total_acc, total_count = 0, 0
  log_interval = 32
  start_time = time.time()

  for idx, (text, label, offset) in enumerate(dataloader):
    optimizer.zero_grad()
    predicted_label = model(text, offset)
    loss = loss_function(predicted_label, label)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()
    total_acc += (predicted_label.argmax(1) == label).sum().item()
    total_count += label.size(0)
    if idx % log_interval == 0 and idx > 0:
      elapsed = time.time() - start_time
      print('| epoch {:3d} | {:5d}/{:5d} batches '
            '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                        total_acc/total_count))
      total_acc, total_count = 0, 0
      start_time = time.time()

In [25]:
def evaluate_model(dataloader):
  model.eval()
  total_acc, total_count = 0, 0

  with torch.no_grad():
    for idx, (text, label, offset) in enumerate(dataloader):
      predicted_label = model(text, offset)
      loss = loss_function(predicted_label, label)
      total_acc += (predicted_label.argmax(1) == label).sum().item()
      total_count += label.size(0)
  return total_acc/total_count

In [26]:
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_model(train_dataloader)
    accu_val = evaluate_model(dev_dataloader)
    # scheduler.step()
    # if total_accu is not None and total_accu > accu_val:
    #   scheduler.step()
    # else:
    #    total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |    32/  254 batches | accuracy    0.503
| epoch   1 |    64/  254 batches | accuracy    0.513
| epoch   1 |    96/  254 batches | accuracy    0.530
| epoch   1 |   128/  254 batches | accuracy    0.525
| epoch   1 |   160/  254 batches | accuracy    0.562
| epoch   1 |   192/  254 batches | accuracy    0.564
| epoch   1 |   224/  254 batches | accuracy    0.586
-----------------------------------------------------------
| end of epoch   1 | time:  1.78s | valid accuracy    0.581 
-----------------------------------------------------------
| epoch   2 |    32/  254 batches | accuracy    0.580
| epoch   2 |    64/  254 batches | accuracy    0.599
| epoch   2 |    96/  254 batches | accuracy    0.564
| epoch   2 |   128/  254 batches | accuracy    0.592
| epoch   2 |   160/  254 batches | accuracy    0.597
| epoch   2 |   192/  254 batches | accuracy    0.611
| epoch   2 |   224/  254 batches | accuracy    0.606
-----------------------------------------------------------
| e

### Model predictions on test

In [27]:
def predict_model(dataloader):
  model.eval()

  results = torch.tensor([]).to(device)
  with torch.no_grad():
    for idx, (text, offset) in enumerate(dataloader):
      predicted_label = model(text, offset)
      results = torch.cat((results, predicted_label.argmax(1)))
    return results

In [28]:
# Eventually, results need to be a list of 2028 0 or 1's
results = predict_model(test_dataloader)
print(results.shape)

torch.Size([2028])


# Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [29]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [30]:
# make sure the results are not float numbers, but integers 0 and 1
results = [int(x) for x in results]

In [31]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
  for x in results:
    fp.write(str(x) + '\n')