### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [251]:
train, dev, test = [], [], []

In [252]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print (len(train))
print (train[:3])

16220
[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .'], [0, 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .']]


In [253]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
print (len(dev))
print (dev[:3])

2027
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .'], [1, 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .']]


#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [254]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print (len(test))
print (test[:3])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê .']


#### You can split every sentence into lists of words by white spaces.

In [255]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [[x.split(' ')] for x in test]
train_split[:]

[[0,
  ['lkêcê',
   'yoúc',
   'cêêö',
   'y#êjl',
   'lw',
   'mówám',
   'Újám',
   'j',
   'Úêê#',
   'ütlk',
   'Úol',
   'lkêú',
   'z#ê',
   'ctöé8ú',
   'ówl',
   'xoóóú',
   'éê#xw#öê#c',
   '.']],
 [0,
  ['6êcétlê',
   'jolêot8',
   'zc',
   'éê#xw#öjóáê',
   ',',
   'tl',
   'zc',
   'j',
   '#jlkê#',
   '8tcl8êcc',
   'jöÚ8ê',
   '6wüó',
   'lkê',
   'öt668ê',
   'wx',
   'lkê',
   '#wj6',
   ',',
   'ükê#ê',
   'lkê',
   'lkêöjltá',
   't#wótêc',
   'j#ê',
   'lww',
   'wÚ2twoc',
   'jó6',
   'lkê',
   'cê+oj8',
   'éw8tltác',
   'lww',
   'cöoy',
   '.']],
 [0,
  ['tx',
   'lktc',
   'kw8t6jú',
   'öw2tê',
   'tc',
   'coééwcê6',
   'lw',
   'Úê',
   'j',
   'ytxl',
   ',',
   'cwöêÚw6ú',
   'oóü#jééê6',
   'tl',
   'êj#8ú',
   ',',
   'lwwm',
   'wol',
   'j88',
   'lkê',
   'yww6',
   'cloxx',
   ',',
   'jó6',
   '8êxl',
   'Úêktó6',
   'lkê',
   'á#jé',
   '(',
   '8tlê#j88ú',
   ')',
   '.']],
 [1,
  ['vocl',
   'ükêó',
   'úwo',
   'lktóm',
   'lkjl',
   'ê2ê#ú',
   

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [256]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []
train_split[:2]

[[0,
  ['lkêcê',
   'yoúc',
   'cêêö',
   'y#êjl',
   'lw',
   'mówám',
   'Újám',
   'j',
   'Úêê#',
   'ütlk',
   'Úol',
   'lkêú',
   'z#ê',
   'ctöé8ú',
   'ówl',
   'xoóóú',
   'éê#xw#öê#c',
   '.']],
 [0,
  ['6êcétlê',
   'jolêot8',
   'zc',
   'éê#xw#öjóáê',
   ',',
   'tl',
   'zc',
   'j',
   '#jlkê#',
   '8tcl8êcc',
   'jöÚ8ê',
   '6wüó',
   'lkê',
   'öt668ê',
   'wx',
   'lkê',
   '#wj6',
   ',',
   'ükê#ê',
   'lkê',
   'lkêöjltá',
   't#wótêc',
   'j#ê',
   'lww',
   'wÚ2twoc',
   'jó6',
   'lkê',
   'cê+oj8',
   'éw8tltác',
   'lww',
   'cöoy',
   '.']]]

In [257]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.preprocessing import sequence
from tensorflow.keras import optimizers
# fix random seed for reproducibility
numpy.random.seed(7)

In [258]:
train_sentence = [el[1] for el in train_split]
train_sentence[:2]

[['lkêcê',
  'yoúc',
  'cêêö',
  'y#êjl',
  'lw',
  'mówám',
  'Újám',
  'j',
  'Úêê#',
  'ütlk',
  'Úol',
  'lkêú',
  'z#ê',
  'ctöé8ú',
  'ówl',
  'xoóóú',
  'éê#xw#öê#c',
  '.'],
 ['6êcétlê',
  'jolêot8',
  'zc',
  'éê#xw#öjóáê',
  ',',
  'tl',
  'zc',
  'j',
  '#jlkê#',
  '8tcl8êcc',
  'jöÚ8ê',
  '6wüó',
  'lkê',
  'öt668ê',
  'wx',
  'lkê',
  '#wj6',
  ',',
  'ükê#ê',
  'lkê',
  'lkêöjltá',
  't#wótêc',
  'j#ê',
  'lww',
  'wÚ2twoc',
  'jó6',
  'lkê',
  'cê+oj8',
  'éw8tltác',
  'lww',
  'cöoy',
  '.']]

In [259]:
from gensim.models import FastText
min_freq, max_freq = 1, 8000
fastText_model = FastText(train_sentence, vector_size=100, window=10, min_count=min_freq, sg=1)

In [260]:
fastText_weights = fastText_model.wv.vectors
ft_vocab_size, ft_embedding_size = fastText_weights.shape
print(ft_vocab_size,ft_embedding_size)
vocab = list(fastText_model.wv.index_to_key)
len(vocab)

20860 100


20860

In [261]:
word_vec_dict = {}
for word in vocab:
    word_vec_dict[word] = fastText_model.wv.get_vector(word)
print(len(word_vec_dict))
len(word_vec_dict['.'])

20860


100

In [262]:
maxList = max(train_sentence, key = lambda i: len(i))
max_sentence_length = len(maxList)
print(max_sentence_length, maxList)

56 ['lkê', 'xt8ö', 'tc', 'xjtlkxo8', 'lw', 'ükjl', 'wóê', 'é#êcoöêc', 'j#ê', 'lkê', 'Úwwm', 'zc', 'lütó', 'é#êötcêc', '77', 'lkjl', 'üê', 'Úêáwöê', 'ükw', 'üê', 'j#ê', 'wó', 'lkê', 'Újámc', 'wx', 'wo#', 'éj#êólc', ',', 'Úol', 'üê', 'kj2ê', 'ów', 't6êj', 'ükw', 'lkêú', 'üê#ê', 'jl', 'wo#', 'jyê', '.', 'jó6', 'lkjl', 'ltöê', 'tc', 'j', 'x8êêltóy', 'jó6', 'é#êátwoc', 'áwööw6tlú', 'ów', 'öjllê#', 'kwü', 'w86', 'úwo', 'j#ê', '.']


In [263]:
X_train = [el[1] for el in train_split]
y_train = [el[0] for el in train_split]
X_train[:2]
y_train[:4]
len(X_train)

16220

In [264]:
from collections import defaultdict
train_dict = defaultdict(int)
for i,sentence in enumerate(X_train):
    for word in sentence:
        train_dict[word] += 1

print(len(train_dict))

20860


In [265]:
train_dict = {k:v for k, v in train_dict.items() if v >= min_freq and v<=max_freq}

In [266]:
rank_words = {key: rank for rank, key in enumerate(sorted(train_dict, key=train_dict.get, reverse=True), 1)}
print(len(rank_words))
vocab_size = len(rank_words)

20854


In [267]:
X_train_encoded = []
for i,sentence in enumerate(X_train):
    encoded_sentence = []
    for word in sentence:
        encoded_sentence.append(rank_words.get(word,0))
    X_train_encoded.append(encoded_sentence)

len(X_train_encoded[0])


18

In [268]:
X_dev = [el[1] for el in dev_split]
y_test = [el[0] for el in dev_split]
len(X_dev)

2027

In [269]:
X_dev_encoded = []
for i,sentence in enumerate(X_dev):
    encoded_sentence = []
    for word in sentence:
        encoded_sentence.append(rank_words.get(word,0))
    X_dev_encoded.append(encoded_sentence)

len(X_dev_encoded)

2027

In [270]:
# truncate and pad input sequences
X_train = sequence.pad_sequences(X_train_encoded, maxlen=max_sentence_length)
X_test = sequence.pad_sequences(X_dev_encoded, maxlen=max_sentence_length)
X_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  192, 4527,    8,  161,    0,    3,    8,    0,  210,
       2067, 9247,  182,    0,  826,    0,    0,  860,    0,  194,    0,
       2527, 6076,   23,   40,  395,    0,    0,  802,  884,   40, 1649,
          0])

In [271]:
import numpy as np
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [272]:
# now creating the embedding matrix
embed_matrix=np.zeros(shape=(vocab_size+1,ft_embedding_size))
for i,word in enumerate(rank_words):
  embed_vector=word_vec_dict.get(word)
  if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
    embed_matrix[i+1]=embed_vector
  # if word is not found then embed_vector corressponding to that vector will stay zero.
print(embed_matrix.shape)

(20855, 100)


In [273]:
print(fastText_model.wv.get_vector('y#êjl'))
embed_matrix[rank_words['y#êjl']]

[ 2.99916476e-01  4.86548066e-01  2.81650797e-02 -5.71055221e-04
 -7.15972558e-02 -3.14473584e-02  1.08756751e-01  4.10555333e-01
 -5.59675097e-01 -4.85722721e-01  2.92265058e-01 -1.84340253e-01
 -1.66028365e-01 -2.97755361e-01  5.00719488e-01 -2.02754736e-01
  4.73825097e-01 -1.13772839e-01 -2.99668133e-01 -4.10774171e-01
 -2.74848163e-01 -2.19498444e-02  2.32314751e-01  8.70155022e-02
 -7.32306466e-02  3.13655525e-01  2.65668809e-01  1.74930230e-01
  9.86336768e-02  3.97967726e-01  4.27890301e-01  1.99096113e-01
  8.02607015e-02 -2.92748939e-02 -6.64343476e-01  2.10962132e-01
  6.80322573e-02 -5.70518412e-02 -4.10445243e-01  5.39128482e-01
 -1.02729954e-01 -1.61691174e-01 -1.18758403e-01 -6.56721368e-02
  3.90330970e-01  1.10211536e-01 -1.02822319e-01 -1.76290259e-01
  8.44496116e-02  2.24692479e-01  3.55044335e-01 -1.46226749e-01
 -4.43498164e-01  1.75676152e-01 -4.32710320e-01 -1.45691261e-01
 -3.85827012e-02 -3.01994473e-01 -1.37443855e-01  1.33751243e-01
  2.16852929e-02  3.46500

array([ 2.99916476e-01,  4.86548066e-01,  2.81650797e-02, -5.71055221e-04,
       -7.15972558e-02, -3.14473584e-02,  1.08756751e-01,  4.10555333e-01,
       -5.59675097e-01, -4.85722721e-01,  2.92265058e-01, -1.84340253e-01,
       -1.66028365e-01, -2.97755361e-01,  5.00719488e-01, -2.02754736e-01,
        4.73825097e-01, -1.13772839e-01, -2.99668133e-01, -4.10774171e-01,
       -2.74848163e-01, -2.19498444e-02,  2.32314751e-01,  8.70155022e-02,
       -7.32306466e-02,  3.13655525e-01,  2.65668809e-01,  1.74930230e-01,
        9.86336768e-02,  3.97967726e-01,  4.27890301e-01,  1.99096113e-01,
        8.02607015e-02, -2.92748939e-02, -6.64343476e-01,  2.10962132e-01,
        6.80322573e-02, -5.70518412e-02, -4.10445243e-01,  5.39128482e-01,
       -1.02729954e-01, -1.61691174e-01, -1.18758403e-01, -6.56721368e-02,
        3.90330970e-01,  1.10211536e-01, -1.02822319e-01, -1.76290259e-01,
        8.44496116e-02,  2.24692479e-01,  3.55044335e-01, -1.46226749e-01,
       -4.43498164e-01,  

In [274]:
# create the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1,output_dim=ft_embedding_size,input_length=max_sentence_length,embeddings_initializer=Constant(embed_matrix)))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1fdd22ac460>

In [275]:
from keras.layers import Bidirectional
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1,output_dim=ft_embedding_size,input_length=max_sentence_length,embeddings_initializer=Constant(embed_matrix)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(50)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1fdb5f05160>

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [None]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [None]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [None]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')