### Read training, dev and unlabeled test data

The following starter code (Python 3) shows how to read the labeled training and dev ciphertext, and the unlabeled test ciphertext, into lists.

In [1]:
import string
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# Containers populated by the data-preparation cells below.

# Raw records as read from the TSV files.
train = []
dev = []
test = []

# Integer class labels, one per labeled sentence.
train_labels = []
dev_labels = []

# Ciphertext sentences, and their punctuation-stripped versions.
train_text = []
dev_text = []
clean_train = []
clean_dev = []

* #### Training Data Preparation

In [3]:
# Load the labeled training set: one "<label>\t<ciphertext>" record per line.
# `train` is rebuilt from scratch so re-running this cell does not duplicate
# rows, and the `with` block guarantees the file handle is closed.
with open('./train_enc.tsv', encoding='utf-8') as train_file:
    train = []
    for line in train_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        train.append(fields)
print (len(train))
print (train[:3])

16220
[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .'], [0, 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .']]


In [4]:
# Collect the integer label (first tab-separated field) of every training line.
# The list is rebuilt on each run (idempotent cell) and the file is closed by
# the `with` block instead of being left to the garbage collector.
with open('./train_enc.tsv', encoding='utf-8') as train_file:
    train_labels = [
        int(line.rstrip('\n\r').split('\t')[0])  # label is 0 or 1
        for line in train_file
    ]
print(len(train_labels))
print(train_labels[:10])

16220
[0, 0, 0, 1, 1, 1, 1, 1, 1, 0]


In [5]:
# Collect the ciphertext sentence (second tab-separated field) of every
# training line. The list is rebuilt on each run (idempotent cell) and the
# file is closed by the `with` block. The field is already a `str`, so no
# extra conversion is needed.
with open('./train_enc.tsv', encoding='utf-8') as train_file:
    train_text = [
        line.rstrip('\n\r').split('\t')[1]
        for line in train_file
    ]
print(len(train_text))
print(train_text[:5])

16220
['lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .', '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .', 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .', 'vocl ükêó úwo lktóm lkjl ê2ê#ú éwcctÚ8ê jóy8ê kjc Úêêó ê+kjoclê6 Úú 6wáoöêólj#tjóc , jówlkê# óêü xt8ö êöê#yêc ütlk úêl jówlkê# #êöj#mjÚ8ê úêl ckwámtóy8ú 8tll8ê7mówüó éê#céêált2ê .', 'yt2ê á#ê6tl lw ê2ê#úwóê x#wö #wÚtócwó 6wüó lw lkê mêú y#té lkjl lktc Úw86 öw2ê üw#mc .']


In [6]:
# Strip '.' and ',' from every training sentence, collapse the double spaces
# that removal leaves behind (single pass), and drop trailing whitespace.
# `clean_train` is rebuilt rather than appended to, so re-running this cell
# does not duplicate entries.
clean_train = [
    sentence.replace('.', '').replace(',', '').replace('  ', ' ').rstrip()
    for sentence in train_text
]
print(len(clean_train))
print(clean_train[:5])

16220
['lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c', '6êcétlê jolêot8 zc éê#xw#öjóáê tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy', 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl cwöêÚw6ú oóü#jééê6 tl êj#8ú lwwm wol j88 lkê yww6 cloxx jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú )', 'vocl ükêó úwo lktóm lkjl ê2ê#ú éwcctÚ8ê jóy8ê kjc Úêêó ê+kjoclê6 Úú 6wáoöêólj#tjóc jówlkê# óêü xt8ö êöê#yêc ütlk úêl jówlkê# #êöj#mjÚ8ê úêl ckwámtóy8ú 8tll8ê7mówüó éê#céêált2ê', 'yt2ê á#ê6tl lw ê2ê#úwóê x#wö #wÚtócwó 6wüó lw lkê mêú y#té lkjl lktc Úw86 öw2ê üw#mc']


In [7]:
# Build the word index from the cleaned training sentences, then encode those
# same sentences as lists of integer word ids.
# NOTE(review): Tokenizer() is created with its default arguments. Keras'
# default `filters` remove punctuation characters; several of them ('#', '+',
# '!') occur inside ciphertext words here, so words may be split -- confirm
# this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train)
train_sequence = tokenizer.texts_to_sequences(clean_train)

In [8]:
# Right-pad every training sequence into a single 2-D array. No `maxlen` is
# given, so the width is chosen by pad_sequences from the data.
train_padded = pad_sequences(train_sequence, padding = 'post')

In [9]:
# Inspect the padded training matrix and confirm its container type.
print(train_padded, type(train_padded), sep='\n')

[[ 268  778  338 ...    0    0    0]
 [ 262 4411   17 ...    0    0    0]
 [  61   21 1570 ...    0    0    0]
 ...
 [   2    2  240 ...    0    0    0]
 [3834 4236   70 ...    0    0    0]
 [9920    4  496 ...    0    0    0]]
<class 'numpy.ndarray'>


* #### Validation Data Preparation

In [10]:
# Load the labeled dev set: one "<label>\t<ciphertext>" record per line.
# `dev` is rebuilt from scratch so re-running this cell does not duplicate
# rows, and the `with` block guarantees the file handle is closed.
with open('./dev_enc.tsv', encoding='utf-8') as dev_file:
    dev = []
    for line in dev_file:
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        dev.append(fields)
print (len(dev))
print (dev[:3])

2027
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .'], [1, 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .']]


In [11]:
# Collect the integer label (first tab-separated field) of every dev line.
# The list is rebuilt on each run (idempotent cell) and the file is closed by
# the `with` block instead of being left to the garbage collector.
with open('./dev_enc.tsv', encoding='utf-8') as dev_file:
    dev_labels = [
        int(line.rstrip('\n\r').split('\t')[0])  # label is 0 or 1
        for line in dev_file
    ]
print(len(dev_labels))
print(dev_labels[:10])

2027
[1, 0, 1, 1, 1, 1, 1, 0, 0, 0]


In [12]:
# Collect the ciphertext sentence (second tab-separated field) of every dev
# line. The list is rebuilt on each run (idempotent cell) and the file is
# closed by the `with` block. The field is already a `str`, so no extra
# conversion is needed.
with open('./dev_enc.tsv', encoding='utf-8') as dev_file:
    dev_text = [
        line.rstrip('\n\r').split('\t')[1]
        for line in dev_file
    ]
print(len(dev_text))
print(dev_text[:5])

2027
['ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<', 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .', 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .', 'tl kjc j ájxxêtójlê6 , c8wééú Ú#t88tjóáê , céj#m8tóy ütlk t6êjc úwo ütck kj6 Úêêó 6ê2ê8wéê6 ütlk öw#ê áj#ê , Úol jótöjlê6 Úú jó êóê#yú lkjl éolc lkê 6oltxo8 êxxw#lc wx öw#ê 6tcáté8tóê6 y#j6ê7y#oÚÚê#c lw ckjöê .', 'lww öoák wx clw#úlê88tóy öw2êc jüjú x#wö cw8wó6azc cwátj8 á#tlt!oê , ájcltóy tlc jo6têóáê jc lkjl wx tólê88êáloj8 8êálw# tó áwólêöé8jltwó wx lkê jolêo#zc é#wxêcctwój8 tóvo#têc .']


In [13]:
# Apply the same cleaning as for the training text: strip '.' and ',',
# collapse the resulting double spaces (single pass), drop trailing
# whitespace. `clean_dev` is rebuilt rather than appended to, so re-running
# this cell does not duplicate entries.
clean_dev = [
    sentence.replace('.', '').replace(',', '').replace('  ', ' ').rstrip()
    for sentence in dev_text
]
print(len(clean_dev))
print(clean_dev[:5])

2027
['ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<', 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l', 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj#', 'tl kjc j ájxxêtójlê6 c8wééú Ú#t88tjóáê céj#m8tóy ütlk t6êjc úwo ütck kj6 Úêêó 6ê2ê8wéê6 ütlk öw#ê áj#ê Úol jótöjlê6 Úú jó êóê#yú lkjl éolc lkê 6oltxo8 êxxw#lc wx öw#ê 6tcáté8tóê6 y#j6ê7y#oÚÚê#c lw ckjöê', 'lww öoák wx clw#úlê88tóy öw2êc jüjú x#wö cw8wó6azc cwátj8 á#tlt!oê ájcltóy tlc jo6têóáê jc lkjl wx tólê88êáloj8 8êálw# tó áwólêöé8jltwó wx lkê jolêo#zc é#wxêcctwój8 tóvo#têc']


In [14]:
# NOTE(review): tokenizer_2 is fitted on the dev sentences but is not used
# anywhere else in this notebook -- it looks like dead code; confirm and remove.
tokenizer_2 = Tokenizer()
tokenizer_2.fit_on_texts(clean_dev)
# The dev sentences are encoded with `tokenizer` (the one fitted on the
# training text), so dev word ids share the training vocabulary.
dev_sequence = tokenizer.texts_to_sequences(clean_dev)

In [15]:
# Right-pad the dev sequences to a fixed width of 64.
# NOTE(review): 64 is hard-coded; presumably it mirrors the width of
# train_padded -- confirm, and note that longer sequences get truncated.
dev_padded = pad_sequences(dev_sequence, maxlen = 64, padding = 'post')

In [16]:
# Inspect the padded dev matrix and confirm its container type.
print(dev_padded, type(dev_padded), sep='\n')

[[ 6423     8 14936 ...     0     0     0]
 [   78   424  1266 ...     0     0     0]
 [   84  1290  5582 ...     0     0     0]
 ...
 [    2    13  1312 ...     0     0     0]
 [   72   908    60 ...     0     0     0]
 [    6   346    11 ...     0     0     0]]
<class 'numpy.ndarray'>


* #### Test Data Preparation

In [17]:
# Holds the punctuation-stripped test sentences (filled two cells below).
clean_test = list()

In [18]:
# Load the unlabeled test set: each line is one ciphertext sentence.
# `test` is rebuilt from scratch so re-running this cell does not duplicate
# rows, and the `with` block guarantees the file handle is closed.
with open('./test_enc_unlabeled.tsv', encoding='utf-8') as test_file:
    test = [line.rstrip('\n\r') for line in test_file]
print (len(test))
print (test[:3])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê .']


In [19]:
# Apply the same cleaning as for the training text: strip '.' and ',',
# collapse the resulting double spaces (single pass), drop trailing
# whitespace. `clean_test` is rebuilt rather than appended to, so re-running
# this cell does not duplicate entries.
clean_test = [
    sentence.replace('.', '').replace(',', '').replace('  ', ' ').rstrip()
    for sentence in test
]
print(len(clean_test))
print(clean_test[:5])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú', 'ówlktóy cltámc #êj88ú ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 cw#6t6 oót2ê#cê wx yoóc 6#oyc j2j#táê jó6 6jöjyê6 6#êjöc', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê', 'lkê xt8ö üjc é#w6oáê6 Úú vê##ú Ú#oámkêtöê# jó6 6t#êálê6 Úú vwê8 cákoöjákê# jó6 #êx8êálc lkê üw#cl wx lkêt# ckj88wü clú8êc 1 üt868ú w2ê#é#w6oáê6 tój6ê!ojlê8ú öwlt2jlê6 ê2ê#ú clêé wx lkê üjú jó6 6êöwy#jéktáj88ú lj#yêlê6 lw é8êjcê ê2ê#ú wóê ( jó6 ów wóê )', 'tl zc cé8jck ütlkwol lkê vwmêc']


In [20]:
# NOTE(review): tokenizer_3 is fitted on the test sentences but is not used
# anywhere else in this notebook -- it looks like dead code; confirm and remove.
tokenizer_3 = Tokenizer()
tokenizer_3.fit_on_texts(clean_test)
# The test sentences are encoded with `tokenizer` (the one fitted on the
# training text), so test word ids share the training vocabulary.
test_sequence = tokenizer.texts_to_sequences(clean_test)

In [21]:
# Right-pad the test sequences to a fixed width of 64.
# NOTE(review): 64 is hard-coded; presumably it mirrors the width of
# train_padded -- confirm, and note that longer sequences get truncated.
test_padded = pad_sequences(test_sequence, maxlen =64, padding = 'post')

In [22]:
# Inspect the padded test matrix and confirm its container type.
print(test_padded, type(test_padded), sep='\n')

[[   2 3850    3 ...    0    0    0]
 [ 171 3990  153 ...    0    0    0]
 [ 424 1266    3 ...    0    0    0]
 ...
 [   2   86 1596 ...    0    0    0]
 [ 911  269    4 ...    0    0    0]
 [  22 2532  459 ...    0    0    0]]
<class 'numpy.ndarray'>


### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes here. We will use your predictions on the unlabeled test data for grading, and will check this part to understand how your method produced those predictions.

In [23]:
# Size the embedding table from the fitted training tokenizer instead of a
# hard-coded constant: one row per word id in `tokenizer.word_index`, plus one
# for id 0, which the padded matrices use as the padding value.
vocab_size = len(tokenizer.word_index) + 1

# Binary sentence classifier:
# embedding -> bidirectional LSTM -> max over time steps -> dense head.
model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = 50))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(32, activation="selu"))
model.add(Dropout(0.2))
model.add(Dense(16, activation="selu"))
# Single sigmoid unit: the output is the probability of label 1.
model.add(Dense(1, activation="sigmoid"))

# Stop once validation loss has not improved for 5 epochs and roll the model
# back to the weights of the best epoch.
earlystopping = EarlyStopping(monitor ="val_loss", 
                                        mode ="min", patience = 5, 
                                        restore_best_weights = True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          771000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 64)          21248     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total para

In [25]:
# Train for at most `num_epochs` epochs, validating on the dev set after each
# one; the early-stopping callback may end training sooner.
num_epochs = 10
history = model.fit(
    train_padded,
    # Labels are converted from Python lists to NumPy arrays: newer Keras
    # versions reject plain lists of ints as targets.
    np.asarray(train_labels),
    epochs=num_epochs,
    validation_data=(dev_padded, np.asarray(dev_labels)),
    verbose=2,
    callbacks = [earlystopping],
)

Train on 16220 samples, validate on 2027 samples
Epoch 1/10
 - 69s - loss: 0.4959 - acc: 0.7453 - val_loss: 0.3649 - val_acc: 0.8436
Epoch 2/10
 - 61s - loss: 0.2416 - acc: 0.9057 - val_loss: 0.3396 - val_acc: 0.8742
Epoch 3/10
 - 61s - loss: 0.1429 - acc: 0.9491 - val_loss: 0.3736 - val_acc: 0.8722
Epoch 4/10
 - 61s - loss: 0.1003 - acc: 0.9666 - val_loss: 0.3828 - val_acc: 0.8816
Epoch 5/10
 - 62s - loss: 0.0790 - acc: 0.9733 - val_loss: 0.3962 - val_acc: 0.8777
Epoch 6/10
 - 64s - loss: 0.0662 - acc: 0.9772 - val_loss: 0.4195 - val_acc: 0.8712
Epoch 7/10
 - 63s - loss: 0.0597 - acc: 0.9791 - val_loss: 0.4274 - val_acc: 0.8747


In [26]:
# Predict a 0/1 label for every test sentence by thresholding the sigmoid
# probability at 0.5. This is what `Sequential.predict_classes` computed for a
# single-unit sigmoid output; that method no longer exists in recent
# TensorFlow/Keras releases, so the threshold is applied explicitly.
output = (model.predict(test_padded) > 0.5).astype('int32')

In [31]:
# `results` must eventually hold 2028 predictions, each either 0 or 1.
# This empty list is only a placeholder; it is overwritten in the next cell.
results = []

In [32]:
# Use the model's predictions on the test set as the results to submit.
results = output

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines; each line should be either 0 or 1, which is your model's prediction for the corresponding test set instance.

In [36]:
# At this point `results` should hold the model's predictions for the 2028
# test cases read from test_enc_unlabeled.tsv, in file order.
# Sanity check: exactly one prediction per test sentence.
assert (len(results) == 2028)

In [37]:
# Make sure the results are plain Python integers (0 or 1), not floats or
# NumPy values. The predictions are flattened first because the model emits
# one single-element row per sentence, and calling int() on a non-scalar
# array is deprecated in recent NumPy releases.
results = [int(x) for x in np.ravel(results)]

In [38]:
# Save the predictions to 'upload_predictions.txt', one per line, ready to be
# uploaded.
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as out_file:
    out_file.writelines(str(label) + '\n' for label in results)