In [175]:
# Preview the first two lines of the ciphertext file, then of the plaintext file.
# readlines() keeps each line's ending, so print() adds a second line break.
for preview_path in ('./cipher.txt', './plaintext.txt'):
    with open(preview_path, 'r') as preview_file:
        head = preview_file.readlines()[:2]
    for line in head:
        print(line)

YMJ QNRJ NX MJW QJFXY QNPJI KWZNY , GZY YMJ GFSFSF NX RD QJFXY QNPJI .

MJ XFB F TQI DJQQTB YWZHP .

THE LIME IS HER LEAST LIKED FRUIT , BUT THE BANANA IS MY LEAST LIKED .

HE SAW A OLD YELLOW TRUCK .



In [176]:
#loading data
import os
def load_data(path):
    """Read a text file and return its contents as a list of lines.

    Args:
        path: Location of the file to read.

    Returns:
        list[str]: One entry per line, without the newline characters. Blank
        lines inside the file are kept; the single empty entry that a final
        trailing newline would otherwise produce is dropped.
    """
    with open(path, 'r') as f:
        lines = f.read().split('\n')
    # A file ending in '\n' yields a spurious '' as the last element, which
    # callers would otherwise treat as one extra, empty sentence.
    if lines and lines[-1] == '':
        lines.pop()
    return lines

In [177]:
# Load the ciphertext and the plaintext corpus with the same loader.
codes, features = (load_data(name) for name in ('cipher.txt', 'plaintext.txt'))

In [178]:
# Show the first five entries loaded from cipher.txt.
codes[:5]

['YMJ QNRJ NX MJW QJFXY QNPJI KWZNY , GZY YMJ GFSFSF NX RD QJFXY QNPJI .',
 'MJ XFB F TQI DJQQTB YWZHP .',
 'NSINF NX WFNSD IZWNSL OZSJ , FSI NY NX XTRJYNRJX BFWR NS STAJRGJW .',
 'YMFY HFY BFX RD RTXY QTAJI FSNRFQ .',
 'MJ INXQNPJX LWFUJKWZNY , QNRJX , FSI QJRTSX .']

In [179]:
# Show the first five entries loaded from plaintext.txt.
features[:5]

['THE LIME IS HER LEAST LIKED FRUIT , BUT THE BANANA IS MY LEAST LIKED .',
 'HE SAW A OLD YELLOW TRUCK .',
 'INDIA IS RAINY DURING JUNE , AND IT IS SOMETIMES WARM IN NOVEMBER .',
 'THAT CAT WAS MY MOST LOVED ANIMAL .',
 'HE DISLIKES GRAPEFRUIT , LIMES , AND LEMONS .']

In [180]:
from tensorflow.keras.preprocessing.text import Tokenizer

def tokenize(text):
    """Encode texts as sequences of integer character ids.

    A character-level Keras ``Tokenizer`` is fitted on ``text`` and then used
    to convert those same texts.

    Args:
        text: The texts to fit on and encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the encoded texts and the fitted
        tokenizer.
    """
    char_tokenizer = Tokenizer(char_level=True)
    char_tokenizer.fit_on_texts(text)
    sequences = char_tokenizer.texts_to_sequences(text)
    return sequences, char_tokenizer


In [181]:
# Small sample used below to demonstrate tokenization and padding.
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .',
]

In [182]:
# Tokenize the sample sentences and show the learned character -> id mapping.
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print('\n')

# Print each sentence next to its encoded form, numbering from 1.
for number, (sentence, encoded) in enumerate(zip(text_sentences, text_tokenized), start=1):
    print(f'sentence{number}')
    print('\n')
    print(f'original sentence: {sentence}')
    print(f'embedded: {encoded}')

{' ': 1, 'e': 2, 'o': 3, 't': 4, 'i': 5, 's': 6, 'h': 7, 'r': 8, 'y': 9, 'u': 10, 'c': 11, 'n': 12, 'a': 13, 'p': 14, '.': 15, 'q': 16, 'k': 17, 'b': 18, 'w': 19, 'f': 20, 'x': 21, 'j': 22, 'm': 23, 'v': 24, 'l': 25, 'z': 26, 'd': 27, 'g': 28, ',': 29}


sentence1


original sentence: The quick brown fox jumps over the lazy dog .
embedded: [4, 7, 2, 1, 16, 10, 5, 11, 17, 1, 18, 8, 3, 19, 12, 1, 20, 3, 21, 1, 22, 10, 23, 14, 6, 1, 3, 24, 2, 8, 1, 4, 7, 2, 1, 25, 13, 26, 9, 1, 27, 3, 28, 1, 15]
sentence2


original sentence: By Jove , my quick study of lexicography won a prize .
embedded: [18, 9, 1, 22, 3, 24, 2, 1, 29, 1, 23, 9, 1, 16, 10, 5, 11, 17, 1, 6, 4, 10, 27, 9, 1, 3, 20, 1, 25, 2, 21, 5, 11, 3, 28, 8, 13, 14, 7, 9, 1, 19, 3, 12, 1, 13, 1, 14, 8, 5, 26, 2, 1, 15]
sentence3


original sentence: This is a short sentence .
embedded: [4, 7, 5, 6, 1, 5, 6, 1, 13, 1, 6, 7, 3, 8, 4, 1, 6, 2, 12, 4, 2, 12, 11, 2, 1, 15]


In [183]:
#padding
from tensorflow.keras.preprocessing.sequence import pad_sequences 

def padding(seq, l=None):
    """Pad sequences at their end via Keras ``pad_sequences``.

    Args:
        seq: Sequences to pad.
        l: Forwarded to ``pad_sequences`` as ``maxlen``; ``None`` leaves the
            length choice to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    return pad_sequences(seq, maxlen=l, padding='post')

In [184]:
# Pad the sample sequences and show the first padded row.
text_tokenized = padding(text_tokenized)
text_tokenized[0]

array([ 4,  7,  2,  1, 16, 10,  5, 11, 17,  1, 18,  8,  3, 19, 12,  1, 20,
        3, 21,  1, 22, 10, 23, 14,  6,  1,  3, 24,  2,  8,  1,  4,  7,  2,
        1, 25, 13, 26,  9,  1, 27,  3, 28,  1, 15,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)

In [187]:
def preprocess(x,y):
    """Tokenize, pad and reshape two collections of texts.

    Each collection gets its own character-level tokenizer, is padded at the
    end, and receives a trailing axis of size 1.

    Args:
        x: First collection of texts.
        y: Second collection of texts.

    Returns:
        ``(x_array, y_array, x_tokenizer, y_tokenizer)``.
    """
    def _encode(texts):
        # character ids -> post-padded array -> extra trailing axis of size 1
        sequences, tokenizer = tokenize(texts)
        padded = padding(sequences)
        return padded.reshape(padded.shape + (1,)), tokenizer

    x_preprocessed, x_tokenizer = _encode(x)
    y_preprocessed, y_tokenizer = _encode(y)
    return x_preprocessed, y_preprocessed, x_tokenizer, y_tokenizer
    

In [188]:
# Encode both corpora (features as x, codes as y) and print the two vocabularies.
# NOTE(review): `features` is rebound from the raw lines to the encoded array,
# so re-running this cell without reloading the data feeds the array back in.
features,labels,x_tr,y_tr = preprocess(features,codes)
print(x_tr.word_index)
print(y_tr.word_index)


{' ': 1, 'i': 2, 'e': 3, 's': 4, 't': 5, 'r': 6, 'n': 7, 'a': 8, 'u': 9, 'l': 10, 'd': 11, 'o': 12, 'm': 13, 'h': 14, 'y': 15, 'g': 16, 'b': 17, ',': 18, '.': 19, 'f': 20, 'p': 21, 'c': 22, 'v': 23, 'w': 24, 'k': 25, 'j': 26, 'x': 27, 'q': 28, 'z': 29, '?': 30, "'": 31}
{' ': 1, 'n': 2, 'j': 3, 'x': 4, 'y': 5, 'w': 6, 's': 7, 'f': 8, 'z': 9, 'q': 10, 'i': 11, 't': 12, 'r': 13, 'm': 14, 'd': 15, 'l': 16, 'g': 17, ',': 18, '.': 19, 'k': 20, 'u': 21, 'h': 22, 'a': 23, 'b': 24, 'p': 25, 'o': 26, 'c': 27, 'v': 28, 'e': 29, '?': 30, "'": 31}


In [189]:
#model architecture
from keras.models import Model
from keras.layers import Dense,GRU,Input, TimeDistributed
from keras.layers import Activation
from keras.losses import sparse_categorical_crossentropy

def simple_mdel(input_shape,output_seq_length,code_vocab_size,features_vocab_size):
    """Build and compile a single-layer GRU sequence model.

    Args:
        input_shape: Shape of the full input array; the first entry is
            dropped to obtain the per-sample shape.
        output_seq_length: Not used by this function.
        code_vocab_size: Not used by this function.
        features_vocab_size: Number of units in the per-timestep Dense layer,
            i.e. the width of the softmax output.

    Returns:
        The compiled Keras ``Model``.
    """
    sequence_in = Input(input_shape[1:])
    hidden = GRU(100, return_sequences=True)(sequence_in)
    scores = TimeDistributed(Dense(features_vocab_size))(hidden)
    probabilities = Activation('softmax')(scores)

    network = Model(sequence_in, probabilities)
    network.compile(
        optimizer="Adam",
        loss=sparse_categorical_crossentropy,
        metrics=["acc"],
    )
    return network
    

In [190]:
# Build the model and fit it for 8 epochs with a 0.2 validation split.
# NOTE(review): the third argument is the x_tr vocabulary size but lands in
# `code_vocab_size`, and the fourth is the y_tr size but lands in
# `features_vocab_size`; only the fourth is used (as the output layer width).
# The names look swapped relative to the preprocess(features, codes) call — confirm.
model = simple_mdel(features.shape,labels.shape[1],len(x_tr.word_index)+1,len(y_tr.word_index)+1)
model.fit(features,labels,batch_size=64,epochs=8,validation_split=0.2)


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f498c1492e0>

In [199]:
# Convert the model's per-timestep predictions to characters
def convertion(pred, tokenizer, pad_token='    '):
    """Turn per-timestep class scores into the decoded string.

    Args:
        pred: 2-D array-like of shape (timesteps, vocab_size); the
            highest-scoring class is taken at every timestep.
        tokenizer: Object whose ``word_index`` maps characters to integer ids.
        pad_token: Text emitted for id 0, which is used here as the padding
            id. Defaults to four spaces, matching the previous behaviour.

    Returns:
        str: The decoded characters joined without a separator.

    Raises:
        KeyError: If a predicted id is neither 0 nor present in
            ``tokenizer.word_index``.
    """
    # Imported locally so the function does not depend on an earlier cell
    # having brought numpy into the kernel namespace.
    import numpy as np

    idx_to_char = {idx: char for char, idx in tokenizer.word_index.items()}
    idx_to_char[0] = pad_token
    # The former trailing `conv.replace('<PAD>', "1")` discarded its result
    # (strings are immutable) and '<PAD>' never occurs in the text, so it has
    # been removed; the returned value is unchanged.
    return ''.join(idx_to_char[class_id] for class_id in np.argmax(pred, axis=1))



In [200]:
# Predict on the first encoded sample and print the decoded result.
# NOTE(review): the model is fit against `labels`, which come from the y side
# of preprocess(), yet the prediction is decoded with x_tr — confirm that this
# is the intended tokenizer.
pred = model.predict(features[:1])[0]
print(convertion(pred,x_tr))



the uime is her least liked fruit , but the banana is my least liked .                                                                                                                            


In [205]:
# Reload the first line of plaintext.txt to compare with the decoded prediction.
original_text = load_data('plaintext.txt')[0]
print(original_text)

THE LIME IS HER LEAST LIKED FRUIT , BUT THE BANANA IS MY LEAST LIKED .
