# Deciphering Code with Character-Level RNN





## Dataset

In [1]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv('../data/DES.csv')
df

Unnamed: 0,Passwords,ciphertext,key
0,2428031609,6960043906fcfa4152225d4cbeed92c9,C5SZWCOV
1,4531040045,bdbffdd0a1df14090eec745de30a3c0b,KFD7SI1Q
2,almigrana1,ad65ac507a1184ceb9a42afc48482a9f,KFD7SI1Q
3,quiero95,8b812dcbced662658507d26dfea58f83,CFJT97ZI
4,doitnow2,590c4cce603fc18346b266f4d0805116,40H2ETAQ
...,...,...,...
199995,dime168,b0e07c1a6af86c4c,Q0GM6RKL
199996,!l0v3k3v!n,45820d4b54019ec2eca8361c510392d3,1V3YX0JE
199997,9870tmf05,f8a9a57f8b5e5843c94911676c7151f3,KFD7SI1Q
199998,CAMILA,cceedf72faf67106,Y7HFVYA2


In [3]:
for i in range(len(df['Passwords'])):
    if len(df['Passwords'][i]) > 20:
        df.drop(i, axis=0, inplace=True)

In [4]:
df = df.dropna()[:120000].reset_index(drop=True)

In [5]:
df['Passwords'].apply(str)
df['ciphertext'].apply(str)
df

Unnamed: 0,Passwords,ciphertext,key
0,2428031609,6960043906fcfa4152225d4cbeed92c9,C5SZWCOV
1,4531040045,bdbffdd0a1df14090eec745de30a3c0b,KFD7SI1Q
2,almigrana1,ad65ac507a1184ceb9a42afc48482a9f,KFD7SI1Q
3,quiero95,8b812dcbced662658507d26dfea58f83,CFJT97ZI
4,doitnow2,590c4cce603fc18346b266f4d0805116,40H2ETAQ
...,...,...,...
119995,6942498317,4f0c7a76a5375695210c5bc15fa7929f,D36HDVGV
119996,babycake00,7eac1a57a3e4e0a20e3be95ef52359b6,Y7HFVYA2
119997,m91485,576a4cff2974059e,87I3XHJV
119998,5215hooker,92d76b6a186238d0ef447d18a9ada7e1,GN4Z7EAC


## Preprocessing Data

In [6]:
def tokenize(x):
    x_tk = Tokenizer(char_level=True)
    x_tk.fit_on_texts(x)                 
    return x_tk.texts_to_sequences(x), x_tk

def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding="post", truncating="post",)

### Preprocess Pipeline

In [7]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

In [8]:
preproc_code_sentences, preproc_plaintext_sentences, code_tokenizer, plaintext_tokenizer = preprocess(df['Passwords'], df['ciphertext'])

In [9]:
preproc_code_sentences[0]

array([ 7, 17,  7, 15,  5, 13,  3, 19,  5, 12,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0])

In [10]:
len(code_tokenizer.word_index)+1

126

In [11]:
len(plaintext_tokenizer.word_index)+1

17

In [12]:
plaintext_tokenizer.word_index

{'5': 1,
 'd': 2,
 '0': 3,
 'c': 4,
 '6': 5,
 '9': 6,
 '4': 7,
 '7': 8,
 '3': 9,
 'e': 10,
 'b': 11,
 '1': 12,
 'a': 13,
 '2': 14,
 '8': 15,
 'f': 16}

In [13]:
preproc_code_sentences.shape

(120000, 20)

In [14]:
preproc_plaintext_sentences.shape

(120000, 64, 1)

# LSTM

In [15]:
from keras.layers import GRU, Input, Dense, TimeDistributed, LSTM
from keras.models import Model, Sequential
from keras.layers import Activation
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import tensorflow


def lstm(input_shape, output_sequence_length, code_vocab_size, plaintext_vocab_size):
    x = Input(shape=input_shape[1:])   
    seq = LSTM(units= 128, return_sequences = True, name='Layer1')(x)
    seq = LSTM(units= 128, return_sequences = True, name='Layer2')(seq)
    output = TimeDistributed(Dense(units = plaintext_vocab_size, activation='softmax', name='Layer3'))(seq)
    model = Model(inputs = x, outputs = output)
    model.compile(optimizer='adam', loss=sparse_categorical_crossentropy, metrics=['accuracy'])
    model.summary()
    return model

tmp_x = pad(preproc_code_sentences, preproc_plaintext_sentences.shape[1]) 
tmp_x = tmp_x.reshape((-1, preproc_plaintext_sentences.shape[-2], 1))     

In [16]:
tmp_x.shape

(120000, 64, 1)

In [17]:
lstm_model = lstm(
    tmp_x.shape,
    preproc_plaintext_sentences.shape[1],
    len(code_tokenizer.word_index)+1,
    len(plaintext_tokenizer.word_index)+1)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64, 1)]           0         
_________________________________________________________________
Layer1 (LSTM)                (None, 64, 128)           66560     
_________________________________________________________________
Layer2 (LSTM)                (None, 64, 128)           131584    
_________________________________________________________________
time_distributed (TimeDistri (None, 64, 17)            2193      
Total params: 200,337
Trainable params: 200,337
Non-trainable params: 0
_________________________________________________________________


In [18]:
lstm_model.fit(tmp_x, preproc_plaintext_sentences, batch_size=512, epochs=15, validation_split=0.3)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x155a3b71df0>

In [25]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = ''
    return ''.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

def getPred(model, x, n):
    print('Predicted\t\t\t\tActual')
    for i in range(n):
        print(logits_to_text(model.predict([x[:n]])[i], plaintext_tokenizer), end='\t')
        print(df['Passwords'][i])

In [26]:
getPred(lstm_model, tmp_x, 5)

Predicted				Actual
b0441cadda3deebde555cccceeeeeeec	2428031609
2da919cdda9deebde555cccceeeeeeec	4531040045
47053ccad8cdee7d9555cccceeeeeeec	almigrana1
33ac12cddc1de5c99aaaeeeeeeeeee5c	quiero95
3b491fcddc1de5c99aaaeeeeeeeeee5c	doitnow2


# GRU

In [21]:
def gru(input_shape, output_sequence_length, code_vocab_size, plaintext_vocab_size):
    x = Input(shape=input_shape[1:])   
    seq = GRU(units= 128, return_sequences = True, name='Layer1')(x)
    seq = GRU(units= 128, return_sequences = True, name='Layer2')(seq)
    output = TimeDistributed(Dense(units = plaintext_vocab_size, activation='softmax', name='Layer3'))(seq)
    model = Model(inputs = x, outputs = output)
    model.compile(optimizer='adam', loss= sparse_categorical_crossentropy, metrics=['accuracy'])
    model.summary()
    return model

In [22]:
gru_model = gru(
    tmp_x.shape,
    preproc_plaintext_sentences.shape[1],
    len(code_tokenizer.word_index)+1,
    len(plaintext_tokenizer.word_index)+1)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 64, 1)]           0         
_________________________________________________________________
Layer1 (GRU)                 (None, 64, 128)           50304     
_________________________________________________________________
Layer2 (GRU)                 (None, 64, 128)           99072     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 64, 17)            2193      
Total params: 151,569
Trainable params: 151,569
Non-trainable params: 0
_________________________________________________________________


In [23]:
gru_model.fit(tmp_x, preproc_plaintext_sentences, batch_size=512, epochs=15, validation_split=0.3)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x156821d3e50>

In [27]:
getPred(gru_model, tmp_x, 5)

Predicted				Actual
00eee444000bb444407777770033d32d	2428031609
0ee044bb000bb440007777770033d32d	4531040045
700e044b4004b4444777777700333334	almigrana1
000000004044ee40d00dd3333333aa34	quiero95
0400040000eee117d00dd3333333aa34	doitnow2
