In [38]:
import numpy as np
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

### Load Vocab

In [39]:
vocab = {}
idToWord = {}
with open('ast_vocab.txt') as f:
    words = f.read().splitlines()
    for wordIndex in words:
        print(wordIndex)
        word, index = wordIndex.split(' -----> ')
        vocab[word] = index
        idToWord[int(index)] = word

NameLoad -----> 0
attr -----> 1
AttributeLoad -----> 2
Str -----> 3
Call -----> 4
self -----> 5
Assign -----> 6
NameStore -----> 7
body -----> 8
Num -----> 9
Expr -----> 10
NameParam -----> 11
args -----> 12
decorator_list -----> 13
arguments -----> 14
defaults -----> 15
FunctionDef -----> 16
keyword -----> 17
If -----> 18
Index -----> 19
SubscriptLoad -----> 20
TupleLoad -----> 21
Return -----> 22
ListLoad -----> 23
AttributeStore -----> 24
0 -----> 25
alias -----> 26
Dict -----> 27
None -----> 28
1 -----> 29
True -----> 30
name -----> 31
orelse -----> 32
CompareEq -----> 33
BinOpAdd -----> 34
ImportFrom -----> 35
BinOpMod -----> 36
 -----> 37
For -----> 38
bases -----> 39
False -----> 40
ClassDef -----> 41
SubscriptStore -----> 42
value -----> 43
2 -----> 44
data -----> 45
type -----> 46
assertEqual -----> 47
UnaryOpNot -----> 48
path -----> 49
TupleStore -----> 50
kwargs -----> 51
x -----> 52
len -----> 53
Import -----> 54
os -----> 55
get -----> 56
result -----> 57
request -----> 5

### Create input squences

In [40]:
look_back_len = 1000 + 1
sequences = []
vocabulary_size = len(vocab)

with open('int-seq-ast.txt') as f:
    files = f.read().splitlines()
    for file in files:
        numbers = list(map(int, file.split(',')[:-1]))
        #print(numbers)
        for i in range(look_back_len, len(numbers)):
            seq = numbers[i-look_back_len:i]
            sequences.append(seq)
    #print(sequences)

    n_sequences = np.empty([len(sequences), look_back_len], dtype='int32')
    for i in range(len(sequences)):
        n_sequences[i] = sequences[i]

n_sequences

array([[  81,   10,    3, ..., 1000,    0, 1000],
       [  10,    3, 1000, ...,    0, 1000,    1],
       [   3, 1000,   54, ..., 1000,    1,  931],
       ...,
       [   0,   40,    8, ...,  600,    8,   22],
       [   0,   45,   13, ...,    7, 1000,    0],
       [  45,   13,   13, ..., 1000,    0,   30]])

### Convert output to one hot encoded vector

In [42]:
train_inputs = n_sequences[:,:-1]
train_targets = n_sequences[:,-1]
print(len(train_targets))

train_targets = to_categorical(train_targets, num_classes=vocabulary_size)
seq_len = train_inputs.shape[1]
print(train_targets[0])

36234
[0. 0. 0. ... 0. 1. 0.]


### Train Model

In [44]:
import keras
class CustomSaver(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        self.model.save("model_{}.hd5".format(epoch))

In [46]:
model = Sequential()
model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50,activation='relu'))
model.add(Dense(vocabulary_size, activation='softmax'))
print(model.summary())
# compile network

saver = CustomSaver()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_inputs,train_targets,epochs=10,verbose=1, validation_split=0.3, callbacks=[saver])
model.save("mymodel.h5")

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1000, 1000)        1002000   
_________________________________________________________________
lstm_15 (LSTM)               (None, 1000, 50)          210200    
_________________________________________________________________
lstm_16 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_15 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_16 (Dense)             (None, 1002)              51102     
Total params: 1,286,052
Trainable params: 1,286,052
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25363 samples, validate on 10871 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Test Model