In [1]:
import numpy as np
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

Using TensorFlow backend.


### Load Vocab

In [2]:
vocab = {}
idToWord = {}
with open('vocab.txt') as f:
    words = f.read().splitlines()
    for wordIndex in words:
        word, index = wordIndex.split(' -----> ')
        vocab[word] = index
        idToWord[int(index)] = word

### Create input squences

In [3]:
look_back_len = 1000 + 1
sequences = []
vocabulary_size = len(vocab)

with open('int-seq.txt') as f:
    files = f.read().splitlines()
    for file in files:
        numbers = list(map(int, file.split(',')[:-1]))
        #print(numbers)
        for i in range(look_back_len, len(numbers)):
            seq = numbers[i-look_back_len:i]
            sequences.append(seq)
    #print(sequences)

    n_sequences = np.empty([len(sequences), look_back_len], dtype='int32')
    for i in range(len(sequences)):
        n_sequences[i] = sequences[i]

n_sequences

array([[1000,   17,  616, ..., 1000,    4,  389],
       [  17,  616,   17, ...,    4,  389,    0],
       [ 616,   17,  243, ...,  389,    0, 1000],
       ...,
       [  31,  675,    1, ...,    5,   13, 1000],
       [ 675,    1,   23, ...,   13, 1000,   29],
       [   1,   23,   16, ..., 1000,   29,    5]])

### Convert output to one hot encoded vector

In [4]:
train_inputs = n_sequences[:,:-1]
train_targets = n_sequences[:,-1]
print(len(train_targets))

train_targets = to_categorical(train_targets, num_classes=vocabulary_size)
seq_len = train_inputs.shape[1]
print(train_targets[0])

38318
[0. 0. 0. ... 0. 0. 0.]


### Train Model

In [5]:
import keras
class CustomSaver(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        self.model.save("model_{}.hd5".format(epoch))

In [None]:
model = Sequential()
model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50,activation='relu'))
model.add(Dense(vocabulary_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

saver = CustomSaver()

model.fit(train_inputs,train_targets, epochs = 20 ,verbose=1, validation_split=0.3, callbacks=[saver])
model.save("mymodel.h5")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 1000)        1002000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000, 50)          210200    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 1002)              51102     
Total params: 1,286,052
Trainable params: 1,286,052
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 26822 samples, validate on 11496 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

### Test Model

In [24]:
from keras.preprocessing.sequence import pad_sequences

# sample python input
"""
num = 46
print
"""
encoded_text = [2,13,35,5]
pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre', value = int(vocab['<pad_token>']))
print(encoded_text, pad_encoded)

print("Top 3 Suggestions:")
for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    print(idToWord[i])

[2, 13, 35, 5] [[36  2 13 35  5]]
Top 3 Suggestions:
num
1
n


In [None]:
def Predict(n_sequences, vocab, idToWord, int_seq_fp):
    train_inputs = n_sequences[:, :-1]
    seq_len = train_inputs.shape[1]
    model = load_model('mymodel.h5')
    # ratio of successful predicted tokens for each input sequence
    score = []
    # top k suggestions
    k = 5
    with open(int_seq_fp) as fp:
        for line in fp:
            seq = [int(x) for x in line.split(',')[:-1]]
            print(seq)
            encoded_text_c1 = []
            encoded_text_c2 = []
            sub_score = 0
            for idx in range(0, len(seq) - 1):
                to_print = []
                if (seq_len - idx) > 0:
                    encoded_text_c1.append(seq[idx])
                    value = int(vocab['<pad_token>'])
                    pad_encoded = pad_sequences([encoded_text_c1], maxlen=seq_len, truncating='pre', value=value)
                    for x in pad_encoded[0]:
                        to_print.append(idToWord[x])
                    print("Input sequence: ", to_print)
                else:
                    encoded_text_c2 = [seq[idx-4], seq[idx-3], seq[idx-2], seq[idx-1], seq[idx]]
                    pad_encoded = pad_sequences([encoded_text_c2], maxlen=seq_len, truncating='pre', value=value)
                    for x in pad_encoded[0]:
                        to_print.append(idToWord[x])
                    print("Input sequence: ", to_print)

                top_k = []
                top_k_id = []
                for token_id in (model.predict(pad_encoded)[0]).argsort()[-k:][::-1]:
                    #print(idToWord[token_id])
                    top_k.append(idToWord[token_id])
                    top_k_id.append(token_id)

                print("Top ", k, " Suggestions: ", top_k_id)
                next_token = seq[idx + 1]
                print("ground truth", next_token)

                if next_token in top_k_id:
                    sub_score += 1
                print("\n")
            score.append(sub_score/len(seq))
    print("accuracy for prediction for each code file")
    print(score)
