In [24]:
import tensorflow as tf
# import tensorflow_addons as tfa
import keras
import pandas as pd
import pickle
import string
import numpy as np
from gensim.models import KeyedVectors as word2vec
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [4]:
vectors_text_path = '../data/target_vecs.txt'
all_funcs_data_path = '../data/bcb_funcs_all.tsv'
pairs_id_path = '../data/bcb_pair_ids.pkl'

In [5]:
embeddings_dim = 384
max_sequence_length = 32
output_dim = 6

In [6]:
all_funcs = pd.read_csv(all_funcs_data_path, delimiter="\t",header=None)
with open(pairs_id_path, 'rb') as f:
    pair_ids = pickle.load(f).to_numpy()

In [7]:
train_data, test_data = train_test_split(pair_ids, test_size=0.2, random_state=42, stratify=pair_ids[:,2]) 

In [8]:
code2vec = word2vec.load_word2vec_format(vectors_text_path, binary=False)

In [9]:
def preproces(s):
    s = s.lower()
    s = s.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    s = s.split(" ")
    final = [0 for _ in range(max_sequence_length)]
    counter = 0
    for word in s:
        word = word.strip()
        if len(word) > 0 and word in code2vec:
            final[counter] = code2vec.vocab[word].index
            counter += 1
        
        if counter >= max_sequence_length:
            break
    return final

In [10]:
functions = dict()
for index, row in all_funcs.iterrows():
    processed_function = preproces(row[1])
    functions[int(row[0])] = processed_function

In [11]:
def get_keras_dataset(data):
    x = [[],[]]
    y = []
    for id1,id2,label in data:
        try:
            x[0].append(functions[id1])
            x[1].append(functions[id2])
            y.append(label)
        except KeyError:
            continue
    x = np.array(x)
    print(x.shape)
    x = [x[0,:],x[1,:]]

    y = to_categorical(y)
    print(y.shape)
    
    return x, y

In [12]:
x,y  = get_keras_dataset(train_data)
test_x, test_y = get_keras_dataset(test_data)

(2, 78013, 32)
(78013, 6)
(2, 19502, 32)
(19502, 6)


In [13]:
from keras.layers import *
from keras.models import *

In [30]:
embedding_layer = code2vec.get_keras_embedding(train_embeddings=False)


lstm_layer = LSTM(embeddings_dim, dropout=0.3, recurrent_dropout=0.3)

in_1 = Input(shape=(max_sequence_length,), dtype='int32')
emb_1 = embedding_layer(in_1)
lstm_1 = lstm_layer(emb_1)

in_2 = Input(shape=(max_sequence_length,), dtype='int32')
emb_2 = embedding_layer(in_2)
lstm_2 = lstm_layer(emb_2)


merged = concatenate([lstm_1, lstm_2])
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)
merged = Dense(512, activation='relu')(merged)
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)
preds = Dense(output_dim, activation='softmax')(merged)

model = Model(inputs=[in_1, in_2], outputs=preds)

In [31]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam', 
              metrics=['acc',tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

In [32]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 32, 384)      117451392   input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 384)          1181184     embedding_2[0][0]          

In [38]:
model.fit(x,
          y,
          initial_epoch=20,
          epochs=25,
          validation_split=0.2,
          batch_size=64,
          shuffle=True)

Train on 62410 samples, validate on 15603 samples
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.callbacks.History at 0x7f575c2e4630>

In [39]:
model.evaluate(test_x, test_y)



[0.4632954556182059, 0.8619115948677063, 0.831502377986908, 0.8560125827789307]