# DeepFix: Deep learning for automatically fixing single-line syntax errors in C programs (using an encoder-decoder model approach)

Import required libraries ---

In [1]:
#from keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model 
from tensorflow.keras.layers import Input, LSTM, Dense , Bidirectional ,  Concatenate
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from operator import itemgetter
import pandas as pd
import pickle 
import tensorflow as tf
import numpy as np
import csv

from customfunction import text2sequence , sequence2tokenization , encoder_decoder_data


# Load the data & build the vocabulary on the training data ---

Dataset Description :

Each row of the dataset contains a buggy C program line ('sourceLineTokens') and a target C line ('targetLineTokens') that represents the fixed version of that line.

In [2]:
# Training split: one buggy line / fixed line pair per row.
train_csv_path = 'G:/IISc/2nd-semester/ASE with ML/Assignment/DeepFixLite/train.csv'
data = pd.read_csv(train_csv_path)

# Buggy source lines and their fixed counterparts, exactly as stored in the CSV.
input_texts, target_texts = data['sourceLineTokens'], data['targetLineTokens']

# text2sequence (from customfunction) yields the line sequences for both sides
# and token2count, whose values are later used to rank tokens by frequency.
input_lines, target_lines, token2count = text2sequence(input_texts, target_texts)

(1) Set parameters such as the sentence length and the vocabulary size (num_unique_tokens).

(2) Select the top 1000 most frequent tokens and build the vocabulary from only those tokens.

(3) Save the token/index dictionaries to disk as pickle files.

In [4]:
# --- Vocabulary parameters -------------------------------------------------
k = 1000                    # keep only the k most frequent tokens
sent_length = 70            # fixed sequence length handed to sequence2tokenization
num_unique_tokens = k + 4   # k vocabulary tokens + the 4 special tokens below
num_samples = len(input_lines)

# The k tokens with the highest counts, most frequent first.
token_with_count = dict(
    sorted(token2count.items(), key=itemgetter(1), reverse=True)[:k]
)

# Indices 0-3 are reserved for the special tokens; real tokens start at 4.
token2index = {"PAD": 0, "SOS": 1, "EOS": 2, "OOV": 3}
index2token = {0: "PAD", 1: "SOS", 2: "EOS", 3: "OOV"}

# Assign consecutive indices in frequency order.
# NOTE(review): a source token spelled exactly like a special token
# (e.g. "EOS") would overwrite that special token's index in token2index.
for token_index, token in enumerate(token_with_count, start=4):
    token2index[token] = token_index
    index2token[token_index] = token

# Next free index (same value the original manual counter ended on).
num_count = 4 + len(token_with_count)

# Persist both mappings. The context managers guarantee each file is closed
# even if pickling raises, and the second file is only created/truncated once
# the first dump has succeeded.
with open("token2index.pickle", "wb") as pickle_out1:
    pickle.dump(token2index, pickle_out1)

with open("index2token.pickle", "wb") as pickle_out2:
    pickle.dump(index2token, pickle_out2)

In [5]:
# Longest source / target line, measured after evaluating each stored line.
# NOTE(review): eval() executes whatever expression each string contains; the
# lines are presumably derived from the CSV, so this is only acceptable for
# trusted data. ast.literal_eval is the usual safer choice -- confirm the
# stored line format before switching.
max_encoder_seq_length = max(len(eval(line)) for line in input_lines) #154
max_decoder_seq_length = max(len(eval(line)) for line in target_lines) #169

# Report vocabulary and dataset sizes, one "label value" line each.
for label, value in (
    ("size of the original vocabulary =", len(token2count)),
    ("size of the updated vocabulary =", len(token_with_count)),
    ("max_encoder_seq_length =", max_encoder_seq_length),
    ("max_decoder_seq_length =", max_decoder_seq_length),
    ("number of samples =", num_samples),
):
    print(label, value)


size of the original vocabulary = 5205
size of the updated vocabulary = 1000
max_encoder_seq_length = 154
max_decoder_seq_length = 169
number of samples = 14643


# Generating fixed size sequences which can be given input to neural networks

(1) Tokenise input sequence 

(2) Add padding to make equal size sequence  

The above two steps are implemented in the sequence2tokenization function.

In [6]:
# Encode both sides of the training pairs with the same vocabulary and the
# same fixed length (source side first, then target side).
padded_input_texts, padded_target_texts = (
    sequence2tokenization(lines, token2index, sent_length)
    for lines in (input_lines, target_lines)
)

# Sanity check: show the first encoded source line.
print(padded_input_texts[0])

[ 1 21  4 82 48  8 42 11 17  5  2  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


# Encoder input data and decoder input data are the inputs given to the LSTM networks for encoding and decoding.

In [7]:
# Build the encoder-input, decoder-input and decoder-target arrays from the
# padded training sequences (helper defined in customfunction).
(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
) = encoder_decoder_data(
    padded_input_texts,
    padded_target_texts,
    num_samples,
    sent_length,
    num_unique_tokens,
)


# Preprocess the validation data (same steps as for the training data) for parameter tuning.

In [9]:
# Validation split, same column layout as the training CSV.
valid_csv_path = 'G:/IISc/2nd-semester/ASE with ML/Assignment/DeepFixLite/valid_complete.csv'
valid_data = pd.read_csv(valid_csv_path)

input_valid_texts, target_valid_texts = (
    valid_data['sourceLineTokens'],
    valid_data['targetLineTokens'],
)
num_valid_samples = len(input_valid_texts)

# Encode with the vocabulary built from the training data.
# NOTE(review): the training cells pass the output of text2sequence to
# sequence2tokenization, whereas the raw CSV columns are passed here --
# confirm the two are interchangeable.
padded_input_valid_texts, padded_target_valid_texts = (
    sequence2tokenization(texts, token2index, sent_length)
    for texts in (input_valid_texts, target_valid_texts)
)

# Same array construction as for the training set.
(
    encoder_input_valid_data,
    decoder_input_valid_data,
    decoder_target_valid_data,
) = encoder_decoder_data(
    padded_input_valid_texts,
    padded_target_valid_texts,
    num_valid_samples,
    sent_length,
    num_unique_tokens,
)



# Design Encoder-decoder model.

In [8]:
# Hyper-parameters for the encoder-decoder model.
# latent_dim: units per direction of the encoder LSTM (decoder uses latent_dim*2).
latent_dim=256
# batch_size: passed to model.fit in the training cell.
batch_size=64
# NOTE(review): the training cell passes epochs=20 as a literal, so this
# value is not the one model.fit actually uses.
epochs = 10
# emb_size: output dimension of both the encoder and the decoder Embedding.
emb_size = 50


# Encoder (training graph): token indices -> embedding -> bidirectional LSTM.
# NOTE(review): neither Embedding in this cell sets mask_zero, so PAD
# positions (index 0) appear to count toward the loss and 'acc' -- confirm
# this is intended.
encoder_inputs = Input(shape=(None,))
encoder_embedding=  Embedding(num_unique_tokens, emb_size)(encoder_inputs)
# return_state=True on the wrapped LSTM makes the Bidirectional layer return
# its output followed by the forward (h, c) and backward (h, c) states.
encoder_lstm = Bidirectional(LSTM(latent_dim, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c =encoder_lstm(encoder_embedding)
# Join forward and backward states so each has width 2*latent_dim, matching
# the decoder LSTM's unit count below. encoder_outputs itself is not used.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# Decoder (training graph): starts from the encoder's final states.
decoder_inputs = Input(shape=(None,))
# The decoder has its own Embedding layer (not shared with the encoder).
# The layer objects are kept in variables -- presumably so an inference graph
# can reuse them; TODO confirm.
decoder_embedding=  Embedding(num_unique_tokens, emb_size)
final_dex= decoder_embedding(decoder_inputs)
# return_sequences=True: one output per decoder timestep. The returned states
# are discarded here.
decoder_lstm = LSTM(latent_dim*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex,initial_state=encoder_states)
# Per-timestep softmax over the num_unique_tokens vocabulary entries.
decoder_dense = Dense(num_unique_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compile only (training is run in a later cell), then print the layer summary.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     50200       input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 512), (None, 628736      embedding[0][0]                  
______________________________________________________________________________________________

# Train the model.

In [None]:
# The model takes two inputs: [encoder sequences, decoder sequences].
train_inputs = [encoder_input_data, decoder_input_data]
valid_inputs = [encoder_input_valid_data, decoder_input_valid_data]

# Train against the held-out validation file rather than a split of the
# training data.
# NOTE(review): epochs is a literal here; the `epochs` variable set in the
# model cell is not used.
model.fit(
    train_inputs,
    decoder_target_data,
    batch_size=batch_size,
    epochs=20,
    validation_data=(valid_inputs, decoder_target_valid_data),
)

In [None]:
# Save the trained model to disk.
model_save_path = "G:/IISc/2nd-semester/ASE with ML/Assignment/DeepFixLite/BD_LSTM_model"
tf.keras.models.save_model(model, model_save_path)