In [1]:
# Install the packages listed in requirements.txt; `--user` targets the user site-packages directory.
%pip install -r requirements.txt --user

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 23.1.2
[notice] To update, run: C:\Users\mosas\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip


In [26]:
import pandas as pd
import pickle
import numpy as np
from math import log2

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model
from keras.utils.vis_utils import plot_model


In [27]:
# Reproducibility: pin every random number generator this notebook touches.
# Each stage could use its own seed; a single shared value is used here.
import os
import random

seed_value = 0

# Python's built-in generator and NumPy's global generator.
random.seed(seed_value)
np.random.seed(seed_value)

# TensorFlow's global seed.
# Alternative via the v1 compatibility API: tf.compat.v1.set_random_seed(seed_value)
tf.random.set_seed(seed_value)

# Export the hash seed through the environment.
# NOTE(review): hash randomization is normally fixed when the interpreter
# starts, so this presumably only affects child processes - confirm.
os.environ["PYTHONHASHSEED"] = f"{seed_value}"


In [28]:
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Draw one recorded metric against the epoch index and show the figure.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            sequence of values (presumably the object returned by
            ``model.fit`` - confirm against the caller).
        string: Key of the metric to plot; also used as the y-axis label.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.ylabel(string)
    plt.xlabel("Epochs")
    plt.show()

In [29]:
def top_3(y_true, y_pred):
    """Return the share of samples whose true label is among the first three candidates.

    Args:
        y_true: Sequence of ground-truth labels, aligned with ``y_pred``.
        y_pred: Sequence of ranked candidate lists, one list per sample.
            Lists shorter than three are allowed; anything past the third
            candidate is ignored.

    Returns:
        Number of hits divided by ``len(y_pred)``; ``0`` when ``y_pred`` is empty.
    """
    length = len(y_pred)
    if length == 0:
        return 0
    hits = 0
    for i, candidates in enumerate(y_pred):
        # Only ranks 1-3 count, and a sample contributes at most one hit even
        # if the same candidate is repeated, so the result stays within [0, 1].
        if any(candidate == y_true[i] for candidate in candidates[:3]):
            hits += 1
    return hits / length

In [30]:
def nDCG(y_true, y_pred):
    """Mean normalized discounted cumulative gain over the first three ranks.

    A candidate has gain 1 when it equals the sample's true label and 0
    otherwise; ranks missing from a short candidate list have gain 0. The
    ideal ordering is the sample's own gains sorted in descending order.
    Samples without any hit contribute 0.

    Args:
        y_true: Sequence of ground-truth labels, aligned with ``y_pred``.
        y_pred: Sequence of ranked candidate lists, one list per sample.

    Returns:
        The per-sample nDCG values averaged over ``len(y_pred)``; ``0`` when
        ``y_pred`` is empty.
    """
    n_samples = len(y_pred)
    if n_samples == 0:
        return 0

    # Binary gain vector (always three entries) for every sample.
    gain_rows = [
        [
            1 if rank < len(ranked) and ranked[rank] == y_true[idx] else 0
            for rank in range(3)
        ]
        for idx, ranked in enumerate(y_pred)
    ]

    total = 0
    for gains in gain_rows:
        best_order = sorted(gains, reverse=True)
        dcg = sum(gain / log2(rank + 2) for rank, gain in enumerate(gains))
        idcg = sum(gain / log2(rank + 2) for rank, gain in enumerate(best_order))
        if idcg != 0:
            total += dcg / idcg
    return total / n_samples

In [31]:
# Size variants iterated by the training/evaluation loop; each name is used in
# the file names of the saved tokenizer (<name>.pkl) and model (<name>.h5).
size = ["small","medium","big"]

In [32]:
# Train a next-token model per size variant and report top-1 / top-3 accuracy
# and nDCG on a held-out split.

# Separator characters in variable names; each one is replaced by a space
# before tokenization.
_SEPARATORS_TO_SPACE = str.maketrans({ch: " " for ch in "._,[]'\""})

for name in size:
    print("size: ",name)

    # --- Load and clean -----------------------------------------------------
    # The path follows the current size variant (it used to be hard-coded to
    # "small" even though the string was an f-string).
    data = pd.read_csv(f'code_completion_lib/methods/models/{name}/data.csv')

    # Clean on an explicit copy with a vectorized string operation instead of
    # mutating the array returned by `.values` in place.
    X = data[["varible_name"]].copy()
    X["varible_name"] = X["varible_name"].str.translate(_SEPARATORS_TO_SPACE)
    y = data[["method"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Each training text is "<cleaned variable name> <method>"; the last token
    # of every sequence is used as the label further down.
    s = pd.Series(data=X_train["varible_name"]+" " + y_train["method"])

    # --- Tokenize -----------------------------------------------------------
    # oov_token stands in for words that are not found in word_index.
    tokenizer = Tokenizer(oov_token='<oov>',split=" ", filters='!',lower=False)
    tokenizer.fit_on_texts(s)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = tokenizer.texts_to_sequences(s.tolist())

    # Left-pad every sequence to the longest one so the label stays in the
    # last column.
    max_sequence_len = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # Features are all tokens but the last; the label is the last token.
    X = input_sequences[:,:-1]
    labels = input_sequences[:,-1]
    y = to_categorical(labels, num_classes=total_words)

    # Save the tokenizer for the predict function; `with` closes the file.
    with open(f'{name}.pkl', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    # --- Model ----------------------------------------------------------------
    model = Sequential()
    model.add(Embedding(total_words, 200, input_length=max_sequence_len - 1))
    model.add(LSTM(500, return_sequences=True))
    model.add(LSTM(500))
    model.add(Dense(500, activation="relu"))
    model.add(Dense(total_words, activation="softmax"))
    # summary() prints on its own; wrapping it in print() also emitted "None".
    model.summary()

    # No validation data is passed to fit(), so 'loss' is the training loss.
    checkpoint = ModelCheckpoint(f"{name}.h5", monitor='loss', verbose=1,
                                 save_best_only=True, mode='auto')

    reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

    logdir='logsnextword1'
    tensorboard_Visualization = TensorBoard(log_dir=logdir)

    model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.01))

    model.fit(X, y, epochs=150, callbacks=[checkpoint, reduce, tensorboard_Visualization])

    # --- Evaluate ---------------------------------------------------------------
    # Reload the best checkpoint and the saved tokenizer (both written above by
    # this notebook, so unpickling here does not handle untrusted data).
    model = load_model(f'{name}.h5')
    with open(f'{name}.pkl', 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    y_true = y_test['method'].values.tolist()
    x_for_pred = X_test['varible_name'].values.tolist()

    # Encode and predict the whole test split in one batch instead of calling
    # model.predict once per sample.
    test_sequences = pad_sequences(tokenizer.texts_to_sequences(x_for_pred),
                                   maxlen=max_sequence_len-1, padding='pre')
    predict_x = model.predict(test_sequences, verbose=0)

    # Indices of the three most probable tokens per sample, ordered from most
    # to least probable. The previous equality-based scan returned them in
    # vocabulary-index order, so the "top-1" prediction was not the argmax.
    top3_indices = np.argsort(predict_x, axis=1)[:, ::-1][:, :3]

    # Index 0 has no entry in index_word; it maps to "" so that the remaining
    # candidates keep their rank.
    y_pred_top3 = [
        [tokenizer.index_word.get(int(token_index), "") for token_index in row]
        for row in top3_indices
    ]
    y_pred_top1 = [ranked[0] for ranked in y_pred_top3]

    acc_top1 = accuracy_score(y_true, y_pred_top1)
    acc_top3 = top_3(y_true, y_pred_top3)
    ndcg = nDCG(y_true, y_pred_top3)

    print(f"top 1 accuracy: {acc_top1}")
    print(f"top 3 accuracy: {acc_top3}")
    print(f"ndcg: {ndcg}\n")
    print("-----------------------------------------------------------------------------------------------------------")
    # NOTE(review): only the first size variant is processed because of this
    # break; remove it to train and evaluate every entry of `size`.
    break

size:  small
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 8, 200)            195000    
                                                                 
 bidirectional (Bidirectiona  (None, 1000)             2804000   
 l)                                                              
                                                                 
 dense_18 (Dense)            (None, 500)               500500    
                                                                 
 dense_19 (Dense)            (None, 975)               488475    
                                                                 
Total params: 3,987,975
Trainable params: 3,987,975
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/150
Epoch 1: loss improved from inf to 5.89585, saving model to small.h5
Epoch 2/150