In [5]:
%pip install -r requirements.txt --user

^C
Note: you may need to restart the kernel to use updated packages.


In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
from code_completion_lib.imports.imports import Imports
from code_completion_lib.parse_notebooks import Parser
from code_completion_lib.methods.find_methods_in_code import Methods
from code_completion_lib.logger.logger import Logger
from code_completion_lib.code_completion import CodeCompletion
import os
from math import log2

In [2]:
# Create the logger that the following cells pass to the library classes via `logger=logger`.
# NOTE(review): mode="a" presumably opens the log target in append mode — confirm in Logger.
logger = Logger(__name__, mode="a")
# Write a dashed separator line to the log.
logger.info("-----------------------------------------------------")

In [3]:
# Dataset size labels; the loops below join each label onto `path` and pass it as `size=`.
size = ["small", "medium", "big"]
# Raw string literal: "\d" is not a valid escape sequence, so a plain string
# literal here triggers a SyntaxWarning (DeprecationWarning before Python 3.12).
# The resulting value is unchanged.
# NOTE(review): absolute, machine-specific location — consider making it configurable.
path = r"C:\data\data_parsed"

In [4]:
# Parse the notebooks under C:\data\notebooks into `path`, then run the
# parser's language check.
try:
    # `path` holds the same output directory that used to be repeated here as a literal.
    parcer = Parser(r'C:\data\notebooks', path, logger=logger)
    parcer.parse()
    parcer.check_language()
except Exception as error:
    # Best-effort step: record what went wrong instead of a bare "Exception".
    # KeyboardInterrupt is not an Exception subclass, so interrupting the
    # kernel still stops the cell.
    logger.error(f"Exception while parsing notebooks: {error!r}")


KeyboardInterrupt: 

In [None]:
# Run the import-processing step once per dataset size.
for name in size:
    try:
        imports = Imports(os.path.join(path, name), size=name, logger=logger)
        imports.process()
    except Exception as error:
        # Best-effort: log which size failed and why, then move on to the next size.
        logger.error(f"Exception while processing imports for size {name!r}: {error!r}")

In [None]:
# Run the import clusterization once per dataset size.
for name in size:
    try:
        completion = CodeCompletion(size=name, logger=logger)
        completion.import_clusterization()
    except Exception as error:
        # Best-effort: log which size failed and why, then move on to the next size.
        logger.error(f"Exception while clustering imports for size {name!r}: {error!r}")

In [4]:
# Run the method search for the data of a dataset size, handing it the
# matching model directory.
for name in size:
    model_path = rf'code_completion_lib\models\{name}'
    try:
        methods = Methods(os.path.join(path, name), size=name, logger=logger)
        methods.find_methods(model_path)
    except Exception as error:
        # Best-effort: log which size failed and why.
        logger.error(f"Exception while finding methods for size {name!r}: {error!r}")
    # NOTE(review): this unconditional break means only the first entry of
    # `size` is processed — remove it to run every size.
    break

KeyboardInterrupt: 

In [7]:
def top_3(y_true, y_pred, k=3):
    """Return the fraction of samples whose true label is among the first k predictions.

    Args:
        y_true: sequence of true labels, indexable by sample position.
        y_pred: sequence of ranked prediction sequences, one per sample.
            A sample may have fewer than ``k`` predictions.
        k: number of leading predictions to inspect (defaults to 3).

    Returns:
        ``hits / len(y_pred)`` as a float, or ``0`` when ``y_pred`` is empty.
    """
    length = len(y_pred)
    if length == 0:
        return 0
    hits = 0
    for i, candidates in enumerate(y_pred):
        truth = y_true[i]
        # Look only at the first k ranks and count each sample at most once,
        # so repeated predictions cannot push the score above 1 and entries
        # beyond rank k never count as a hit.
        if any(candidate == truth for candidate in candidates[:k]):
            hits += 1
    return hits / length

In [8]:
def nDCG(y_true, y_pred):
    """Return the mean normalised DCG over the first three ranks of each prediction list.

    Relevance is binary: a rank scores 1 when its prediction equals the
    sample's true label and 0 otherwise. Lists shorter than three entries are
    padded with zeros. A sample without any hit contributes 0 to the mean.

    Args:
        y_true: sequence of true labels, indexable by sample position.
        y_pred: sequence of ranked prediction sequences, one per sample.

    Returns:
        The average of DCG / ideal-DCG over all samples, or ``0`` when
        ``y_pred`` is empty.
    """
    sample_count = len(y_pred)
    if sample_count == 0:
        return 0

    # One row of three 0/1 gains per sample.
    relevance_rows = [
        [
            1 if rank < len(candidates) and candidates[rank] == y_true[row] else 0
            for rank in range(3)
        ]
        for row, candidates in enumerate(y_pred)
    ]

    total = 0
    for gains in relevance_rows:
        # The ideal ordering puts every hit first.
        best_order = sorted(gains, reverse=True)
        dcg = 0
        idcg = 0
        for rank, (gain, best_gain) in enumerate(zip(gains, best_order)):
            discount = log2(rank + 2)
            dcg += gain / discount
            idcg += best_gain / discount
        if idcg != 0:
            total += dcg / idcg
    return total / len(relevance_rows)

In [7]:
# Cut the "small" CSV into consecutive 70% / 20% / 10% slices (no shuffling)
# and write them out as the train, validation and test files.
dataset = pd.read_csv(r'code_completion_lib\methods\models\small\data_test.csv')
length = len(dataset)
train_end, valid_end = int(0.7 * length), int(0.9 * length)
df1 = dataset.iloc[:train_end]
df2 = dataset.iloc[train_end:valid_end]
df3 = dataset.iloc[valid_end:]
for frame, target in (
    (df1, r'code_completion_lib\data_train.csv'),
    (df2, r'code_completion_lib\data_valid.csv'),
    (df3, r'code_completion_lib\data_test.csv'),
):
    frame.to_csv(target, index=False)

In [9]:
import time  # NOTE(review): not used in this cell; kept in case other cells rely on it
# Evaluate every clustering-model column of a dataset size: build a
# CodeCompletion from the training split, then print top-1 accuracy, top-3
# accuracy and nDCG for the cluster-aware and the default completion, followed
# by the same metrics per cluster.
for name in size:
    print(f"size: {name}")
    dataset = pd.read_csv(rf'code_completion_lib\methods\models\{name}\data_test.csv')
    # Keep only the text before the first "(" in the second column
    # (presumably the `method` column — confirm against the CSV header).
    # Assign the column explicitly: writing through the rows of
    # `dataset.values` only changes the frame when pandas happens to return a
    # writable view, and is otherwise lost or rejected.
    second_column = dataset.columns[1]
    dataset[second_column] = dataset[second_column].map(lambda text: text.split("(", 1)[0])
    # Every column from the third onwards is treated as one model's cluster labels.
    models = dataset.keys()[2:]
    for model in models:
        completion_with_clusters = []
        print(model)
        try:
            X = dataset[["varible_name", model]]
            y = dataset[["method"]]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
            completion = CodeCompletion(size=name, logger=logger)
            completion.relations_variable_with_method(X_train, y_train, model=model)
            completion.relations_cluster_with_variable(X_train, y_train, model=model)
            completion.default_task(X_train, y_train)

            y_pred_top1 = []
            y_pred_top3 = []
            default_y_pred_top1 = []
            default_y_pred_top3 = []
            y_true = y_test['method'].values.tolist()
            # Materialise the test rows once instead of rebuilding the array
            # on every iteration; each row is (variable name, cluster).
            for variable_name, cluster in X_test.values:
                f_completion = completion.get_function_completion(model=model, variable_name=variable_name, cluster=cluster, number=3)
                # Baseline completion that is not given the cluster.
                default_f_completion = completion.get_default_function_completion(variable_name=variable_name, number=3)

                completion_with_clusters.append([cluster, f_completion])

                y_pred_top1.append(f_completion[0])
                y_pred_top3.append(f_completion)
                default_y_pred_top1.append(default_f_completion[0])
                default_y_pred_top3.append(default_f_completion)

            acc_top1 = accuracy_score(y_true, y_pred_top1)
            acc_top3 = top_3(y_true, y_pred_top3)
            ndcg = nDCG(y_true, y_pred_top3)

            default_acc_top1 = accuracy_score(y_true, default_y_pred_top1)
            default_acc_top3 = top_3(y_true, default_y_pred_top3)
            default_ndcg = nDCG(y_true, default_y_pred_top3)

            print(f"top 1 accuracy: {acc_top1}")
            print(f"top 3 accuracy: {acc_top3}")
            print(f"ndcg: {ndcg}\n")

            print(f"default top 1 accuracy: {default_acc_top1}")
            print(f"default top 3 accuracy: {default_acc_top3}")
            print(f"default ndcg: {default_ndcg}\n")

            # Group predictions and true labels by the cluster of each test
            # sample, keeping clusters in order of first appearance.
            clusters = {}
            true_cl = {}
            clusters_top3 = {}
            for index, (cluster, _) in enumerate(completion_with_clusters):
                clusters.setdefault(cluster, []).append(y_pred_top1[index])
                true_cl.setdefault(cluster, []).append(y_true[index])
                clusters_top3.setdefault(cluster, []).append(y_pred_top3[index])

            for key in clusters.keys():
                print(f"    {key}:")
                acc_top1 = accuracy_score(true_cl[key], clusters[key])
                acc_top3 = top_3(true_cl[key], clusters_top3[key])
                ndcg = nDCG(true_cl[key], clusters_top3[key])
                # Share of the test samples that fall into this cluster, in percent.
                cluster_size = len(true_cl[key])/len(y_true)*100
                print('        size: %.2f ' % cluster_size)
                print(f"        top 1 accuracy: {acc_top1}")
                print(f"        top 3 accuracy: {acc_top3}")
                print(f"        ndcg: {ndcg}\n")

        except Exception as error:
            # Best-effort per model: log which model failed and why, then
            # continue with the next model column.
            logger.error(f"Exception while evaluating model {model!r} for size {name!r}: {error!r}")
    # NOTE(review): this unconditional break means only the first entry of
    # `size` is evaluated — remove it to evaluate every size.
    break


size: small
AffinityPropagation(damping=0.999, random_state=0)
top 1 accuracy: 0.5
top 3 accuracy: 0.5
ndcg: 0.5

default top 1 accuracy: 0.5
default top 3 accuracy: 0.5
default ndcg: 0.5

    CLUSTER_30:
        size: 50.00 
        top 1 accuracy: 0.0
        top 3 accuracy: 0.0
        ndcg: 0.0

    CLUSTER_28:
        size: 50.00 
        top 1 accuracy: 1.0
        top 3 accuracy: 1.0
        ndcg: 1.0

AgglomerativeClustering(metric='euclidean', n_clusters=13)
top 1 accuracy: 0.5
top 3 accuracy: 0.5
ndcg: 0.5

default top 1 accuracy: 0.5
default top 3 accuracy: 0.5
default ndcg: 0.5

    CLUSTER_5:
        size: 50.00 
        top 1 accuracy: 0.0
        top 3 accuracy: 0.0
        ndcg: 0.0

    CLUSTER_12:
        size: 50.00 
        top 1 accuracy: 1.0
        top 3 accuracy: 1.0
        ndcg: 1.0

KMeans(n_clusters=13, n_init='auto', random_state=0)
top 1 accuracy: 0.5
top 3 accuracy: 0.5
ndcg: 0.5

default top 1 accuracy: 0.5
default top 3 accuracy: 0.5
default ndcg: 0.5

