# Evaluator Module

In [1]:
#| default_exp evaluator

In [2]:
#| export
import CodeCheckList
import pandas as pd

import CodeCheckList.utils as utils
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker
from CodeCheckList.predictor import Predictor

import statistics

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#| export
class Evaluator:
    """Evaluator Module to perform all AST Evaluations.

    Builds a ``CodeTokenizer``, a ``Masker`` and a ``Predictor`` from the same
    checkpoint and uses them to score masked-code predictions for every node
    type listed in ``self.tokenizer.node_types``.
    """

    # Column order of the dataframe returned by ``__call__``. After the leading
    # 'ast_element' column it mirrors the five-slot entries produced by
    # ``evaluate_test_set``. The spelling 'occurences' is kept because callers
    # select and sort by that exact column name.
    _RESULT_COLUMNS = (
        'ast_element', 'occurences', 'jaccard_distance', 'sorensen_dice_distance',
        'jaccard_distance_avg', 'sorensen_dice_distance_avg',
    )

    def __init__(self, checkpoint: str, language, gpu_available=False):
        """Create the tokenizer, masker and predictor for ``checkpoint``.

        Args:
            checkpoint: Checkpoint name handed to ``CodeTokenizer.from_pretrained``
                and ``Predictor.from_pretrained``.
            language: Language handed to ``CodeTokenizer.from_pretrained``.
            gpu_available: Forwarded unchanged to ``Predictor.from_pretrained``.
        """
        self.tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)
        self.masker = Masker(self.tokenizer)
        self.predictor = Predictor.from_pretrained(checkpoint, self.tokenizer, gpu_available)

    def __call__(self, test_set, number_of_predictions: int):
        """Evaluate ``test_set`` and return the results as a ``pandas.DataFrame``.

        Args:
            test_set: Iterable of samples; each sample is indexed with
                ``'whole_func_string'`` by ``evaluate_test_set``.
            number_of_predictions: Number of predictions scored per masked snippet.

        Returns:
            A dataframe with one row per entry of ``self.tokenizer.node_types``
            (in that order) and the columns listed in ``_RESULT_COLUMNS``.
        """
        results = self.evaluate_test_set(test_set, number_of_predictions)
        # Build every row first and create the frame once, instead of growing
        # it one ``.loc`` assignment at a time.
        rows = [
            [node_type, *node_type_results]
            for node_type, node_type_results in zip(self.tokenizer.node_types, results)
        ]
        return pd.DataFrame(rows, columns=list(self._RESULT_COLUMNS))

    def evaluate_test_set(self, test_set, number_of_predictions: int):
        """Accumulate per-node-type scores over every sample of ``test_set``.

        Args:
            test_set: Iterable of samples supporting ``sample['whole_func_string']``.
            number_of_predictions: Number of predictions scored per masked snippet.

        Returns:
            A list with one entry per node type, each of the form
            ``[occurrences, jaccard, sorensen_dice, jaccard_avg, sorensen_dice_avg]``:
            ``occurrences`` is an int, ``jaccard`` and ``sorensen_dice`` hold one
            list of scores per prediction index, and the two ``*_avg`` slots hold
            one mean (rounded to 3 decimals) per prediction index. Averages stay
            ``0`` for node types that never produced a result.
        """
        results = [
            [
                0,                                            # occurrences
                [[] for _ in range(number_of_predictions)],   # jaccard per prediction
                [[] for _ in range(number_of_predictions)],   # sorensen_dice per prediction
                [0 for _ in range(number_of_predictions)],    # avg jaccard per prediction
                [0 for _ in range(number_of_predictions)],    # avg sorensen_dice per prediction
            ]
            for _ in self.tokenizer.node_types
        ]

        for sample in test_set:
            source_code = sample['whole_func_string']
            for node_type_idx, entry in enumerate(results):
                snippet_results = self.evaluate_node_type_on_snippet(
                    source_code, node_type_idx, number_of_predictions)
                if not snippet_results:
                    # The snippet has no node of this type: nothing to record.
                    continue
                # Every per-prediction triple carries the same occurrence count,
                # so it is read from the first one only.
                entry[0] += snippet_results[0][0]
                for prediction_idx in range(number_of_predictions):
                    entry[1][prediction_idx].append(snippet_results[prediction_idx][1])
                    entry[2][prediction_idx].append(snippet_results[prediction_idx][2])

        # Means are computed once, after all samples were accumulated, rather
        # than being recomputed over the growing lists after every sample.
        for entry in results:
            for prediction_idx in range(number_of_predictions):
                jaccard_scores = entry[1][prediction_idx]
                if not jaccard_scores:
                    continue
                entry[3][prediction_idx] = round(statistics.mean(jaccard_scores), 3)
                entry[4][prediction_idx] = round(statistics.mean(entry[2][prediction_idx]), 3)
        return results

    def evaluate_node_type_on_snippet(self, source_code: str, target_node_type_idx: int, number_of_predictions: int):
        """Mask one node type in ``source_code`` and score each prediction.

        Args:
            source_code: Snippet to parse, mask and predict.
            target_node_type_idx: Index into ``self.tokenizer.node_types`` of the
                node type to mask.
            number_of_predictions: Number of predictions to read from the predictor.

        Returns:
            An empty list when the snippet contains no node of the target type;
            otherwise one ``[occurrences, jaccard, sorensen_dice]`` triple per
            prediction, where ``occurrences`` is the number of matching nodes
            found in ``source_code``.
        """
        source_code_tree = self.tokenizer.parser.parse(bytes(source_code, "utf8")).root_node
        source_code_nodes = []
        utils.find_nodes(source_code_tree, self.tokenizer.node_types[target_node_type_idx], source_code_nodes)
        if not source_code_nodes:
            return []

        masked_code_encoding = self.masker(source_code, self.tokenizer(source_code), target_node_type_idx)
        predictions = self.predictor(
            masked_code_encoding,
            self.tokenizer.tokenizer(source_code, return_tensors='pt'),
            number_of_predictions,
        )

        occurrences = len(source_code_nodes)
        # The source tree is the same for every prediction, so its node-type
        # set is computed once instead of once per prediction.
        source_code_types = utils.get_node_type_set(source_code_tree)

        results = []
        for prediction_number in range(number_of_predictions):
            predicted_code = predictions[prediction_number]
            # Both scores stay at 0 when the prediction is not a balanced
            # snippet. Original author's note: the closer to 1, the better.
            jaccard_distance = 0
            sorensen_dice_distance = 0
            if utils.is_balanced_snippet(predicted_code):
                predicted_code_tree = self.tokenizer.parser.parse(bytes(predicted_code, "utf8")).root_node
                predicted_code_types = utils.get_node_type_set(predicted_code_tree)

                jaccard_distance = utils.calculate_jaccard_distance(predicted_code_types, source_code_types)
                sorensen_dice_distance = utils.calculate_sorensen_dice_distance(predicted_code_types, source_code_types)
            results.append([occurrences, jaccard_distance, sorensen_dice_distance])
        return results


## Full Pipeline

### Download Grammar

In [4]:
from CodeCheckList import loader

"""define language"""
# Language identifier; reused later when building the tokenizer and the Evaluator.
python_language = "python"

# download_grammars is given a list, so the single language is wrapped in one.
languages = [python_language]

loader.download_grammars(languages)

/home/svelascodimate/miniconda3/envs/code-check-list/lib/python3.10/site-packages/CodeCheckList/grammars


### Load Model

In [5]:

"""define the model checkpoint"""
# Checkpoint name passed to CodeTokenizer.from_pretrained and Predictor.from_pretrained below.
checkpoint = "huggingface/CodeBERTa-small-v1"

### Create Modules

In [6]:
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker

# Code tokenizer built from the checkpoint for the selected language.
bert_tokenizer = CodeTokenizer.from_pretrained(checkpoint, python_language)

# Code masker built on top of that tokenizer.
code_masker = Masker(bert_tokenizer)

### Node Types

In [7]:
# Show the node types known to the tokenizer; positions in this list are the node-type indexes used for masking.
print(bert_tokenizer.node_types)

[':', 'ERROR', 'print_statement', 'augmented_assignment', 'named_expression', 'chevron', 'else_clause', 'for_statement', '|=', 'block', '}', 'identifier', '*=', 'type_conversion', 'continue', 'list_splat', 'try', 'concatenated_string', 'and', 'print', '/=', 'break', 'list_splat_pattern', 'import_statement', 'class', 'parameters', 'not_operator', 'pattern_list', 'format_specifier', 'is', '~', '>>', 'argument_list', 'string', 'else', 'if_clause', 'list', 'pattern', 'interpolation', 'delete_statement', '->', 'primary_expression', 'wildcard_import', 'case_clause', '+', '<', '=', 'none', 'dictionary_comprehension', 'continue_statement', '>>=', 'nonlocal', '>', 'await', 'assignment', 'set', '"', '/', 'keyword_argument', '%=', 'float', 'from', 'subscript', 'case', 'comment', 'assert_statement', 'positional_separator', '-', 'dictionary_splat_pattern', '^=', 'format_expression', 'expression_list', 'ellipsis', 'escape_sequence', ')', 'aliased_import', 'exec_statement', 'elif', 'case_pattern', 'p

### Encodings

In [8]:
"""example source code"""

# Snippet used by the following cells; the commented-out assignments below are alternative inputs.
code = "def multiply_numbers(a,b):\n    return a*b"
#code = "def scale(self, center=True, scale=True):\n        \"\"\"\nthe the\n\n\n                                                                                                                                                          _\n                     ____________=_=_===========________===______________________________==_____________________\n_______\n____\n\n___\n\n\n\n\n\n\n\n\n        return return)"
#code = "def hello_world(a,b):\n    print('hello world')"
#code = "def __ordered_values_by_indexes(self, data, inds): \"\"\" Return values (intensities) by indexes. Used for multiscale graph cut. data = [[0 1 1], [0 2 2], [0 2 2]] inds = [[0 1 2], [3 4 4], [5 4 4]] return: [0, 1, 1, 0, 2, 0] If the data are not consistent, it will take the maximal value \"\"\" # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
#code = "def __ordered_values_by_indexes(self, data, inds):  # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
# Node type to mask in the example snippet.
target_node_type = "*"

# Plain encoding of the snippet, kept to compare its length with the masked one.
source_code_encoding = bert_tokenizer(code)

# Encoding with the target node type masked (the snippet is tokenized again for the masker).
masked_code_encoding = code_masker(
    code,
    bert_tokenizer(code),
    bert_tokenizer.node_types.index(target_node_type),
)

# The masked encoding must have as many input ids as the plain one.
assert len(source_code_encoding['input_ids']) == len(masked_code_encoding['input_ids'])

# Decode the masked ids back to text, leaving out the BOS/EOS token ids.
special_token_ids = (bert_tokenizer.tokenizer.bos_token_id, bert_tokenizer.tokenizer.eos_token_id)
masked_code = bert_tokenizer.tokenizer.decode(
    [token_id for token_id in masked_code_encoding['input_ids'] if token_id not in special_token_ids]
)

print(masked_code)

def multiply_numbers(a,b):
    return a<mask>b


### Code Prediction

In [9]:
from CodeCheckList.predictor import Predictor

# Predictor loaded from the same checkpoint, sharing the tokenizer created earlier.
predictor = Predictor.from_pretrained(checkpoint, bert_tokenizer)
# The last argument (5) is the number of predictions requested; Evaluator passes its number_of_predictions in the same position.
predictions = predictor(masked_code_encoding, bert_tokenizer.tokenizer(code, return_tensors='pt'), 5)

### Evaluation

In [10]:
import CodeCheckList.utils as utils

# Index of the prediction to inspect (0 is the first one returned by the predictor).
prediction_number = 0
print('------------- CODE -------------')
print(code)
print('\n---------- MASKED -------------')
print(masked_code)
print('\n--------- PREDICTED -----------')
predicted_code = predictions[prediction_number]
print(predicted_code)
print('\n--------- AST COMPARE -----------')
# find_nodes fills the list passed as its last argument with the matching nodes.
filtered_nodes = []
filtered_nodes_predict = []
# node_types[node_types.index(target_node_type)] evaluates to target_node_type itself when it is
# present in the list, and raises ValueError otherwise.
utils.find_nodes(bert_tokenizer.parser.parse(bytes(code, "utf8")).root_node, bert_tokenizer.node_types[bert_tokenizer.node_types.index(target_node_type)], filtered_nodes)
utils.find_nodes(bert_tokenizer.parser.parse(bytes(predicted_code, "utf8")).root_node, bert_tokenizer.node_types[bert_tokenizer.node_types.index(target_node_type)], filtered_nodes_predict)
# Number of target-type nodes in the original snippet, then in the predicted one.
print(len(filtered_nodes))
print(len(filtered_nodes_predict))
# Idea noted by the author: base the evaluation on comparing these two sizes.

------------- CODE -------------
def multiply_numbers(a,b):
    return a*b

---------- MASKED -------------
def multiply_numbers(a,b):
    return a<mask>b

--------- PREDICTED -----------
def multiply_numbers(a,b):
    return a,b

--------- AST COMPARE -----------
1
0


## Testing

In [11]:
from datasets import load_dataset 
import CodeCheckList.utils as utils


evaluator = Evaluator(checkpoint, python_language)

# Maximum single-sentence length reported by the tokenizer, printed for reference.
max_token_number = bert_tokenizer.tokenizer.max_len_single_sentence
print(max_token_number)

# Load the dataset once. An earlier load-and-filter pass whose result was
# overwritten before being used has been removed: it doubled the run time
# without affecting test_set.
test_set = utils.get_random_sub_set_test_set(
    utils.get_test_sets(
        load_dataset("code_search_net", split='test'),
        python_language,
        evaluator.tokenizer.tokenizer.max_len_single_sentence,
        evaluator.tokenizer,
    ),
    2,
)

len(test_set)

510


No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
  0%|          | 0/101 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 101/101 [00:25<00:00,  3.96ba/s]
No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
  0%|          | 0/101 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 101/101 [00:26<00:00,

2

In [12]:
# Number of predictions scored per masked snippet.
number_of_predictions = 1
# Same checkpoint and language values as assigned in the earlier cells.
checkpoint = "huggingface/CodeBERTa-small-v1"
python_language = "python"

evaluator = Evaluator(checkpoint, python_language)

# Calling the evaluator returns a dataframe with one row per node type.
results_dataframe = evaluator(test_set, number_of_predictions)

# 'occurences' is the exact column name produced by Evaluator.__call__; most frequent node types come first.
results_dataframe.sort_values(by=['occurences'], ascending=False)


Unnamed: 0,ast_element,occurences,jaccard_distance,sorensen_dice_distance,jaccard_distance_avg,sorensen_dice_distance_avg
11,identifier,70,"[[0.09090909090909091, 0]]","[[0.9523809523809523, 0]]",[0.045],[0.476]
124,.,21,"[[0.0, 0]]","[[1.0, 0]]",[0.0],[0.5]
182,",",18,"[[0.0, 0]]","[[1.0, 0]]",[0.0],[0.5]
155,attribute,16,"[[0.0, 0]]","[[1.0, 0]]",[0.0],[0.5]
74,),15,"[[0.0, 0]]","[[1.0, 0]]",[0.0],[0.5]
...,...,...,...,...,...,...
77,elif,0,[[]],[[]],[0],[0]
78,case_pattern,0,[[]],[[]],[0],[0]
79,parenthesized_list_splat,0,[[]],[[]],[0],[0]
81,assert,0,[[]],[[]],[0],[0]
