# Evaluator Module

In [1]:
#| default_exp evaluator

In [2]:
#| export
import CodeCheckList
import pandas as pd

import CodeCheckList.utils as utils
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker
from CodeCheckList.predictor import Predictor

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
#| export
class Evaluator:
    """Evaluator Module to perform all AST Evaluations"""
    def __init__(self, checkpoint: str, language):
        self.tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)
        self.masker = Masker(self.tokenizer)
        self.predictor = Predictor.from_pretrained(checkpoint, self.tokenizer)

    def __call__(self, test_set, number_of_predictions: int):
        results_dict = self.evaluate_test_set(test_set, number_of_predictions)
        results_dataframe = pd.DataFrame([], columns=['ast_element', 'occurences', 'successful_predictions', 'failed_predictions', 'total_predictions', 'success_average', 'failure_average'])
        for result_index, result in enumerate(results_dict):
            results_dataframe.loc[len(results_dataframe.index)] = [self.tokenizer.node_types[result_index] ,result[0], result[1], result[2], result[3], result[4], result[5]]
        return results_dataframe
    
    def evaluate_test_set(self, test_set, number_of_predictions: int):
        results_dict = []
        for node_type in self.tokenizer.node_types:
            results_dict.append([0, [0 for i in range(0,number_of_predictions)], 
                                [0 for i in range(0,number_of_predictions)], 
                                [0 for i in range(0,number_of_predictions)], 
                                [0 for i in range(0,number_of_predictions)],
                                [0 for i in range(0,number_of_predictions)]])
        for sample_index, sample in enumerate(test_set):
            print('-------------evaluating sample:'+str(sample_index)+'---------------------')
            sample_results = self.evaluate_code_snippet(sample['whole_func_string'], number_of_predictions)
            for element_index, element_result in enumerate(sample_results):
                element_result_values = list(element_result.values())[0]
                if len(element_result_values) > 0:
                    results_dict[element_index][0] += element_result_values[0][0]
                    for prediction_number_index in range(0, number_of_predictions):
                        results_dict[element_index][1][prediction_number_index]+= (1 if element_result_values[prediction_number_index][2] else 0)
                        results_dict[element_index][2][prediction_number_index]+= (0 if element_result_values[prediction_number_index][2] else 1)
                        results_dict[element_index][3][prediction_number_index]= results_dict[element_index][1][prediction_number_index] + results_dict[element_index][2][prediction_number_index]
                        results_dict[element_index][4][prediction_number_index]= results_dict[element_index][1][prediction_number_index]/results_dict[element_index][3][prediction_number_index]
                        results_dict[element_index][5][prediction_number_index]= results_dict[element_index][2][prediction_number_index]/results_dict[element_index][3][prediction_number_index]
        return results_dict
        
    def evaluate_code_snippet(self, source_code: str, number_of_predictions: int):
        evaluation_results = []
        for node_type_idx, node_type in enumerate(self.tokenizer.node_types):
            evaluation_results.append({node_type: self.evaluate_node_type_on_snippet(source_code, node_type_idx, number_of_predictions)})
        return evaluation_results
            
    def evaluate_node_type_on_snippet(self, source_code: str, target_node_type_idx: int, number_of_predictions: int):
        results=[]
        source_code_nodes = []
        utils.find_nodes(self.tokenizer.parser.parse(bytes(source_code, "utf8")).root_node, 
            self.tokenizer.node_types[target_node_type_idx], source_code_nodes)
        if len(source_code_nodes) == 0:
            return results
        #source_code_encoding = self.tokenizer(source_code)
        masked_code_encoding = self.masker(source_code, self.tokenizer(source_code), target_node_type_idx)
        #masked_code = self.tokenizer.tokenizer.decode(list(filter(lambda token_id: False if token_id == self.tokenizer.tokenizer.bos_token_id or 
        #    token_id == self.tokenizer.tokenizer.eos_token_id else True, masked_code_encoding['input_ids'])))
        predictions = self.predictor(masked_code_encoding, self.tokenizer.tokenizer(source_code, return_tensors='pt'), number_of_predictions)  
            
        for prediction_number in range(0, number_of_predictions):
            predicted_code = predictions[prediction_number]
            predicted_nodes = []
            ########## THIS PART IS IMPORTANT
            #print('-I-')
            #print("\""+predicted_code+"\"")
            if utils.is_balanced_snippet(predicted_code):
                utils.find_nodes(self.tokenizer.parser.parse(bytes(predicted_code, "utf8")).root_node, self.tokenizer.node_types[target_node_type_idx], predicted_nodes)
            results.append([len(source_code_nodes), len(predicted_nodes), len(predicted_nodes)>=len(source_code_nodes)])
            #print('-O-')
        return results

## Full Pipeline

### Download Grammar

In [None]:
from CodeCheckList import loader

"""define language"""
python_language = "python"

languages = [python_language]

loader.download_grammars(languages)

/home/svelascodimate/miniconda3/envs/code-check-list/lib/python3.10/site-packages/CodeCheckList/grammars


### Load Model

In [None]:

"""define the model checkpoint"""
checkpoint = "huggingface/CodeBERTa-small-v1"

### Create Modules

In [None]:
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker

#create code tokenizer 
bert_tokenizer = CodeTokenizer.from_pretrained(checkpoint, python_language)

#create code masker
code_masker = Masker(bert_tokenizer)

### Node Types

In [None]:
print(bert_tokenizer.node_types)

['comment', '==', 'future_import_statement', 'keyword_argument', 'aliased_import', 'finally_clause', 'binary_operator', '+=', 'continue_statement', 'attribute', 'expression', '*', 'integer', 'nonlocal_statement', '[', '%', 'type_conversion', '**', 'default_parameter', 'expression_statement', '_simple_statement', 'assignment', 'case_clause', 'conditional_expression', 'try_statement', 'del', 'tuple_pattern', 'false', 'string', 'list_pattern', 'raise_statement', 'typed_default_parameter', 'tuple', ')', '{{', '!=', '<<=', 'with_item', 'dictionary_comprehension', 'for_statement', '>', '>=', 'type', 'interpolation', 'parenthesized_expression', 'return_statement', 'positional_separator', '|', 'from', 'import_from_statement', 'parenthesized_list_splat', 'not', '^', 'list_splat_pattern', 'keyword_separator', '|=', 'global', 'pass', 'in', 'exec', '@', 'with', 'comparison_operator', 'break_statement', 'parameter', 'class_definition', 'pass_statement', 'while_statement', 'not_operator', 'return', 

### Encodings

In [None]:
"""example source code"""

code = "def multiply_numbers(a,b):\n    return a*b"
#code = "def scale(self, center=True, scale=True):\n        \"\"\"\nthe the\n\n\n                                                                                                                                                          _\n                     ____________=_=_===========________===______________________________==_____________________\n_______\n____\n\n___\n\n\n\n\n\n\n\n\n        return return)"
#code = "def hello_world(a,b):\n    print('hello world')"
#code = "def __ordered_values_by_indexes(self, data, inds): \"\"\" Return values (intensities) by indexes. Used for multiscale graph cut. data = [[0 1 1], [0 2 2], [0 2 2]] inds = [[0 1 2], [3 4 4], [5 4 4]] return: [0, 1, 1, 0, 2, 0] If the data are not consistent, it will take the maximal value \"\"\" # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
#code = "def __ordered_values_by_indexes(self, data, inds):  # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
target_node_type = "*"

#encoding 
source_code_encoding = bert_tokenizer(code)

#masking
masked_code_encoding = code_masker(code, bert_tokenizer(code), bert_tokenizer.node_types.index(target_node_type))

assert len(source_code_encoding['input_ids']) == len(masked_code_encoding['input_ids'])

#masked code
masked_code = bert_tokenizer.tokenizer.decode(list(filter(lambda token_id: False if token_id == bert_tokenizer.tokenizer.bos_token_id or 
            token_id == bert_tokenizer.tokenizer.eos_token_id else True, masked_code_encoding['input_ids'])))

print(masked_code)

def multiply_numbers(a,b):
    return a<mask>b


### Code Prediction

In [None]:
from CodeCheckList.predictor import Predictor

predictor = Predictor.from_pretrained(checkpoint, bert_tokenizer)
predictions = predictor(masked_code_encoding, bert_tokenizer.tokenizer(code, return_tensors='pt'), 5)

------------------Loading Model into GPU------------------


### Evaluation

In [None]:
import CodeCheckList.utils as utils

prediction_number = 0
print('------------- CODE -------------')
print(code)
print('\n---------- MASKED -------------')
print(masked_code)
print('\n--------- PREDICTED -----------')
predicted_code = predictions[prediction_number]
print(predicted_code)
print('\n--------- AST COMPARE -----------')
filtered_nodes = []
filtered_nodes_predict = []
utils.find_nodes(bert_tokenizer.parser.parse(bytes(code, "utf8")).root_node, bert_tokenizer.node_types[bert_tokenizer.node_types.index(target_node_type)], filtered_nodes)
utils.find_nodes(bert_tokenizer.parser.parse(bytes(predicted_code, "utf8")).root_node, bert_tokenizer.node_types[bert_tokenizer.node_types.index(target_node_type)], filtered_nodes_predict)
print(len(filtered_nodes))
print(len(filtered_nodes_predict))
#base the evaluation on size comparison

------------- CODE -------------
def multiply_numbers(a,b):
    return a*b

---------- MASKED -------------
def multiply_numbers(a,b):
    return a<mask>b

--------- PREDICTED -----------
def multiply_numbers(a,b):
    return a,b

--------- AST COMPARE -----------
1
0


## Testing

In [None]:
from datasets import load_dataset 
import CodeCheckList.utils as utils


evaluator = Evaluator(checkpoint, python_language)

max_token_number = bert_tokenizer.tokenizer.max_len_single_sentence
print(max_token_number)

test_set = load_dataset("code_search_net", split='test')
test_set = test_set.filter(lambda sample: True if sample['language']== python_language
            and len(sample['func_code_tokens']) <= max_token_number
            and len(bert_tokenizer.tokenizer(sample['whole_func_string'])['input_ids']) <= max_token_number else False, num_proc=1)

test_set = utils.get_random_sub_set_test_set(utils.get_test_sets(load_dataset("code_search_net", split='test'), "python", evaluator.tokenizer.tokenizer.max_len_single_sentence, evaluator.tokenizer), 2)

len(test_set)

------------------Loading Model into GPU------------------
510


No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
  0%|          | 0/101 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 101/101 [00:24<00:00,  4.06ba/s]
No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
  0%|          | 0/101 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 101/101 [00:24<00:00,

2

In [None]:
number_of_predictions = 2
checkpoint = "huggingface/CodeBERTa-small-v1"
python_language = "python"

evaluator = Evaluator(checkpoint, python_language)

results_dataframe = evaluator(test_set, number_of_predictions)

results_dataframe.sort_values(by=['occurences'], ascending=False)


------------------Loading Model into GPU------------------
-------------evaluating sample:0---------------------
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
-------------evaluating sample:1---------------------
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction
valid_prediction


Unnamed: 0,ast_element,occurences,successful_predictions,failed_predictions,total_predictions,success_average,failure_average
112,identifier,60,"[1, 2]","[1, 0]","[2, 2]","[0.5, 1.0]","[0.5, 0.0]"
158,(,21,"[2, 1]","[0, 1]","[2, 2]","[1.0, 0.5]","[0.0, 0.5]"
33,),21,"[2, 1]","[0, 1]","[2, 2]","[1.0, 0.5]","[0.0, 0.5]"
123,argument_list,19,"[0, 0]","[2, 2]","[2, 2]","[0.0, 0.0]","[1.0, 1.0]"
153,call,19,"[0, 0]","[2, 2]","[2, 2]","[0.0, 0.0]","[1.0, 1.0]"
...,...,...,...,...,...,...,...
74,slice,0,"[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
76,lambda_parameters,0,"[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
77,except,0,"[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
78,primary_expression,0,"[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
