# Evaluator Module

In [1]:
#| default_exp evaluator

In [2]:
#| export
import os
import pandas as pd

import CodeCheckList.utils as utils
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker
from CodeCheckList.predictor import Predictor
from CodeCheckList.judge import Judge

import statistics
import textdistance

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
class Evaluator:
    """Evaluator Module to perform all AST Evaluations

    Combines a ``CodeTokenizer``, ``Masker``, ``Predictor`` and ``Judge`` built
    from the same checkpoint. For every node type known to the tokenizer it
    masks a snippet, asks the predictor for candidate completions and collects
    the three scores the judge returns for each candidate.

    One *node-type record* is a 7-element list::

        [occurrences,
         jaccard scores per prediction,        # list of lists
         sorensen_dice scores per prediction,  # list of lists
         levenshtein scores per prediction,    # list of lists
         jaccard average per prediction,
         sorensen_dice average per prediction,
         levenshtein average per prediction]
    """

    # Slots of a node-type record holding the per-prediction score lists
    # (slot 0 is the occurrence counter).
    _SCORE_SLOTS = (1, 2, 3)
    # Distance from a score slot to the slot holding its rounded average.
    _AVERAGE_OFFSET = 3

    def __init__(self, checkpoint: str, language, gpu_available=False):
        """Build the tokenizer, masker, predictor and judge for ``checkpoint``.

        Also sets the ``TOKENIZERS_PARALLELISM`` environment variable to
        ``"true"`` for the current process.
        """
        os.environ["TOKENIZERS_PARALLELISM"] = "true"
        self.tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)
        self.masker = Masker(self.tokenizer)
        self.predictor = Predictor.from_pretrained(checkpoint, self.tokenizer, gpu_available)
        self.judge = Judge(self.tokenizer)

    def __call__(self, test_set, number_of_predictions: int, masking_rate: float):
        """Evaluate ``test_set`` and return the results as a DataFrame.

        The frame has one row per entry of ``self.tokenizer.node_types``, in
        that order. Score lists are converted to (nested) tuples so every cell
        holds an immutable value.
        """
        node_type_records = self.evaluate_test_set(test_set, number_of_predictions, masking_rate)
        results_dataframe = pd.DataFrame([], columns=[
            'ast_element', 'occurences', 'jaccard', 'sorensen_dice', 'levenshtein',
            'jaccard_avg', 'sorensen_dice_avg', 'levenshtein_avg'])
        for node_type, record in zip(self.tokenizer.node_types, node_type_records):
            (occurrences, jaccard, sorensen_dice, levenshtein,
             jaccard_avg, sorensen_dice_avg, levenshtein_avg) = record
            results_dataframe.loc[len(results_dataframe.index)] = [
                node_type,
                occurrences,
                tuple(tuple(scores) for scores in jaccard),
                tuple(tuple(scores) for scores in sorensen_dice),
                tuple(tuple(scores) for scores in levenshtein),
                tuple(jaccard_avg),
                tuple(sorensen_dice_avg),
                tuple(levenshtein_avg),
            ]
        return results_dataframe

    def evaluate_test_set(self, test_set, number_of_predictions: int, masking_rate: float):
        """Accumulate scores for every node type over every sample.

        Each sample must expose its source under the ``'whole_func_string'``
        key. Returns a list of node-type records (see the class docstring),
        one per entry of ``self.tokenizer.node_types``. Averages are rounded to
        three decimals and stay ``0`` where no score was collected.
        """
        results = [
            [
                0,                                          # occurrences
                [[] for _ in range(number_of_predictions)],  # jaccard per prediction
                [[] for _ in range(number_of_predictions)],  # sorensen_dice per prediction
                [[] for _ in range(number_of_predictions)],  # levenshtein per prediction
                [0] * number_of_predictions,                 # avg jaccard per prediction
                [0] * number_of_predictions,                 # avg sorensen_dice per prediction
                [0] * number_of_predictions,                 # avg levenshtein per prediction
            ]
            for _ in self.tokenizer.node_types
        ]

        for sample_index, sample in enumerate(test_set):
            print(f'-------- evaluating sample:{sample_index} --------')
            for node_type_idx, record in enumerate(results):
                node_type_results = self.evaluate_node_type_on_snippet(
                    sample['whole_func_string'], node_type_idx, number_of_predictions, masking_rate)
                if len(node_type_results) == 0:
                    # Node type does not occur in this sample.
                    continue
                # Every prediction row carries the same occurrence count.
                record[0] += node_type_results[0][0]
                for prediction_idx in range(number_of_predictions):
                    prediction_row = node_type_results[prediction_idx]
                    for slot in self._SCORE_SLOTS:
                        score = prediction_row[slot]
                        # A missing score is skipped, not counted as zero.
                        if score is not None:
                            record[slot][prediction_idx].append(score)

        # Average once at the end instead of after every append: the final
        # values are the same, without re-scanning the growing lists per sample.
        for record in results:
            for slot in self._SCORE_SLOTS:
                averages = record[slot + self._AVERAGE_OFFSET]
                for prediction_idx, scores in enumerate(record[slot]):
                    if scores:
                        averages[prediction_idx] = round(statistics.mean(scores), 3)
        return results

    def evaluate_node_type_on_snippet(self, source_code: str, target_node_type_idx: int, number_of_predictions: int, masking_rate: float):
        """Score the predictions for one node type on one snippet.

        Returns an empty list when the node type does not occur in
        ``source_code``. Otherwise returns ``number_of_predictions`` rows of
        ``[occurrences, score_0, score_1, score_2]`` where the scores are the
        first three values returned by ``self.judge`` (stored by the caller as
        jaccard, sorensen_dice and levenshtein).
        """
        source_code_tree = self.tokenizer.parser.parse(bytes(source_code, "utf8"))
        source_code_nodes = []
        utils.find_nodes(source_code_tree.root_node, self.tokenizer.node_types[target_node_type_idx], source_code_nodes)
        if not source_code_nodes:
            return []

        masked_code_encoding = self.masker(source_code, self.tokenizer(source_code), target_node_type_idx, masking_rate)
        predictions = self.predictor(masked_code_encoding, self.tokenizer.tokenizer(source_code, return_tensors='pt'), number_of_predictions)

        occurrences = len(source_code_nodes)
        results = []
        for prediction_number in range(number_of_predictions):
            scores = self.judge(source_code, predictions[prediction_number])
            results.append([occurrences, scores[0], scores[1], scores[2]])
        return results


## Full Pipeline

### Download Grammar

In [4]:
#|eval: false
# Grammar loader shipped with the CodeCheckList package.
from CodeCheckList import loader

"""define language"""
# Language name reused by later cells when building tokenizers and filtering the test data.
python_language = "python"

# download_grammars is given a list, so the single language is wrapped in one.
languages = [python_language]

# NOTE(review): presumably fetches the grammar files for each listed language — confirm in loader.
loader.download_grammars(languages)

/home/svelascodimate/miniconda3/envs/code-check-list/lib/python3.10/site-packages/CodeCheckList/grammars


### Load Model

In [5]:
#|eval: false
"""define the model checkpoint"""
# Checkpoint identifier handed to CodeTokenizer.from_pretrained and Predictor.from_pretrained in later cells.
checkpoint = "huggingface/CodeBERTa-small-v1"

### Create Modules

In [6]:
#|eval: false
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker

# Code tokenizer built from the checkpoint and language defined in the earlier cells.
bert_tokenizer = CodeTokenizer.from_pretrained(checkpoint, python_language)

# Code masker bound to that tokenizer.
code_masker = Masker(bert_tokenizer)

### Node Types

In [7]:
#|eval: false
# List every node type the tokenizer exposes; positions in this list are the node-type indexes used by the masker.
print(bert_tokenizer.node_types)

['parenthesized_expression', 'lambda_parameters', 'parenthesized_list_splat', 'match', 'expression_statement', 'assignment', '>', 'true', 'dictionary_splat_pattern', 'nonlocal', 'if', 'tuple_pattern', 'list_splat_pattern', 'lambda', 'integer', 'case_clause', 'dictionary_comprehension', 'typed_default_parameter', 'ERROR', 'not_operator', 'assert', '^=', 'else_clause', '_simple_statement', 'expression_list', '%', 'for_statement', 'format_expression', '}', 'try_statement', '@', 'list', '@=', 'with_statement', 'wildcard_import', 'tuple', 'yield', 'global', 'pass', 'function_definition', 'identifier', 'del', ']', 'primary_expression', 'aliased_import', 'for_in_clause', 'as', 'subscript', 'type', 'float', ')', 'parameter', '__future__', '==', 'async', '=', 'return', 'continue', 'exec_statement', 'global_statement', 'with_item', '>>', 'module', ';', 'list_comprehension', '->', 'continue_statement', 'block', 'positional_separator', '<>', ':', '^', 'in', '&', '-', '<=', 'import_from_statement',

### Encodings

In [8]:
#|eval: false
"""example source code"""

# Snippet used by the masking, prediction and evaluation cells below; the
# commented-out assignments that follow are alternative inputs.
code = "def multiply_numbers(a,b):\n    return a*b"
#code = "def scale(self, center=True, scale=True):\n        \"\"\"\nthe the\n\n\n                                                                                                                                                          _\n                     ____________=_=_===========________===______________________________==_____________________\n_______\n____\n\n___\n\n\n\n\n\n\n\n\n        return return)"
#code = "def hello_world(a,b):\n    print('hello world')"
#code = "def __ordered_values_by_indexes(self, data, inds): \"\"\" Return values (intensities) by indexes. Used for multiscale graph cut. data = [[0 1 1], [0 2 2], [0 2 2]] inds = [[0 1 2], [3 4 4], [5 4 4]] return: [0, 1, 1, 0, 2, 0] If the data are not consistent, it will take the maximal value \"\"\" # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
#code = "def __ordered_values_by_indexes(self, data, inds):  # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
# Node type whose occurrences are masked in the example snippet.
target_node_type = "identifier"

# Encoding of the untouched snippet, kept for the length check below.
source_code_encoding = bert_tokenizer(code)

# The masker receives its own fresh encoding rather than source_code_encoding
# (NOTE(review): it may modify the encoding it is given — confirm in Masker).
# The last argument is the masking rate.
target_node_type_idx = bert_tokenizer.node_types.index(target_node_type)
masked_code_encoding = code_masker(code, bert_tokenizer(code), target_node_type_idx, 1)

# Masking must not change the number of tokens.
assert len(source_code_encoding['input_ids']) == len(masked_code_encoding['input_ids'])

# Decode the masked ids back to text, leaving out the BOS/EOS markers.
special_token_ids = (
    bert_tokenizer.tokenizer.bos_token_id,
    bert_tokenizer.tokenizer.eos_token_id,
)
visible_token_ids = [
    token_id
    for token_id in masked_code_encoding['input_ids']
    if token_id not in special_token_ids
]
masked_code = bert_tokenizer.tokenizer.decode(visible_token_ids)

print(masked_code)

def<mask><mask><mask>(<mask>,<mask>):
    return<mask>*<mask>


### Code Prediction

In [9]:
#|eval: false
from CodeCheckList.predictor import Predictor

# Predictor built from the same checkpoint and tokenizer as the masker above.
predictor = Predictor.from_pretrained(checkpoint, bert_tokenizer)
# Arguments: the masked encoding, the plain tokenizer output for the original
# snippet as tensors, and 5 (the slot Evaluator fills with number_of_predictions).
predictions = predictor(masked_code_encoding, bert_tokenizer.tokenizer(code, return_tensors='pt'), 5)

### Evaluation

In [10]:
#|eval: false
import CodeCheckList.utils as utils

# Only the first prediction is inspected here.
prediction_number = 0

# Show the original snippet, its masked form and the chosen prediction.
print('------------- CODE -------------', code, sep='\n')
print('\n---------- MASKED -------------', masked_code, sep='\n')
print('\n--------- PREDICTED -----------')
predicted_code = predictions[prediction_number]
print(predicted_code)

# Collect the nodes of the target type from both the original and the
# predicted snippet, then compare how many each one contains.
print('\n--------- AST COMPARE -----------')
target_node_name = bert_tokenizer.node_types[bert_tokenizer.node_types.index(target_node_type)]
filtered_nodes = []
filtered_nodes_predict = []
for snippet, collected_nodes in ((code, filtered_nodes), (predicted_code, filtered_nodes_predict)):
    snippet_root = bert_tokenizer.parser.parse(bytes(snippet, "utf8")).root_node
    utils.find_nodes(snippet_root, target_node_name, collected_nodes)
print(len(filtered_nodes))
print(len(filtered_nodes_predict))
# The evaluation is based on comparing these two sizes.

------------- CODE -------------
def multiply_numbers(a,b):
    return a*b

---------- MASKED -------------
def<mask><mask><mask>(<mask>,<mask>):
    return<mask>*<mask>

--------- PREDICTED -----------
def __function(name, value):
    return f*args

--------- AST COMPARE -----------
5
5


## Testing

In [11]:
#|eval: false
from datasets import load_dataset 
import CodeCheckList.utils as utils


# Evaluator with the default gpu_available=False.
evaluator = Evaluator(checkpoint, python_language)

# Token limit taken from the underlying tokenizer; passed to get_test_sets below.
max_token_number = bert_tokenizer.tokenizer.max_len_single_sentence
print(max_token_number)

# Load the test split of code_search_net, then hand it to the project helper
# together with the language, token limit and tokenizer.
# NOTE(review): presumably this keeps only Python samples within the token limit — confirm in utils.get_test_sets.
test_set = load_dataset("code_search_net", split='test')
test_set = utils.get_test_sets(test_set, python_language, max_token_number, bert_tokenizer)

510


No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
100%|██████████| 101/101 [00:08<00:00, 12.28ba/s]


22176

In [12]:
#|eval: false
# Show the source of the first sample; 'whole_func_string' is the field Evaluator.evaluate_test_set reads.
print(test_set[0]['whole_func_string'])

def get_vid_from_url(url):
        """Extracts video ID from URL.
        """
        return match1(url, r'youtu\.be/([^?/]+)') or \
          match1(url, r'youtube\.com/embed/([^/?]+)') or \
          match1(url, r'youtube\.com/v/([^/?]+)') or \
          match1(url, r'youtube\.com/watch/([^/?]+)') or \
          parse_query_param(url, 'v') or \
          parse_query_param(parse_query_param(url, 'u'), 'v')


In [14]:
#|eval: false
# Build the filtered test set first and keep it under its own name, so the
# reported total describes the data that is actually sampled from (and stays
# correct when this cell is re-run after test_set has been replaced by the
# 5-sample subset).
filtered_test_set = utils.get_test_sets(
    load_dataset("code_search_net", split='test'),
    "python",
    evaluator.tokenizer.tokenizer.max_len_single_sentence,
    evaluator.tokenizer,
)
print('TOTAL PYTHON FILTERED SAMPLES: '+str(len(filtered_test_set)))

# Reduce the filtered set to 5 samples for the evaluation run below.
# NOTE(review): presumably a random draw, going by the helper's name — confirm in utils.
test_set = utils.get_random_sub_set_test_set(filtered_test_set, 5)
print('TOTAL SAMPLES TO EVALUATE: '+str(len(test_set)))
len(test_set)

TOTAL PYTHON FILTERED SAMPLES: 100529


No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
  0%|          | 0/101 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 101/101 [00:18<00:00,  5.60ba/s]

TOTAL SAMPLES TO EVALUATE: 5





5

In [15]:
#|eval: false
# Run configuration: predictions requested per masked snippet, model
# checkpoint, language, and the masking rate passed through to the masker.
number_of_predictions = 3
checkpoint = "huggingface/CodeBERTa-small-v1"
python_language = "python"
masking_rate = 1

# Fresh evaluator with GPU use explicitly disabled.
evaluator = Evaluator(checkpoint, python_language, gpu_available=False)

# Calling the evaluator returns a DataFrame with one row per node type.
results_dataframe = evaluator(test_set, number_of_predictions, masking_rate)

# Display the rows ordered by occurrence count, highest first
# (the column name is spelled 'occurences' in the DataFrame).
results_dataframe.sort_values(by=['occurences'], ascending=False)


-------- evaluating sample:0 --------
-------- evaluating sample:1 --------
-------- evaluating sample:2 --------
-------- evaluating sample:3 --------
-------- evaluating sample:4 --------


Unnamed: 0,ast_element,occurences,jaccard,sorensen_dice,levenshtein,jaccard_avg,sorensen_dice_avg,levenshtein_avg
40,identifier,143,"((0.8540145985401459, 0.9285714285714286, 0.86...","((0.9212598425196851, 0.9629629629629629, 0.92...","((0.8382352941176471, 0.9285714285714286, 0.78...","(0.908, 0.875, 0.815)","(0.951, 0.932, 0.896)","(0.883, 0.857, 0.792)"
121,"""",68,"((1.0, 1.0, 1.0, 1.0, 1.0), (1.0, 1.0, 0.11137...","((1.0, 1.0, 1.0, 1.0, 1.0), (1.0, 1.0, 0.20042...","((1.0, 1.0, 1.0, 1.0, 1.0), (1.0, 1.0, 0.10688...","(1.0, 0.822, 0.697)","(1.0, 0.84, 0.788)","(1.0, 0.821, 0.698)"
55,=,36,"((1.0, 1.0, 0.9905437352245863, 1.0), (0.87301...","((1.0, 1.0, 0.995249406175772, 1.0), (0.932203...","((1.0, 1.0, 0.995249406175772, 1.0), (0.932203...","(0.998, 0.896, 0.887)","(0.999, 0.945, 0.94)","(0.999, 0.929, 0.906)"
166,(,35,"((1.0, 1.0, 0.9952718676122931, 1.0, 1.0), (1....","((1.0, 1.0, 0.9976303317535545, 1.0, 1.0), (1....","((1.0, 1.0, 0.9952718676122931, 1.0, 1.0), (1....","(0.999, 0.96, 0.913)","(1.0, 0.978, 0.952)","(0.999, 0.955, 0.912)"
50,),35,"((1.0, 1.0, 1.0, 1.0, 1.0), (0.933884297520661...","((1.0, 1.0, 1.0, 1.0, 1.0), (0.965811965811965...","((1.0, 1.0, 1.0, 1.0, 1.0), (0.932203389830508...","(1.0, 0.953, 0.961)","(1.0, 0.976, 0.98)","(1.0, 0.956, 0.962)"
...,...,...,...,...,...,...,...,...
76,import_from_statement,0,"((), (), ())","((), (), ())","((), (), ())","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)"
77,with_clause,0,"((), (), ())","((), (), ())","((), (), ())","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)"
79,interpolation,0,"((), (), ())","((), (), ())","((), (), ())","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)"
80,keyword_separator,0,"((), (), ())","((), (), ())","((), (), ())","(0, 0, 0)","(0, 0, 0)","(0, 0, 0)"
