# Evaluator Module

In [1]:
#| default_exp evaluator

In [2]:
#| export
import os
import pandas as pd

import CodeCheckList.utils as utils
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker
from CodeCheckList.predictor import Predictor
from CodeCheckList.judge import Judge

import statistics
import textdistance



In [3]:
#| hide
from nbdev.showdoc import *

In [4]:
#| export
class Evaluator:
    """Evaluator Module to perform all AST Evaluations.

    Wires together a tokenizer, a masker, a predictor and a judge so that a
    code snippet can be masked (either on the tokens of one AST node type or
    on randomly chosen tokens), re-predicted by the model, and scored against
    the original source.

    Calling an instance runs `evaluate_concepts_in_test_set` and returns the
    collected rows as a pandas DataFrame laid out as `RESULT_COLUMNS`.
    """

    # Per-sample confounder fields copied verbatim from each sample into its rows.
    CONFOUNDER_COLUMNS = [
        'n_ast_errors', 'ast_levels', 'n_whitespaces_', 'complexity', 'nloc', 'token_counts', 'n_ast_nodes',
    ]

    # Column layout of the results table. The order must match the rows built
    # in `evaluate_concepts_in_test_set`. The names are part of the saved CSV
    # schema, so their spelling is kept as-is.
    RESULT_COLUMNS = [
        'sample_id', 'ast_element', 'sample', 'masking_rate', 'numper_of_masked_tokens',
        'ast_element_ocurrences', 'mask_jaccard', 'mask_sorensen_dice', 'mask_levenshtein',
        'mask_random_avg_jaccard', 'mask_random_avg_sorensen_dice', 'mask_random_avg_levenshtein',
        'mask_random_std_jaccard', 'mask_random_std_sorensen_dice', 'mask_random_std_levenshtein',
    ] + CONFOUNDER_COLUMNS

    def __init__(self, checkpoint: str, language, gpu_available=False, save_path: str = None):
        """Build the tokenizer, masker, predictor and judge for one model.

        Args:
            checkpoint: model checkpoint passed to `CodeTokenizer.from_pretrained`
                and `Predictor.from_pretrained`.
            language: language passed to `CodeTokenizer.from_pretrained`.
            gpu_available: forwarded to `Predictor.from_pretrained`.
            save_path: CSV path written by `save_checkpoint`; `None` disables
                writing to disk.
        """
        # NOTE(review): presumably read by the HuggingFace tokenizers library - confirm.
        os.environ["TOKENIZERS_PARALLELISM"] = "true"
        self.tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)
        self.masker = Masker(self.tokenizer)
        self.predictor = Predictor.from_pretrained(checkpoint, self.tokenizer, gpu_available)
        self.judge = Judge(self.tokenizer)
        self.save_path = save_path

    def __call__(self, test_set, concepts: list, masking_rate: float, code_field: str, random_sampling: int):
        """Evaluate `concepts` on every sample of `test_set`; return the results DataFrame."""
        result_list = self.evaluate_concepts_in_test_set(concepts, test_set, masking_rate, code_field, random_sampling)
        return self.save_checkpoint(result_list)

    @staticmethod
    def _mean_or_nan(values):
        """Return the mean of the non-None values, or NaN when there are none."""
        usable = [value for value in values if value is not None]
        return statistics.mean(usable) if usable else float('nan')

    @staticmethod
    def _stdev_or_nan(values):
        """Return the sample standard deviation of the non-None values.

        `statistics.stdev` needs at least two data points; with fewer the
        deviation is undefined, so NaN is returned instead of raising.
        """
        usable = [value for value in values if value is not None]
        return statistics.stdev(usable) if len(usable) > 1 else float('nan')

    def save_checkpoint(self, result_list: list):
        """Turn result rows into a DataFrame and write it as CSV when a path is set.

        Args:
            result_list: rows whose values follow `RESULT_COLUMNS`.

        Returns:
            A DataFrame with one row per entry of `result_list` (empty when the
            list is empty). Column dtypes are inferred by pandas.

        Raises:
            ValueError: if a row does not match the number of columns.
        """
        # Build the frame in one go; appending row by row through `.loc` is
        # quadratic in the number of rows.
        results_dataframe = pd.DataFrame(result_list, columns=self.RESULT_COLUMNS)
        if self.save_path is not None:
            results_dataframe.to_csv(self.save_path)
        return results_dataframe

    def pipeline(self, test_set, number_of_predictions: int, masking_rate: float):
        """Deprecated"""
        results_dict = self.evaluate_test_set(test_set, number_of_predictions, masking_rate)
        # evaluate_test_set yields one aggregate per node type, in node_types order.
        rows = [
            [
                node_type,
                result[0],
                tuple(tuple(scores) for scores in result[1]),
                tuple(tuple(scores) for scores in result[2]),
                tuple(tuple(scores) for scores in result[3]),
                tuple(result[4]),
                tuple(result[5]),
                tuple(result[6]),
            ]
            for node_type, result in zip(self.tokenizer.node_types, results_dict)
        ]
        return pd.DataFrame(rows, columns=[
            'ast_element', 'occurences', 'jaccard', 'sorensen_dice', 'levenshtein', 'jaccard_avg', 'sorensen_dice_avg', 'levenshtein_avg'])

    def evaluate_test_set(self, test_set, number_of_predictions: int, masking_rate: float):
        """Deprecated"""
        # One aggregate per node type:
        # [occurrences,
        #  jaccard / sorensen_dice / levenshtein score lists per prediction,
        #  running average of each of those three metrics per prediction]
        results_dict = [
            [
                0,
                [[] for _ in range(number_of_predictions)],
                [[] for _ in range(number_of_predictions)],
                [[] for _ in range(number_of_predictions)],
                [0] * number_of_predictions,
                [0] * number_of_predictions,
                [0] * number_of_predictions,
            ]
            for _ in self.tokenizer.node_types
        ]
        for sample_index, sample in enumerate(test_set):
            print('-------- evaluating sample:'+str(sample_index)+' --------')
            for node_type_idx, aggregate in enumerate(results_dict):
                # evaluate_node_type_on_snippet returns (results, masked tokens);
                # only the per-prediction results are aggregated here.
                node_type_results, _ = self.evaluate_node_type_on_snippet(
                    sample['whole_func_string'], node_type_idx, number_of_predictions, masking_rate)
                if not node_type_results:
                    continue
                aggregate[0] += node_type_results[0][0]
                for prediction_idx, prediction_result in enumerate(node_type_results):
                    # Offsets 1..3 hold the three scores; their averages live 3 slots later.
                    for metric_offset in (1, 2, 3):
                        score = prediction_result[metric_offset]
                        if score is None:
                            continue
                        scores = aggregate[metric_offset][prediction_idx]
                        scores.append(score)
                        aggregate[metric_offset + 3][prediction_idx] = round(statistics.mean(scores), 3)
        return results_dict

    def evaluate_node_type_on_snippet(self, source_code: str, target_node_type_idx: int, number_of_predictions: int, masking_rate: float):
        """Mask the tokens of one AST node type in a snippet and score the predictions.

        Args:
            source_code: snippet to evaluate.
            target_node_type_idx: index into `self.tokenizer.node_types`.
            number_of_predictions: how many predictions to request and score.
            masking_rate: forwarded to `Masker.mask_ast_tokens`.

        Returns:
            `(results, number_of_masked_tokens)`. `results` holds one
            `[node occurrences, score, score, score]` entry per prediction,
            where the three scores are the first three values returned by the
            judge. `([], 0)` is returned when the node type does not occur in
            the snippet or nothing was masked.
        """
        results = []
        source_code_tree = self.tokenizer.parser.parse(bytes(source_code, "utf8"))
        source_code_nodes = []
        utils.find_nodes(source_code_tree.root_node, self.tokenizer.node_types[target_node_type_idx], source_code_nodes)
        if not source_code_nodes:
            return results, 0
        masked_code_encoding, number_of_masked_tokens = self.masker.mask_ast_tokens(
            source_code, self.tokenizer(source_code), target_node_type_idx, masking_rate)
        if number_of_masked_tokens == 0:  # Not masking anything
            return results, 0
        predictions = self.predictor(
            masked_code_encoding, self.tokenizer.tokenizer(source_code, return_tensors='pt'), number_of_predictions)
        for prediction_number in range(number_of_predictions):
            prediction_results = self.judge(source_code, predictions[prediction_number])
            results.append([len(source_code_nodes), prediction_results[0], prediction_results[1], prediction_results[2]])
        return results, number_of_masked_tokens

    def evaluate_random_mask_on_snippet(self, source_code: str, number_of_predictions: int, number_tokens_to_mask: int):
        """Mask randomly chosen tokens in a snippet and score the predictions.

        Returns:
            One `[0, score, score, score]` entry per prediction; the leading 0
            fills the occurrences slot used by `evaluate_node_type_on_snippet`.
        """
        results = []
        masked_code_encoding = self.masker.mask_random_tokens(self.tokenizer(source_code), number_tokens_to_mask)
        predictions = self.predictor(
            masked_code_encoding, self.tokenizer.tokenizer(source_code, return_tensors='pt'), number_of_predictions)
        for prediction_number in range(number_of_predictions):
            prediction_results = self.judge(source_code, predictions[prediction_number])
            results.append([0, prediction_results[0], prediction_results[1], prediction_results[2]])
        return results

    def evaluate_concepts_in_test_set(self, concepts: list, test_set, masking_rate: float, code_field: str, random_sampling: int):
        """Evaluate every concept on every sample, with a random-masking baseline.

        For each (sample, concept) pair where the concept masks at least one
        token, the same number of tokens is masked at random `random_sampling`
        times, and the mean and standard deviation of those baseline scores
        are stored next to the concept scores.

        Args:
            concepts: node type names; each must be in `self.tokenizer.node_types`.
            test_set: iterable of samples indexable by `code_field` and by the
                names in `CONFOUNDER_COLUMNS`.
            masking_rate: forwarded to `evaluate_node_type_on_snippet`.
            code_field: key of the source code inside a sample.
            random_sampling: number of random-mask baseline runs per pair.
                Baseline statistics that are undefined for the given number of
                runs (mean with 0 runs, standard deviation with fewer than 2)
                are reported as NaN.

        Returns:
            List of rows laid out as `RESULT_COLUMNS`.

        Raises:
            ValueError: if a concept is not a known node type.
        """
        test_set_results = []
        for sample_index, sample in enumerate(test_set):
            print('-------- evaluating sample:'+str(sample_index)+' --------')
            source_code = sample[code_field]
            sample_produced_rows = False
            for concept in concepts:
                concept_mask_results, number_of_masked_tokens = self.evaluate_node_type_on_snippet(
                    source_code, self.tokenizer.node_types.index(concept), 1, masking_rate)
                if not concept_mask_results:
                    # Nothing was masked for this concept, so there is nothing
                    # to compare the baseline against; skip the model passes.
                    continue

                random_mask_scores = [[], [], []]
                for _ in range(random_sampling):
                    random_mask_result = self.evaluate_random_mask_on_snippet(source_code, 1, number_of_masked_tokens)[0]
                    for scores, score in zip(random_mask_scores, random_mask_result[1:4]):
                        scores.append(score)

                test_set_results.append([
                    sample_index, concept, source_code, masking_rate, number_of_masked_tokens,
                    concept_mask_results[0][0], concept_mask_results[0][1], concept_mask_results[0][2], concept_mask_results[0][3],
                    *[self._mean_or_nan(scores) for scores in random_mask_scores],
                    *[self._stdev_or_nan(scores) for scores in random_mask_scores],
                    *[sample[field] for field in self.CONFOUNDER_COLUMNS],  # CONFOUNDERS
                ])
                sample_produced_rows = True
            # Periodic checkpoint: once per sample, and only when it is actually
            # written to disk.
            if sample_produced_rows and sample_index % 100 == 0 and self.save_path is not None:
                self.save_checkpoint(test_set_results)
        self.save_checkpoint(test_set_results)
        return test_set_results


## Full Pipeline

### Download Grammar

In [5]:
#|eval: false
# Loader helper from the project package; used below to obtain the grammars.
from CodeCheckList import loader

"""define language"""
# Language name reused by later cells when building the tokenizer and evaluator.
python_language = "python"

# download_grammars is called with a list, so the single language is wrapped.
languages = [python_language]

# NOTE(review): presumably downloads the parser grammar for each listed
# language; the recorded cell output is a grammars directory path - confirm.
loader.download_grammars(languages)

/usr/local/lib/python3.8/dist-packages/CodeCheckList/grammars


### Load Model

In [6]:
#|eval: false
"""define the model checkpoint"""
# Checkpoint identifier passed to CodeTokenizer.from_pretrained and
# Predictor.from_pretrained in the cells below.
checkpoint = "huggingface/CodeBERTa-small-v1"

### Create Modules

In [7]:
#|eval: false
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker

# Create the code tokenizer for the chosen checkpoint and language.
bert_tokenizer = CodeTokenizer.from_pretrained(checkpoint, python_language)

# Create the code masker on top of that tokenizer.
code_masker = Masker(bert_tokenizer)

### Node Types

In [8]:
#|eval: false
# List the node type names known to the tokenizer; later cells look up
# positions in this list with node_types.index(...).
print(bert_tokenizer.node_types)

['expression_statement', 'primary_expression', 'aliased_import', 'list_comprehension', 'type', 'dictionary_comprehension', 'named_expression', 'pass', 'ERROR', 'relative_import', '{{', 'string', 'except', 'false', 'none', 'parenthesized_expression', '}}', '(', 'assert', 'attribute', 'positional_separator', 'import_prefix', ':=', '>', '|', 'concatenated_string', 'format_expression', 'set_comprehension', 'import_from_statement', 'binary_operator', 'interpolation', 'if_statement', 'lambda', 'case_pattern', '->', 'call', '-=', 'future_import_statement', '+=', '**', '<>', '[', 'tuple', 'class', 'list', 'delete_statement', 'decorator', 'with_clause', 'with_item', 'typed_default_parameter', 'def', '=', 'break', 'elif', 'argument_list', 'format_specifier', 'case_clause', 'continue_statement', 'exec_statement', ')', 'case', 'global', 'assignment', 'import', '%', '//=', 'continue', '|=', 'typed_parameter', 'keyword_argument', '&=', '"', ']', 'elif_clause', 'class_definition', 'while', 'with', '>

### Encodings

In [9]:
#|eval: false
"""example source code"""

# Small two-line function used as the running example in the following cells.
code = "def multiply_numbers(a,b):\n    return a*b"
#code = "def scale(self, center=True, scale=True):\n        \"\"\"\nthe the\n\n\n                                                                                                                                                          _\n                     ____________=_=_===========________===______________________________==_____________________\n_______\n____\n\n___\n\n\n\n\n\n\n\n\n        return return)"
#code = "def hello_world(a,b):\n    print('hello world')"
#code = "def __ordered_values_by_indexes(self, data, inds): \"\"\" Return values (intensities) by indexes. Used for multiscale graph cut. data = [[0 1 1], [0 2 2], [0 2 2]] inds = [[0 1 2], [3 4 4], [5 4 4]] return: [0, 1, 1, 0, 2, 0] If the data are not consistent, it will take the maximal value \"\"\" # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
#code = "def __ordered_values_by_indexes(self, data, inds):  # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
target_node_type = "identifier"

# Reference encoding of the unmasked snippet.
source_code_encoding = bert_tokenizer(code)

# Mask the tokens of the target node type. The masker receives its own fresh
# encoding, and the final argument (1) is the masking rate.
masked_code_encoding, number_of_masked_tokens = code_masker.mask_ast_tokens(
    code,
    bert_tokenizer(code),
    bert_tokenizer.node_types.index(target_node_type),
    1,
)

# Masking must not change the number of tokens.
assert len(masked_code_encoding['input_ids']) == len(source_code_encoding['input_ids'])

# Decode the masked ids back to text, leaving out the BOS/EOS markers.
hf_tokenizer = bert_tokenizer.tokenizer
visible_token_ids = [
    token_id
    for token_id in masked_code_encoding['input_ids']
    if not (token_id == hf_tokenizer.bos_token_id or token_id == hf_tokenizer.eos_token_id)
]
masked_code = hf_tokenizer.decode(visible_token_ids)

print(masked_code)

def<mask><mask><mask>(<mask>,<mask>):
    return<mask>*<mask>


### Code Prediction

In [10]:
#|eval: false
from CodeCheckList.predictor import Predictor

# Load the predictor for the same checkpoint, sharing the tokenizer built above.
predictor = Predictor.from_pretrained(checkpoint, bert_tokenizer)
# Request 5 predictions for the masked encoding; the second argument is the
# unmasked snippet tokenized as PyTorch tensors (return_tensors='pt').
predictions = predictor(masked_code_encoding, bert_tokenizer.tokenizer(code, return_tensors='pt'), 5)

### Evaluation

In [11]:
#|eval: false
import CodeCheckList.utils as utils

# Which of the predictions produced above to inspect.
prediction_number = 0

# Original and masked snippets, followed by the header for the prediction.
print(
    '------------- CODE -------------',
    code,
    '\n---------- MASKED -------------',
    masked_code,
    '\n--------- PREDICTED -----------',
    sep='\n',
)
predicted_code = predictions[prediction_number]
print(predicted_code)

print('\n--------- AST COMPARE -----------')
# Collect the nodes of the target type in the original and in the predicted
# snippet so their counts can be compared.
filtered_nodes = []
filtered_nodes_predict = []
resolved_node_type = bert_tokenizer.node_types[bert_tokenizer.node_types.index(target_node_type)]
for snippet, collected_nodes in ((code, filtered_nodes), (predicted_code, filtered_nodes_predict)):
    snippet_tree = bert_tokenizer.parser.parse(bytes(snippet, "utf8"))
    utils.find_nodes(snippet_tree.root_node, resolved_node_type, collected_nodes)
print(len(filtered_nodes))
print(len(filtered_nodes_predict))
# The evaluation here is based on comparing the two sizes.

------------- CODE -------------
def multiply_numbers(a,b):
    return a*b

---------- MASKED -------------
def<mask><mask><mask>(<mask>,<mask>):
    return<mask>*<mask>

--------- PREDICTED -----------
def __function(name, value):
    return f*args

--------- AST COMPARE -----------
5
5


## Testing

In [12]:
#|eval: false
from datasets import load_dataset 
import CodeCheckList.utils as utils
import json


# Evaluator with default settings (gpu_available=False, no save_path).
evaluator = Evaluator(checkpoint, python_language)

# Token limit taken from the tokenizer; passed to get_test_sets below.
max_token_number = bert_tokenizer.tokenizer.max_len_single_sentence
print(max_token_number)

# Load the CodeSearchNet test split, then let the project helper reduce it.
# NOTE(review): get_test_sets presumably keeps only samples of the given
# language that fit within max_token_number tokens - confirm in utils.
test_set = load_dataset("code_search_net", split='test')
test_set = utils.get_test_sets(test_set, python_language, max_token_number, bert_tokenizer)

print(len(test_set))


510


No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/root/.cache/huggingface/datasets/code_search_net/all/1.0.0/8f2524e6b62f65af5f5d65c53715c654db7b08dc93e0b7bcce2ab2f286a75be1)


Filter:   0%|          | 0/100529 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


19408


In [13]:
#|eval: false
# Show the source of the first sample; 'whole_func_string' is the field that
# Evaluator.evaluate_test_set reads from each sample.
print(test_set[0]['whole_func_string'])

def get_vid_from_url(url):
        """Extracts video ID from URL.
        """
        return match1(url, r'youtu\.be/([^?/]+)') or \
          match1(url, r'youtube\.com/embed/([^/?]+)') or \
          match1(url, r'youtube\.com/v/([^/?]+)') or \
          match1(url, r'youtube\.com/watch/([^/?]+)') or \
          parse_query_param(url, 'v') or \
          parse_query_param(parse_query_param(url, 'u'), 'v')


In [14]:
#|eval: false
### LOADING GALERAS
# Read each Galeras JSON file inside a `with` block so the file handle is
# closed as soon as it has been parsed (the bare open() calls left them open).
with open('/workspaces/CodeCheckList/semeru-datasets/galeras_curated_raw/airflow/data_1.json') as galeras_file:
    test_set = json.load(galeras_file)
with open('/workspaces/CodeCheckList/semeru-datasets/galeras_curated_raw/AliceMind-Baba/dataset17.json') as galeras_file:
    test_set += json.load(galeras_file)

# Alternative input files from a previous iteration:
#test_set = json.load(open('/workspaces/CodeCheckList/semeru-datasets/galeras_previews_iteration_bk/combinedDataset/dataset.json',))
#test_set += json.load(open('/workspaces/CodeCheckList/semeru-datasets/galeras_previews_iteration_bk/combinedDataset/dataset0.json',))
#test_set += json.load(open('/workspaces/CodeCheckList/semeru-datasets/galeras_previews_iteration_bk/combinedDataset/dataset1.json',))

# Run the project helper over the combined samples, then keep the first 10
# for a quick run.
test_set = utils.get_test_sets_galeras(test_set, python_language, max_token_number, bert_tokenizer)
test_set = test_set[:10]

In [17]:
#|eval: false
# Only referenced by the commented-out call below.
number_of_predictions = 3
checkpoint = "huggingface/CodeBERTa-small-v1"
python_language = "python"
masking_rate = 1

# Rebuild the evaluator, this time asking the predictor to use the GPU.
evaluator = Evaluator(checkpoint, python_language, gpu_available=True)

#results_dataframe = evaluator(test_set, number_of_predictions, masking_rate)
# Node type names to evaluate; each must be present in the tokenizer's node_types.
concepts = ['for_statement', 'while_statement', 'return_statement', 'if_statement', 'comparison_operator', 'boolean_operator', 'for_in_clause', 'if_clause', 'identifier' ,'string']
# 'code' is the sample field holding the source; 15 is the number of
# random-mask baseline runs per (sample, concept) pair.
results_dataframe = evaluator(test_set, concepts, masking_rate, 'code', 15)
#results_dataframe = evaluator(test_set, bert_tokenizer.node_types, masking_rate, 'code')

#results_dataframe.sort_values(by=['occurences'], ascending=False)

# Last expression of the cell, so the notebook displays the results table.
results_dataframe


------------------Loading Model into GPU------------------
-------- evaluating sample:0 --------
-------- evaluating sample:1 --------
-------- evaluating sample:2 --------
-------- evaluating sample:3 --------
-------- evaluating sample:4 --------
-------- evaluating sample:5 --------
-------- evaluating sample:6 --------
-------- evaluating sample:7 --------
-------- evaluating sample:8 --------
-------- evaluating sample:9 --------


Unnamed: 0,sample_id,ast_element,sample,masking_rate,numper_of_masked_tokens,ast_element_ocurrences,mask_jaccard,mask_sorensen_dice,mask_levenshtein,mask_random_avg_jaccard,...,mask_random_std_jaccard,mask_random_std_sorensen_dice,mask_random_std_levenshtein,n_ast_errors,ast_levels,n_whitespaces_,complexity,nloc,token_counts,n_ast_nodes
0,0,comparison_operator,def test_should_generate_secret_with_specified...,1,20,1,0.846154,0.916667,0.896907,0.939262,...,0.056257,0.030556,0.049163,0,15,29,1,15,50,96
1,0,identifier,def test_should_generate_secret_with_specified...,1,24,9,0.818182,0.9,0.834951,0.931147,...,0.069713,0.039115,0.053557,0,15,29,1,15,50,96
2,0,string,def test_should_generate_secret_with_specified...,1,52,11,0.425,0.596491,0.391753,0.789183,...,0.071678,0.046481,0.087159,0,15,29,1,15,50,96
3,1,comparison_operator,def test_should_correctly_handle_password_with...,1,66,1,0.502924,0.669261,0.512048,0.704585,...,0.184333,0.133976,0.192802,0,15,29,1,17,46,90
4,1,identifier,def test_should_correctly_handle_password_with...,1,24,6,0.947917,0.973262,0.9375,0.949499,...,0.052384,0.028038,0.037668,0,15,29,1,17,46,90
5,1,string,def test_should_correctly_handle_password_with...,1,107,11,0.318182,0.482759,0.351648,0.445253,...,0.201691,0.21177,0.205269,0,15,29,1,17,46,90
6,2,while_statement,"def assert_tasks_on_executor(self, executor, t...",1,60,1,0.825455,0.904382,0.83209,0.96644,...,0.023258,0.012153,0.019163,0,13,103,8,17,158,267
7,2,if_statement,"def assert_tasks_on_executor(self, executor, t...",1,42,1,0.860068,0.924771,0.880866,0.977678,...,0.043449,0.023696,0.036426,0,13,103,8,17,158,267
8,2,comparison_operator,"def assert_tasks_on_executor(self, executor, t...",1,33,5,0.856115,0.922481,0.813433,0.988746,...,0.012353,0.006276,0.009332,0,13,103,8,17,158,267
9,2,boolean_operator,"def assert_tasks_on_executor(self, executor, t...",1,13,1,0.918216,0.957364,0.91791,0.998517,...,0.004155,0.002092,0.002092,0,13,103,8,17,158,267
