### Download Grammar

In [26]:
from CodeCheckList import loader

"""define language"""
# Language name reused by the tokenizer and evaluator cells further down.
python_language = "python"

# download_grammars receives a list, so the single language is wrapped in one.
languages = [python_language]

loader.download_grammars(languages)

/home/svelascodimate/miniconda3/envs/code-check-list/lib/python3.10/site-packages/CodeCheckList/grammars


### Load Model

In [27]:

"""define the model checkpoint"""
# Passed to CodeTokenizer.from_pretrained and Predictor.from_pretrained in the cells below.
checkpoint = "huggingface/CodeBERTa-small-v1"

### Create Modules

In [28]:
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker

# Build the code tokenizer from the model checkpoint and the target language.
bert_tokenizer = CodeTokenizer.from_pretrained(checkpoint, python_language)

# Build the masker on top of that tokenizer.
code_masker = Masker(bert_tokenizer)

### Node Types

In [29]:
# Show the tokenizer's node_types list; later cells look up positions in it with .index().
print(bert_tokenizer.node_types)

['**', 'assert', 'for', 'global', 'integer', 'except', 'future_import_statement', 'parameters', '>=', 'for_in_clause', 'try_statement', '*', 'from', 'list_splat_pattern', '.', 'dictionary_splat_pattern', 'pass_statement', 'as', 'global_statement', 'conditional_expression', 'set_comprehension', 'import', 'if_clause', 'decorated_definition', 'else_clause', 'subscript', '+', 'break_statement', 'class_definition', 'attribute', 'expression_statement', 'decorator', 'type', 'primary_expression', 'if_statement', '//=', '<<', 'parameter', 'match_statement', '&', '/', '"', 'format_specifier', 'pattern_list', 'raise_statement', '!=', '_compound_statement', '|', 'wildcard_import', 'typed_parameter', 'none', 'tuple', '|=', 'if', 'with', 'nonlocal', 'format_expression', 'dotted_name', 'continue', 'float', '__future__', '->', 'nonlocal_statement', 'false', '**=', '>>', 'elif', '>', 'identifier', '_simple_statement', 'import_from_statement', 'slice', 'while_statement', 'and', 'import_prefix', 'else', 

### Encodings

In [30]:
"""example source code"""

# Active example snippet; the commented-out assignments that follow are alternative inputs.
code = "def multiply_numbers(a,b):\n    return a*b"
#code = "def hello_world(a,b):\n    print('hello world')"
#code = "def __ordered_values_by_indexes(self, data, inds): \"\"\" Return values (intensities) by indexes. Used for multiscale graph cut. data = [[0 1 1], [0 2 2], [0 2 2]] inds = [[0 1 2], [3 4 4], [5 4 4]] return: [0, 1, 1, 0, 2, 0] If the data are not consistent, it will take the maximal value \"\"\" # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
#code = "def __ordered_values_by_indexes(self, data, inds):  # get unique labels and their first indexes # lab, linds = np.unique(inds, return_index=True) # compute values by indexes # values = data.reshape(-1)[linds] # alternative slow implementation # if there are different data on same index, it will take # maximal value # lab = np.unique(inds) # values = [0]*len(lab) # for label in lab: # values[label] = np.max(data[inds == label]) # # values = np.asarray(values) # yet another implementation values = [None] * (np.max(inds) + 1) linear_inds = inds.ravel() linear_data = data.ravel() for i in range(0, len(linear_inds)): # going over all data pixels if values[linear_inds[i]] is None: # this index is found for first values[linear_inds[i]] = linear_data[i] elif values[linear_inds[i]] < linear_data[i]: # here can be changed maximal or minimal value values[linear_inds[i]] = linear_data[i] values = np.asarray(values) return values"
target_node_type = "*"

# Encoding of the untouched snippet, kept for the length check below.
source_code_encoding = bert_tokenizer(code)

# The masker is handed its own, separately computed encoding of the snippet.
encoding_to_mask = bert_tokenizer(code)
target_node_type_idx = bert_tokenizer.node_types.index(target_node_type)
masked_code_encoding = code_masker(code, encoding_to_mask, target_node_type_idx)

# Masking must not change the number of token ids.
assert len(source_code_encoding['input_ids']) == len(masked_code_encoding['input_ids'])

# Decode the masked ids back to text, leaving out the BOS/EOS ids.
hf_tokenizer = bert_tokenizer.tokenizer
masked_code = hf_tokenizer.decode([
    token_id
    for token_id in masked_code_encoding['input_ids']
    if token_id not in (hf_tokenizer.bos_token_id, hf_tokenizer.eos_token_id)
])

print(masked_code)

def multiply_numbers(a,b):
    return a<mask>b


### Code Prediction

In [31]:
from CodeCheckList.predictor import Predictor

predictor = Predictor.from_pretrained(checkpoint, bert_tokenizer)

# The predictor takes the masked encoding, a tensor encoding of the original
# snippet, and the number of predictions to produce.
number_of_predictions = 5
original_code_encoding = bert_tokenizer.tokenizer(code, return_tensors='pt')
predictions = predictor(masked_code_encoding, original_code_encoding, number_of_predictions)

### Evaluation

In [32]:
import CodeCheckList.utils as utils

prediction_number = 0

print('------------- CODE -------------')
print(code)
print('\n---------- MASKED -------------')
print(masked_code)
print('\n--------- PREDICTED -----------')
predicted_code = predictions[prediction_number]
print(predicted_code)
print('\n--------- AST COMPARE -----------')

# Node-type string searched for in both syntax trees.
target_node = bert_tokenizer.node_types[bert_tokenizer.node_types.index(target_node_type)]

filtered_nodes = []
filtered_nodes_predict = []
# Parse the original and the predicted snippet, collecting matching nodes for each.
for snippet, collected in ((code, filtered_nodes), (predicted_code, filtered_nodes_predict)):
    snippet_tree = bert_tokenizer.parser.parse(bytes(snippet, "utf8"))
    utils.find_nodes(snippet_tree.root_node, target_node, collected)

print(len(filtered_nodes))
print(len(filtered_nodes_predict))
# The evaluation is based on comparing these two counts.

------------- CODE -------------
def multiply_numbers(a,b):
    return a*b

---------- MASKED -------------
def multiply_numbers(a,b):
    return a<mask>b

--------- PREDICTED -----------
def multiply_numbers(a,b):
    return a,b

--------- AST COMPARE -----------
1
0


In [33]:
from CodeCheckList.evaluator import Evaluator

# Evaluate the example snippet against every node type, five predictions each.
evaluator = Evaluator(checkpoint, python_language)

results = evaluator(source_code=code, number_of_predictions=5)

print(results)

[{'**': []}, {'assert': []}, {'for': []}, {'global': []}, {'integer': []}, {'except': []}, {'future_import_statement': []}, {'parameters': [(1, 1, True), (1, 1, True), (1, 0, False), (1, 1, True), (1, 0, False)]}, {'>=': []}, {'for_in_clause': []}, {'try_statement': []}, {'*': [(1, 0, False), (1, 1, True), (1, 0, False), (1, 0, False), (1, 0, False)]}, {'from': []}, {'list_splat_pattern': []}, {'.': []}, {'dictionary_splat_pattern': []}, {'pass_statement': []}, {'as': []}, {'global_statement': []}, {'conditional_expression': []}, {'set_comprehension': []}, {'import': []}, {'if_clause': []}, {'decorated_definition': []}, {'else_clause': []}, {'subscript': []}, {'+': []}, {'break_statement': []}, {'class_definition': []}, {'attribute': []}, {'expression_statement': []}, {'decorator': []}, {'type': []}, {'primary_expression': []}, {'if_statement': []}, {'//=': []}, {'<<': []}, {'parameter': []}, {'match_statement': []}, {'&': []}, {'/': []}, {'"': []}, {'format_specifier': []}, {'pattern_

### Module

In [34]:
#| default_exp evaluator

In [35]:
#| export
import CodeCheckList

import CodeCheckList.utils as utils
from CodeCheckList.tokenizer import CodeTokenizer
from CodeCheckList.masker import Masker
from CodeCheckList.predictor import Predictor

In [36]:
#| export
class Evaluator:
    """Evaluator Module to perform all AST Evaluations.

    For a chosen node type, the source snippet is masked, the predictor proposes
    completed snippets, and the number of nodes of that type found in each
    prediction is compared with the number found in the original snippet.
    """

    def __init__(self, checkpoint: str, language):
        """Create the tokenizer, masker and predictor used for evaluation.

        Args:
            checkpoint: identifier handed to ``CodeTokenizer.from_pretrained``
                and ``Predictor.from_pretrained``.
            language: language handed to ``CodeTokenizer.from_pretrained``.
        """
        self.tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)
        self.masker = Masker(self.tokenizer)
        self.predictor = Predictor.from_pretrained(checkpoint, self.tokenizer)

    def __call__(self, source_code: str, number_of_predictions: int):
        """Evaluate ``source_code`` once for every entry of ``self.tokenizer.node_types``.

        Args:
            source_code: snippet to evaluate.
            number_of_predictions: number of predictions requested per node type.

        Returns:
            A list with one single-key dict per node type, ``{node_type: results}``,
            in the order of ``self.tokenizer.node_types``; ``results`` is the value
            returned by ``evaluate_snippet`` for that node type.
        """
        return [
            {node_type: self.evaluate_snippet(source_code, node_type_idx, number_of_predictions)}
            for node_type_idx, node_type in enumerate(self.tokenizer.node_types)
        ]

    def _find_nodes_of_type(self, code: str, node_type: str):
        """Parse ``code`` and return the nodes ``utils.find_nodes`` collects for ``node_type``."""
        nodes = []
        tree = self.tokenizer.parser.parse(bytes(code, "utf8"))
        utils.find_nodes(tree.root_node, node_type, nodes)
        return nodes

    def evaluate_snippet(self, source_code: str, target_node_type_idx: int, number_of_predictions: int):
        """Evaluate the predictions obtained after masking one node type.

        Args:
            source_code: snippet to evaluate.
            target_node_type_idx: index into ``self.tokenizer.node_types`` of the
                node type to mask and count.
            number_of_predictions: number of predictions requested from the predictor.

        Returns:
            A list of ``(source_count, predicted_count, passed)`` tuples, one per
            prediction examined, where ``passed`` is
            ``predicted_count >= source_count``. The list is empty when the node
            type does not occur in ``source_code``, and holds fewer than
            ``number_of_predictions`` entries when the predictor returns fewer
            predictions than requested.
        """
        target_node_type = self.tokenizer.node_types[target_node_type_idx]
        source_code_nodes = self._find_nodes_of_type(source_code, target_node_type)
        if not source_code_nodes:
            # Nothing of this type to mask, so the model is not run at all.
            return []

        masked_code_encoding = self.masker(source_code, self.tokenizer(source_code), target_node_type_idx)
        predictions = self.predictor(
            masked_code_encoding,
            self.tokenizer.tokenizer(source_code, return_tensors='pt'),
            number_of_predictions,
        )

        # Never index past what the predictor actually returned.
        available_predictions = min(number_of_predictions, len(predictions))

        results = []
        for prediction_number in range(available_predictions):
            predicted_nodes = self._find_nodes_of_type(predictions[prediction_number], target_node_type)
            results.append((
                len(source_code_nodes),
                len(predicted_nodes),
                len(predicted_nodes) >= len(source_code_nodes),
            ))

        return results