# Evaluation Module

In [1]:
#| default_exp evaluator

In [2]:
#| export
import CodeSyntaxConcept

from CodeSyntaxConcept.tokenizer import CodeTokenizer
from CodeSyntaxConcept.parser import TreeSitterParser
import CodeSyntaxConcept.utils as utils
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#| hide
from nbdev.showdoc import *

In [14]:
#| export
class Evaluator:
    """Extract AST-level and tokenizer-level concepts for a set of code samples.

    On construction a `CodeTokenizer` is loaded for the given checkpoint and
    language, and a `TreeSitterParser` is built on top of it. Calling the
    instance on an iterable of samples returns a `pandas.DataFrame` with one
    row per sample.
    """

    # Column layout (names and order) of the frame returned by `__call__`.
    _COLUMNS = [
        'whole_func_string',
        'ast_concepts',
        'model_tokenizer_concepts',
        'model_input_ids',
        'model_total_input_ids',
    ]

    def __init__(self, checkpoint: str, language):
        """Load the tokenizer for `checkpoint`/`language` and wrap it in a parser.

        Args:
            checkpoint: Identifier forwarded to `CodeTokenizer.from_pretrained`.
            language: Language argument forwarded to `CodeTokenizer.from_pretrained`.
        """
        self.tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)
        self.parser = TreeSitterParser(self.tokenizer)

    def __call__(self, test_set):
        """Process every sample of `test_set` and tabulate the results.

        Args:
            test_set: Iterable of mappings; each must provide the source code
                under the key 'whole_func_string'.

        Returns:
            A `pandas.DataFrame` with the columns listed in `_COLUMNS`, indexed
            0..n-1 in iteration order. For each sample it holds the source
            string, the output of `parser.process_source_code`, the concepts
            and 'input_ids' returned by `parser.process_model_source_code`,
            and the number of input ids. An empty `test_set` yields an empty
            frame that still carries all columns.
        """
        # Collect plain records and build the frame once at the end: enlarging
        # a DataFrame with `.loc[len(df)] = ...` re-allocates on every insert,
        # which makes the loop quadratic in the number of samples.
        records = []
        for test_sample in test_set:
            source_code = test_sample['whole_func_string']
            ast_concepts = self.parser.process_source_code(source_code)
            source_code_encoding, tokenizer_concepts = self.parser.process_model_source_code(source_code)
            input_ids = source_code_encoding['input_ids']
            records.append({
                'whole_func_string': source_code,
                'ast_concepts': ast_concepts,
                'model_tokenizer_concepts': tokenizer_concepts,
                'model_input_ids': input_ids,
                'model_total_input_ids': len(input_ids),
            })
        # Passing `columns` keeps the column order fixed and preserves the
        # columns even when no sample was processed.
        return pd.DataFrame(records, columns=self._COLUMNS)

# Testing

In [15]:
from datasets import load_dataset    
import pandas as pd


# Settings for this smoke test of the Evaluator.
checkpoint = "EleutherAI/gpt-neo-125M"
language = "python"
maximun_number_of_samples = 1

# Build the evaluator first: its tokenizer is also handed to the test-set helper below.
evaluator = Evaluator(checkpoint, language)

# Load the CodeSearchNet 'test' split, pass it through utils.get_test_sets
# together with the evaluator's tokenizer, then keep a sub-set of it via
# utils.get_sub_set_test_set.
test_set = utils.get_sub_set_test_set(
    utils.get_test_sets(
        load_dataset("code_search_net", split='test'),
        "python",
        2048,
        evaluator.tokenizer,
    ),
    maximun_number_of_samples,
)

No config specified, defaulting to: code_search_net/all
Found cached dataset code_search_net (/home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27)
  0%|          | 0/101 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2272 > 2048). Running this sequence through the model will result in indexing errors
 99%|█████████▉| 100/101 [00:20<00:00,  5.00ba/s]


In [16]:
# Run the evaluator on the sampled test set and print the resulting concepts DataFrame.
print(evaluator(test_set))

                                   whole_func_string  \
0  def get_vid_from_url(url):\n        """Extract...   

                                        ast_concepts  \
0  [(def, def, function_definition), (get_vid_fro...   

                            model_tokenizer_concepts  \
0  [(4299, def, function_definition), (651, ident...   

                                     model_input_ids  model_total_input_ids  
0  [4299, 651, 62, 16921, 62, 6738, 62, 6371, 7, ...                    212  


In [17]:
# Show the `max_len_single_sentence` value of the tokenizer held inside the evaluator's CodeTokenizer.
print(evaluator.tokenizer.tokenizer.max_len_single_sentence)

2048
