# Evaluation Module

In [None]:
#| default_exp evaluator

In [None]:
#| export
import CodeSyntaxConcept

from CodeSyntaxConcept.tokenizer import CodeTokenizer
from CodeSyntaxConcept.parser import TreeSitterParser
import CodeSyntaxConcept.utils as utils
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
class Evaluator:
    """Collect AST-level and tokenizer-level concepts for source code samples.

    The evaluator wraps a `CodeTokenizer` and a `TreeSitterParser` built on
    top of it. Calling the instance on an iterable of samples returns one
    DataFrame row per sample.
    """

    def __init__(self, checkpoint: str, language):
        """Create the tokenizer for `checkpoint`/`language` and its parser.

        Args:
            checkpoint: Identifier forwarded to `CodeTokenizer.from_pretrained`.
            language: Language argument forwarded to `CodeTokenizer.from_pretrained`.
        """
        self.tokenizer = CodeTokenizer.from_pretrained(checkpoint, language)
        self.parser = TreeSitterParser(self.tokenizer)

    def __call__(self, test_set):
        """Process every sample of `test_set` and tabulate the results.

        Args:
            test_set: Iterable of mappings; each must provide the key
                `'whole_func_string'` holding the source code to process.

        Returns:
            A `pandas.DataFrame` with one row per sample and the columns
            `whole_func_string`, `ast_concepts`, `model_tokenizer_concepts`,
            `model_input_ids` and `model_total_input_ids` (the length of
            `model_input_ids`). An empty `test_set` yields an empty frame
            that still carries these columns.
        """
        columns = [
            'whole_func_string',
            'ast_concepts',
            'model_tokenizer_concepts',
            'model_input_ids',
            'model_total_input_ids',
        ]
        # Accumulate plain records and build the frame once at the end;
        # appending through `.loc` re-allocates the frame on every sample.
        records = []
        for test_sample in test_set:
            source_code = test_sample['whole_func_string']
            ast_concepts = self.parser.process_source_code(source_code)
            source_code_encoding, tokenizer_concepts = self.parser.process_model_source_code(source_code)
            input_ids = source_code_encoding['input_ids']
            records.append({
                'whole_func_string': source_code,
                'ast_concepts': ast_concepts,
                'model_tokenizer_concepts': tokenizer_concepts,
                'model_input_ids': input_ids,
                'model_total_input_ids': len(input_ids),
            })
        # Passing `columns` fixes the column order and keeps the schema
        # intact when no sample was processed.
        return pd.DataFrame(records, columns=columns)

# Testing

In [None]:
#| hide
#| eval: false

from datasets import load_dataset
import pandas as pd


# Settings for this smoke test; edit them here rather than inline below.
checkpoint = "EleutherAI/gpt-neo-125M"
language = "python"
maximum_number_of_samples = 2
# Third argument of utils.get_test_sets. Presumably a token-length cap: it
# equals the tokenizer's max_len_single_sentence for this checkpoint.
# TODO confirm against utils.get_test_sets.
max_token_number = 2048

evaluator = Evaluator(checkpoint, language)

# No dataset config is passed, so `datasets` defaults to code_search_net/all.
raw_test_split = load_dataset("code_search_net", split='test')
# Reuse `language` so the dataset selection cannot drift from the tokenizer's.
language_test_sets = utils.get_test_sets(raw_test_split, language, max_token_number, evaluator.tokenizer)
test_set = utils.get_random_sub_set_test_set(language_test_sets, maximum_number_of_samples)

No config specified, defaulting to: code_search_net/all


Downloading and preparing dataset code_search_net/all to /home/svelascodimate/.cache/huggingface/datasets/code_search_net/all/1.0.0/80a244ab541c6b2125350b764dc5c2b715f65f00de7a56107a28915fac173a27...


Downloading data files: 100%|██████████| 6/6 [00:00<00:00, 4808.14it/s]
Extracting data files: 100%|██████████| 6/6 [00:00<00:00, 1683.78it/s]


NotADirectoryError: [Errno 20] Not a directory: '/home/svelascodimate/.cache/huggingface/datasets/downloads/25ceeb4c25ab737d688bd56ea92bfbb1f199fe572470456cf2d675479f342ac7/python/final/jsonl/train'

In [None]:
#| hide
#| eval: false
# Run the evaluator on the sampled test set and print summary statistics
# of the resulting frame.
concepts_frame = evaluator(test_set)
print(concepts_frame.describe())

       model_total_input_ids
count               2.000000
mean              896.000000
std              1120.057141
min               104.000000
25%               500.000000
50%               896.000000
75%              1292.000000
max              1688.000000


In [None]:
#| hide
#| eval: false
# Print the single-sentence length limit reported by the tokenizer object
# wrapped inside the evaluator's CodeTokenizer.
inner_tokenizer = evaluator.tokenizer.tokenizer
print(inner_tokenizer.max_len_single_sentence)

2048
