# The ATE evaluation

counting rational tokens and classify them by semantic/ non-semantinc for either programming and natural language

In [2]:
from pathlib import Path
import csv
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import functools
import json
import nltk
import re

pd.options.display.float_format = '{:.2f}'.format

In [1]:
from code_rationales.loader import download_grammars
from tree_sitter import Language, Parser
import code_rationales

In [4]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

In [5]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [6]:
def param_default():
    return {
        'model_name' : '/workspaces/code-rationales/data/codeparrot-small/checkpoints/checkpoint-29000', 
        'cache_dir': '/workspaces/code-rationales/datax/df_cache_dir',
        'delimiter_sequence': '' ### BE VERY CAREFULL HERE ALWAYS VERIFY -> VERY IMPORTANT
    }

In [7]:
params = param_default()

## Natural language setup

In [8]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

## Taxonomies definition

In [26]:
#Programming Language Taxonomy
def pl_taxonomy_python() -> dict:
    return {
  "punctuation": ['{', '}', '[', ']', '(', ')','\"', ',', '.', '...', ';', ':'], 
  "exceptions": ['raise_statement','catch', 'try', 'finally', 'throw', 'throws', 'except'],
  "oop": ['def','class','instanceof','interface','private','protected','public','abstract','extends','package','this','implements','import','new','super'],
  "asserts": ['assert'],
  "types": ['tuple','set','list','pair','subscript','type','none','dictionary','integer','native','static','synchronized','transient','volatile','void','final','enum','byte','char','float','boolean','double','int','long','short','strictfp'],
  "conditionals": ['else', 'if', 'switch', 'case', 'default'],
  "loops": ['break', 'do', 'for', 'while', 'continue'],
  "operators": ['as','yield','is','@','in','and','or','not','**','slice','%','+','<','>','=','+','-','*','/','%','++','--','!','==','!=','>=','<=','&&','||','?',':','~','<<','>>','>>>','&','^','|','//'],
  "indentation": ['\n','\t'],
  "bool": ['true', 'false'], 
  "functional":['lambda','lambda_parameters'],
  "with" : ['with','with_item','with_statement','with_clause'], 
  "return" :['return'],
  "structural" : ['attribute', 'argument_list','parenthesized_expression','pattern_list','class_definition','function_definition','block'],
  "statements" : ['return_statement','break_statement','assignment','while_statement','expression_statement','assert_statement'],
  "expression": ['call','exec','async','ellipsis','unary_operator','binary_operator','as_pattern_target','boolean_operator','as_pattern','comparison_operator','conditional_expression','named_expression','not_operator','primary_expression','as_pattern'],
  "errors": ["ERROR"],
  "identifier":["identifier"],  
  "comment":["comment"],
  "string": ['string','interpolation','string_content','string_end','string_start','escape_sequence'], 
  "unknown": []
}

In [27]:
def nl_pos_taxonomy() -> dict: return {
    "nl_verb" : ['VBN', 'VBG', 'VBZ', 'VBP', 'VBD', 'VB'],
    "nl_noun" : ['NN', 'NNPS', 'NNS', 'NNP'],
    "nl_pronoun" : ['WP', 'PRP', 'PRP$', 'WP','WP$'], 
    "nl_adverb" : ['RBS','RBR', 'RB', 'WRB'], 
    "nl_adjetive" : ['JJR', 'JJS', 'JJ'], 
    "nl_determier" : ['DT','WDT','PDT'], 
    "nl_preposition" : ['IN', 'TO'],
    "nl_particle" : ['RP'],
    "nl_modal" : ['MD'],
    "nl_conjunction" : ['CC'],
    "nl_cardinal" : ['CD'],
    "nl_list": ['LS'],
    "nl_other" : ['FW', 'EX', 'SYM' , 'UH', 'POS', "''", '--',':', '(', ')', '.', ',', '``', '$']
}

In [28]:
def semantic_non_semantic_groups() -> dict:return{
    "nl_semantic": ['nl_verb','nl_noun','nl_pronoun'],
    "pl_semantic": ['types','exceptions','oop','conditionals','loops','bool','with','structural','asserts','statements'],
    "nl_non_semantic": ['nl_adverb','nl_adjetive','nl_determier','nl_preposition','nl_particle','nl_modal','nl_conjunction','nl_cardinal','nl_list'],
    "pl_non_semantic": ['punctuation','expression','operators','indentation','return','functional'],
    "unknown": ['unknown', 'nl_other', '']
}

In [112]:
def global_groups() -> dict:
    return {
        'sc_semantic': ['exceptions', 'oop', 'asserts', 'types', 'conditionals', 'loops', 'bool', 'structural', 'statements', 'with'], 
        'sc_nl': ['identifier', 'comment', 'string'],
        'sc_not_semantic': ['punctuation', 'operators', 'indentation', 'functional', 'return', 'expression'], ## TODO why unkown is here? 
        'sc_errors' : ['errors'], 
        'nl_semantic': ['nl_verb', 'nl_noun', 'nl_pronoun', 'nl_adjetive'],
        'nl_not_semantic' : ['nl_adverb', 'nl_determier', 'nl_preposition', 'nl_particle', 'nl_modal', 'nl_conjunction', 'nl_cardinal', 'nl_list', 'nl_other'],
        'unknown': ['unknown',''] ## NOTE DRC: I added this
    }

## AST management

In [1]:
def unroll_node_types(
    nested_node_types: dict  # node_types from tree-sitter
) -> list: # list of node types
    def iterate_and_unroll_dict(nested_node_types: dict, all_node_types: set):
        for key, value in nested_node_types.items():
            if key == 'type' and type(value) == str:
                all_node_types.add(value)
            if type(value) == dict:
                iterate_and_unroll_dict(value, all_node_types)
            if type(value) == list:
                for element in value:
                    iterate_and_unroll_dict(element, all_node_types) 
    all_node_types = set()
    for dictionary in nested_node_types:
        iterate_and_unroll_dict(dictionary, all_node_types)
    all_node_types.add('ERROR')
    return list(all_node_types)

In [2]:
def create_parser(lang: str):
    # Grab the node types from the tree-sitter language
    language = Language(f"{code_rationales.__path__[0]}/grammars/tree-sitter-languages.so", lang)
    node_path = f"{code_rationales.__path__[0]}/grammars/tree-sitter-{lang}/src/node-types.json"
    with open(node_path) as f:
            node_types = json.load(f)
    node_types = unroll_node_types(node_types)
    # Create a parser for the language
    parser = Parser()
    parser.set_language(language)
    return parser, node_types

In [3]:
def traverse(
    node,       # tree-sitter node
) -> None:
    """Traverse in a recursive way, a tree-sitter node and append results to a list."""
    results = []
    def traverse_tree(node, results):
        if node.type == 'string':
            results.append(node)
            return
        for n in node.children:
            traverse_tree(n, results)
        if not node.children:
            results.append(node)
    traverse_tree(node, results)
    return results

In [4]:
def convert_to_offset(
    point,              #point to convert
    lines: list         #list of lines in the source code
    ):
        """Convert the point to an offset"""
        row, column = point
        chars_in_rows = sum(map(len, lines[:row])) + row
        chars_in_columns = len(lines[row][:column])
        offset = chars_in_rows + chars_in_columns
        return offset

In [5]:
def get_node_span(node, lines):
    """Get the span position of the node in the code string"""
    start_span = convert_to_offset(node.start_point, lines)
    end_span = convert_to_offset(node.end_point, lines)
    return start_span, end_span

In [6]:
def is_token_span_in_node_span(tok_span, token: str, node_span, node_text: str):
    return (node_span[0] <= tok_span[0] and tok_span[1] <= node_span[1]) or \
            (node_span[0]-1 <= tok_span[0] and tok_span[1] <= node_span[1] and node_text in token)

In [7]:
def get_token_type(
    tok_span: tuple, # (start, end) position of a token in tokenizer
    token: str,   # token value
    nodes: list,     # list of tree-sitter nodes
    lines: list,     # list of lines in the code
) -> tuple: # (parent_type, token_type) of the token
    """Get the parent AST type and token AST type of a token."""
    for i, node in enumerate(nodes):
        if is_token_span_in_node_span(tok_span, token, get_node_span(node, lines), node.text.decode('utf-8')):
            return nodes[i].parent.type, nodes[i].type

In [None]:
def get_token_nodes(
    tok_span: tuple, # (start, end) position of a token in tokenizer
    token: str,      #actual token
    node,            # tree-sitter node
    lines: list,     # list of lines in the code
) -> list: 
    """Get all AST types for the given token span"""
    results = []
    def traverse_and_get_types(tok_span, node, lines, results) -> None:
        node_span = get_node_span(node, lines)
        if is_token_span_in_node_span(tok_span, token, node_span, node.text.decode('utf-8')):
            results.append(node)
        for n in node.children:
            traverse_and_get_types(tok_span, n, lines, results)
    traverse_and_get_types(tok_span, node, lines, results)
    return results

In [None]:
def get_nodes_by_type(
    node, 
    node_types: list
) -> list :
    def traverse_and_search(node, node_types, results):
        if node.type in node_types:
            results.append(node)
        for n in node.children:
            traverse_and_search(n, node_types ,results)
    results = []
    traverse_and_search(node, node_types, results)
    return results

## Taxonomy Mapping

In [None]:
def clean_results(global_results):
    def clean_dictonary(result_dict):
        clean_dict = result_dict.copy()
        for key, value in result_dict.items():
            if not value or not value['values']: 
                clean_dict.pop(key)
        return clean_dict
    for key, value in global_results.items():
        global_results[key] = clean_dictonary(value)
    return global_results

In [8]:
def search_category_by_token(taxonomy_dict: dict, token_type: str):
    for key, value in taxonomy_dict.items():
        if token_type in value:
            return key
    return 'unknown'

In [9]:
def map_to_taxonomy(taxonomy_dict: dict, result_dict: dict):
    result_dict = result_dict.copy()
    mappings = {token: {category : {'values': [], 'rationales': []} for category in taxonomy_dict.keys()} for token in result_dict.keys()}
    for target_token, value in result_dict.items():
        for source_token, props in value.items():
            mappings[target_token][search_category_by_token(taxonomy_dict, source_token)]['values'].append(props['values'])
            mappings[target_token][search_category_by_token(taxonomy_dict, source_token)]['rationales'].append(props['rationales'])
    return clean_results(mappings)

In [None]:
def map_local_results_to_taxonomy(taxonomy_dict:dict, local_results: dict):
    return dict(zip(local_results.keys(), map(lambda aggegrations: map_to_taxonomy(taxonomy_dict, aggegrations), local_results.values())))

### Model Sampling Generation

In [None]:
def df_sampled_generation(
        df_sampled_code, 
        model,
        tokenizer,
        number_samples_generation = 1,
        max_gen_tok = 100, 
        top_k = 0
    ):
    dict_generated_code = {i: [] for i in range(number_samples_generation)}
    for idx_prompt, prompt in enumerate(df_sampled_code['prompt']):
        input = tokenizer([prompt], return_tensors="pt")
        input.to(model.device)
        outputs = model.generate(**input, do_sample=True,
                                 max_length=len(df_sampled_code['input_ids'][idx_prompt]), ##Force rationalization
                                 top_k=top_k, 
                                 num_return_sequences=number_samples_generation, 
                                 pad_token_id=tokenizer.eos_token_id)
        for index, output in enumerate(outputs):
            dict_generated_code[index].append(output.tolist())
    df_temp = pd.DataFrame().from_dict(data=dict_generated_code) # DataFrame from Generation
    df_temp = pd.concat([df_sampled_code.reset_index(), df_temp ], axis=1) #Index before concating
    return df_temp

### Running Rationales

In [None]:
#If the model is not fine-tuned or compatible, it will rise an error
#This function works for one tensor of source token and one tensor of target tokens
def rationalize_model(model, tokenizer, input_ids, max_token_size: int, verbose=True):
    torch.cuda.empty_cache() #Cleaning Cache
    all_rationales, log = rationalize_lm(
        model = model,
        input_ids = input_ids[:max_token_size],
        tokenizer = tokenizer,
        verbose = verbose,
        max_steps=1024 #Max number of steps for greedy rationalization
    )
    return all_rationales, log

In [None]:
def run_multiple_rational(
    model,
    tokenizer, 
    arr_target_tokens, 
    seq_id, #mapping sequence id
    max_token_size,
    verbose=True
):
    arr_log = []
    for index, val in enumerate(arr_target_tokens):
        all_rationales, log = rationalize_model(
            model=model, 
            tokenizer=tokenizer, 
            input_ids=val,
            max_token_size=max_token_size,
            verbose=False
        )
        arr_log.append(log)
    arr_code_rationales = [ log['rationalization'] for log in arr_log ] #extracting just rationalizations
    arr_from_sentence = [ list(np.full( len(val), seq_id[arr_i] )) #arr_i maps to the real sequence id
                            for arr_i, val in enumerate(arr_code_rationales)]
    arr_code_rationales = sum( arr_code_rationales, [] ) #flatting
    arr_from_sentence = sum( arr_from_sentence, [] ) #flatting
    return arr_code_rationales, arr_from_sentence

In [None]:
def pandas_rationales( arr_code_rationales, arr_from_sentence ):
    #Creating pandas_1 {p_rationale}
    rational = lambda list_log,typeset: [ (dict_tok['added_token_text'],round(dict_tok['true_token_prob'],6)) for dict_tok in list_log if dict_tok['from']==typeset]
    log = lambda log_row: [(log_dict['added_token_text'],log_dict['true_token_prob']) for log_dict in log_row] #Typeset

    log_position = lambda log_row: [log_dict['added_token_position'] for log_dict in log_row] #Position of the Rationale
    log_prediction = lambda log_row: [log_dict['true_token_prob'] for log_dict in log_row] #Rationale Prob

    p_rationale = pd.DataFrame()

    p_rationale['goal_token'] = [dict_token['goal_word'] for dict_token in arr_code_rationales]
    p_rationale['from_seq_id'] = arr_from_sentence

    p_rationale['typesets_tgt'] = [ log(log_row) for log_row in [dict_token['log'] for dict_token in arr_code_rationales]]
    
    p_rationale['rationale_pos_tgt'] = [ log_position(log_row) for log_row in [dict_token['log'] for dict_token in arr_code_rationales]]
    p_rationale['rationale_prob_tgt'] = [ log_prediction(log_row) for log_row in [dict_token['log'] for dict_token in arr_code_rationales]]


    return p_rationale

In [None]:
#Running Rationalization
def run_code_rational( 
        df_generated_input,
        tensor_size, #Control the size of the experiment, 
        model,
        tokenizer,
        experiment = '5',
        batch_size = 100, 
        max_token_size = 44,
        verbose = True 
    ):

    arr_rationals = []
    arr_from_seq = []

    for i in range( 0 , tensor_size , batch_size ):
        print('************************' + str(i) + '************************')
        t_generated_input = df_generated_input[experiment].values[i:i+batch_size]
        t_generated_input = [ torch.tensor(s).to(model.device) for s in t_generated_input]

        t_arr_rationals,t_arr_from_seq = run_multiple_rational(
            model = model,
            tokenizer = tokenizer,
            arr_target_tokens =  t_generated_input, 
            seq_id = list(range(i,i+batch_size)),
            max_token_size = len(t_generated_input[0]),
            verbose = verbose
        )

        arr_rationals = arr_rationals + t_arr_rationals
        arr_from_seq = arr_from_seq + t_arr_from_seq

        torch.cuda.empty_cache() #Cleaning Cache
        
    print("Experiment Finished: " + str(experiment))
    return pandas_rationales( arr_rationals, arr_from_seq )

In [None]:
def run_code_rational_all_set(exp, df_generated_input, model, tokenizer, tensor_n = 100, BATCH = 10): #When Tensor_n and batch differs then 'from_seq_id' is lost
    torch.cuda.empty_cache() #Cleaning Cache
    EXP = exp
    test_arr_rationals = run_code_rational( 
            df_generated_input,
            tensor_n,
            model, 
            tokenizer,
            experiment = EXP,
            batch_size = BATCH,
            verbose = False 
        )
    #Saving process
    return test_arr_rationals

### Rationales Tagging

In [None]:
calculate_right_span = lambda start_idx, end_idx, df : len(''.join(map(str, df.loc[start_idx:end_idx, 'goal_token'].tolist())))
calculate_span = lambda right_span, token : (right_span-len(str(token)), right_span)
delete_leading_spaces = lambda string: re.sub(r'^\s+', '', string)
delete_leading_breaks = lambda string: re.sub(r'^\n+', '', string)

In [None]:
def add_first_token_row(df):
    df.loc[-1] = [df['typesets_tgt'][0][0][0], df['from_seq_id'][0], None, None, None, df['exp'][0]]
    df.index = df.index + 1
    df = df.sort_index()
    return df

In [None]:
def add_auxiliary_columns_to_experiment_result(df, delimiter_sequence: str):
    df.insert(0, 'rational_pos', [i for i in range(len(df))])
    initial_token = df['goal_token'][0]
    ### TOKEN TYPE COLUMN
    token_type_column = ['src'] * len(df)
    sequence = initial_token
    for idx, goal_token in enumerate(df['goal_token']):
        if delimiter_sequence not in sequence:
            token_type_column[idx] = 'nl'
            sequence+=goal_token
    df['token_type'] = token_type_column
    src_initial_token_idx = df[df['token_type'] == 'src'].first_valid_index()
    df['span'] = [None] * len(df[:src_initial_token_idx]) + [calculate_span(calculate_right_span(src_initial_token_idx, index, df), token) for index, token in df[src_initial_token_idx:]['goal_token'].items()]


In [None]:
def fill_nl_tags_in_experiment_result(df, nl_ast_types, nl_pos_types, parser):
    #initial_token = df['typesets_tgt'][0][0][0] if df[df['token_type'] == 'src'].first_valid_index() == 0 else ''
    ##### POS TAGS FOR NL PART
    target_nl = ''.join(df[df['token_type'] == 'nl']['goal_token'].map(lambda value: str(value)))
    pos_tags = nltk.pos_tag(nltk.word_tokenize(target_nl))
    for idx in range(df[df['token_type']== 'src'].first_valid_index()):
        nl_tags = list(map(lambda tag: tag[1] if tag[1] in nl_pos_types else None, filter(lambda tag: tag[0] in str(df['goal_token'][idx]), pos_tags)))
        if nl_tags: df.at[idx, 'tags'] = df['tags'][idx] + [nl_tags[-1]]
    ##### POS TAGS FOR CODE PART
    target_code = ''.join(df[df['token_type'] == 'src']['goal_token'].map(lambda value: str(value)))
    nl_target_nodes = get_nodes_by_type(parser.parse(bytes(target_code, 'utf8')).root_node, nl_ast_types)
    for token_idx in range(df[df['token_type'] == 'src'].first_valid_index(), len(df['span'])):
                for nl_target_node in nl_target_nodes:
                    if is_token_span_in_node_span(df['span'][token_idx], df['goal_token'][token_idx], get_node_span(nl_target_node, target_code.split("\n")), nl_target_node.text.decode('utf-8')) and \
                            (str(df['goal_token'][token_idx]) in nl_target_node.text.decode('utf-8') or nl_target_node.text.decode('utf-8') in str(df['goal_token'][token_idx])):
                            tagged_token_list = list(filter(lambda tagged_token: str(tagged_token[0]).replace(' ','') in str(df['goal_token'][token_idx]).replace(' ','') or str(df['goal_token'][token_idx]).replace(' ','') in str(tagged_token[0]).replace(' ',''), \
                                                        nltk.pos_tag( nltk.word_tokenize(nl_target_node.text.decode('utf-8')))))
                            if len(tagged_token_list)>0 and tagged_token_list[0][1] in nl_pos_types and tagged_token_list[0][1] not in df['tags'][token_idx]: df.at[token_idx, 'tags'] = df['tags'][token_idx] + [tagged_token_list[0][1]]

In [None]:
def fill_ast_tags_in_experiment_result(df, parser):
    target_code = ''.join(df[df['token_type'] == 'src']['goal_token'].map(lambda value: str(value)))
    #target_code = delete_leading_breaks(delete_leading_spaces(target_code))
    src_initial_token_idx = df[df['token_type'] == 'src'].first_valid_index()
    target_ast = parser.parse(bytes(target_code, 'utf8')).root_node
    for token_idx in range(src_initial_token_idx, len(df)):
        df.at[token_idx, 'tags'] = df['tags'][token_idx] + list(map(lambda node: node.type, get_token_nodes(df['span'][token_idx], df['goal_token'][token_idx], target_ast, target_code.split("\n"))))

In [None]:
def tag_rationals(experiment_results: list, nl_ast_types: list, nl_pos_types: list, delimiter_sequence: str, parser):
    experiments = {}
    for exp_idx, df_experiment in enumerate(experiment_results):
        experiment_results = []
        experiment_rational_results = [df_experiment[(df_experiment['from_seq_id'] == sample_idx) | \
                                                     (df_experiment['from_seq_id'] == str(sample_idx))].reset_index() \
                                                    for sample_idx in range(len(prompts))]
        print('*'*10 +'Tagging rationals for exp: ' +str(exp_idx) + '*'*10)
        for experiment_rational_result in experiment_rational_results:
            experiment_rational_result = experiment_rational_result.drop('index', axis=1)
            experiment_rational_result = add_first_token_row(experiment_rational_result)
            add_auxiliary_columns_to_experiment_result(experiment_rational_result, delimiter_sequence)
            experiment_rational_result['tags'] = [[]]*len(experiment_rational_result)
            fill_nl_tags_in_experiment_result(experiment_rational_result, nl_ast_types, nl_pos_types, parser)
            fill_ast_tags_in_experiment_result(experiment_rational_result, parser)
            experiment_results.append(experiment_rational_result)
        experiments[exp_idx] = experiment_results
    return experiments
            

### Rationales Aggregation

In [None]:
def aggregate_rationals(global_tagged_results: dict, ast_node_types: list, nl_pos_types: list, number_samples: int):
    aggregation_results = {sample_id: None  for sample_id in range(number_samples)}
    for exp_idx, experiment_results in global_tagged_results.items():
        print('*'*10 +'Aggregrating rationals for exp: ' +str(exp_idx) + '*'*10)
        for experiment_result in experiment_results:
            ### GET INFORMATION OF FIRST TOKEN
            #sample_results = {str(pos+1)+'['+str(token)+']' : {node_type : {'values': [], 'rationales': []} for node_type in ast_node_types + nl_pos_types} for pos, token in enumerate(experiment_result['goal_token'].tolist())}
            sample_results = {str(token_pos)+'['+str(experiment_result['goal_token'][token_pos])+']' : {node_type : {'values': [], 'rationales': []} for node_type in ast_node_types + nl_pos_types} for token_pos in range(1, len(experiment_result['rational_pos']))}
            for target_idx, target_token in enumerate(experiment_result['goal_token'].tolist()): 
                if target_idx > 0: # INITIAL TOKEN IS IGNORED
                    for rational_idx, rational_pos in enumerate(experiment_result['rationale_pos_tgt'][target_idx]):
                        for rational_tag in experiment_result['tags'][rational_pos]: 
                            if rational_tag:
                                try:
                                    sample_results[str(target_idx)+'['+str(target_token)+']'][rational_tag]['values'].append(experiment_result['rationale_prob_tgt'][target_idx][rational_idx])
                                    sample_results[str(target_idx)+'['+str(target_token)+']'][rational_tag]['rationales'].append(str(rational_pos)+'['+str(experiment_result['goal_token'][rational_pos])+']')
                                except Exception as e:
                                    print('An Error Occurred')
            aggregation_results[experiment_result['from_seq_id'].unique()[0]] = clean_results(sample_results)
    return aggregation_results

## --------------- Reading form previous tagged processed files ------------

In [4]:
import json
import pandas as pd

In [12]:
def param_default():
    return {
        'dataset' : 'code_completion_random_cut_5k_30_512_tokens',
        #'dataset' : 'code_completion_docstring_random_cut_3.8k_30_150_tokens',
        #'dataset' : 'code_completion_docstring_signature_3.8k_30_150_tokens',
        #'dataset' : 'code_completion_docstring_5k_30_150_tokens',
        'rational_results': '/workspaces/code-rationales/data/rationales/gpt',
        'tagged_rationales': '/workspaces/code-rationales/data/tagged_rationales',
        'delimiter_sequence': '', ## VERY IMPOETANT
        'num_samples' : 100, 
        'size_samples' : 44,
        'num_experiments': 30,
        'model_name' : '/workspaces/code-rationales/data/codeparrot-small/checkpoints/checkpoint-29000', 
        'cache_dir': '/workspaces/code-rationales/datax/df_cache_dir',
        'galeras_path': '/workspaces/code-rationales/semeru-datasets/semeru/galeras/code_rationales/'
    }
params = param_default()

In [15]:
path_tagged = '{}/{}{}{}.csv'.format(params['tagged_rationales'], params['dataset'],'_exp_' , str(0))
path_dataset = '{}{}.json'.format(params['galeras_path'],params['dataset'])
tagged_results = pd.read_csv(path_tagged)
with open(path_dataset) as json_file:
    galeras_dataset = json.load(json_file)


In [18]:
galeras_dataset = pd.DataFrame.from_dict(galeras_dataset)

In [19]:
galeras_dataset.head()

Unnamed: 0,id,commit_id,repo,path,file_name,fun_name,commit_message,code,url,language,...,n_whitespaces,n_words,vocab_size,complexity,nloc,token_counts,n_ast_nodes,n_identifiers,signature,prompt
0,159555,f9437064b3501869f5f56fb9e6d345d81ffeec5e,rasa,tests/core/test_evaluation.py,test_evaluation.py,skip_on_CI,Skip e2e tests on CI: these tests take too lon...,def skip_on_CI() -> bool:\n \n return os...,https://github.com/RasaHQ/rasa.git,Python,...,18,12,12,1,3,28,55,5,def skip_on_CI() -> bool,def skip_on_CI() -> bool:\n \n return os...
1,276430,84afc5193d38057e2e2badf9c889ea87d80d8fbf,keras,keras/tests/custom_training_loop_test.py,custom_training_loop_test.py,call,Reformatting the codebase with black.\n\nPiper...,"def call(self, inputs):\n self.add_loss...",https://github.com/keras-team/keras.git,Python,...,21,8,8,1,3,24,38,7,"def call(self, inputs)","def call(self, inputs):\n self.add_loss..."
2,210956,39531637b5675a36409c303db022bfab90939896,PaddleDetection,deploy/python/det_keypoint_unite_infer.py,det_keypoint_unite_infer.py,smoothing_factor,one euro filter and ema smoothing for keypoint...,"def smoothing_factor(self, te, fc):\n r...",https://github.com/PaddlePaddle/PaddleDetectio...,Python,...,36,19,16,1,3,28,42,7,"def smoothing_factor(self, te, fc)","def smoothing_factor(self, te, fc):\n r..."
3,190542,4fc3616712edb19179b17dd270ad6cf63abf99c2,DeOldify,fastai/vision/image.py,image.py,data,Upgrading to support latest Pytorch version,"def data(self)->TensorImage:\n ""Return ...",https://github.com/jantic/DeOldify.git,Python,...,24,11,11,1,3,12,22,4,def data(self)->TensorImage,"def data(self)->TensorImage:\n ""Return ..."
4,189667,e040bcacd38378386749db18aeba575b93f4ebca,manim,manim/mobject/geometry/arc.py,arc.py,stop_angle,Improved structure of the :mod:`.mobject` modu...,def stop_angle(self):\n return angle_of...,https://github.com/ManimCommunity/manim.git,Python,...,14,8,8,1,2,24,39,6,def stop_angle(self),def stop_angle(self):\n return angle_of...


In [6]:
tagged_results

Unnamed: 0,rational_pos,goal_token,from_seq_id,typesets_tgt,rationale_pos_tgt,rationale_prob_tgt,exp,token_type,span,tags
0,0,def,0,,,,0,src,"(0, 3)","['module', 'function_definition', 'def']"
1,1,skip,0,"[('def', 0.00029721998726017773)]",[0],[0.00029721998726017773],0,src,"(3, 8)","['module', 'function_definition']"
2,2,_,0,"[(' skip', 0.3496493101119995)]",[1],[0.3496493101119995],0,src,"(8, 9)","['NN', 'module', 'function_definition', 'ident..."
3,3,on,0,"[('_', 0.001134732156060636), (' skip', 0.0059...","[2, 1, 0]","[0.001134732156060636, 0.0059091681614518166, ...",0,src,"(9, 11)","['NN', 'module', 'function_definition', 'ident..."
4,4,_,0,"[('on', 0.2272750586271286)]",[3],[0.2272750586271286],0,src,"(11, 12)","['NN', 'module', 'function_definition', 'ident..."
...,...,...,...,...,...,...,...,...,...,...
4395,39,(',99,"[('override', 0.030208399519324303), (' import...","[38, 14, 33, 35, 2]","[0.030208399519324303, 0.08234483003616333, 0....",0,src,"(164, 166)","['module', 'function_definition', 'block', 'ER..."
4396,40,pip,99,"[(""('"", 0.00017135975940618664), ('pip', 0.240...","[39, 16]","[0.00017135975940618664, 0.24006126821041107]",0,src,"(166, 169)","['NNS', 'module', 'function_definition', 'bloc..."
4397,41,"',",99,"[('pip', 0.024009736254811287), (""('"", 0.13641...","[40, 39, 38]","[0.024009736254811287, 0.13641567528247833, 0....",0,src,"(169, 171)","['module', 'function_definition', 'block', 'ER..."
4398,42,',99,"[(""',"", 0.31870272755622864)]",[41],[0.31870272755622864],0,src,"(171, 173)","['module', 'function_definition', 'block', 'ER..."


In [21]:
tagged_results.head()

Unnamed: 0,rational_pos,goal_token,from_seq_id,typesets_tgt,rationale_pos_tgt,rationale_prob_tgt,exp,token_type,span,tags,tag_count
0,0,def,0,,,,0,src,"(0, 3)","['module', 'function_definition', 'def']",40
1,1,skip,0,"[('def', 0.00029721998726017773)]",[0],[0.00029721998726017773],0,src,"(3, 8)","['module', 'function_definition']",33
2,2,_,0,"[(' skip', 0.3496493101119995)]",[1],[0.3496493101119995],0,src,"(8, 9)","['NN', 'module', 'function_definition', 'ident...",53
3,3,on,0,"[('_', 0.001134732156060636), (' skip', 0.0059...","[2, 1, 0]","[0.001134732156060636, 0.0059091681614518166, ...",0,src,"(9, 11)","['NN', 'module', 'function_definition', 'ident...",53
4,4,_,0,"[('on', 0.2272750586271286)]",[3],[0.2272750586271286],0,src,"(11, 12)","['NN', 'module', 'function_definition', 'ident...",53


In [48]:
import re

In [94]:
def convert_string_to_array(input):
    pattern = "(',\s+')|(\')"
    x = re.sub(pattern," ",input)
    return x.split(" ")[1:-1]

In [39]:
def get_keys_by_value(values, target):
    keys = []
    for key, value in values.items():
        if target in value:
            keys.append(key)
    return keys

In [121]:
from collections import Counter

In [119]:
def tag_classification(tags) -> dict:
    sc_dict = pl_taxonomy_python()
    nl_dict = nl_pos_taxonomy()
    categories = list()
    semantic_dic = global_groups()
    semantic_non_semantic = list()
    for tag in tags:
        sc_keys = get_keys_by_value(sc_dict,tag)
        nl_keys = get_keys_by_value(nl_dict,tag)
        if sc_keys:
            categories.extend(sc_keys)
        if nl_keys:
            categories.extend(nl_keys)
        if not sc_keys and not nl_keys:
            categories.append('unknown')
    for category in categories:
        semantic_non_semantic.extend(get_keys_by_value(semantic_dic,category))
    frequency_count = Counter(semantic_non_semantic)
    result = dict(frequency_count)
    return result
    

In [95]:
tagged_results['tag_array'] = tagged_results['tags'].apply(lambda x: convert_string_to_array(x))

In [142]:
tagged_results.head()

Unnamed: 0,rational_pos,goal_token,from_seq_id,typesets_tgt,rationale_pos_tgt,rationale_prob_tgt,exp,token_type,span,tags,tag_count,tag_array,tag_semantic_array
0,0,def,0,,,,0,src,"(0, 3)","['module', 'function_definition', 'def']",,"[module, function_definition, def]","{'unknown': 1, 'sc_semantic': 2}"
1,1,skip,0,"[('def', 0.00029721998726017773)]",[0],[0.00029721998726017773],0,src,"(3, 8)","['module', 'function_definition']",,"[module, function_definition]","{'unknown': 1, 'sc_semantic': 1}"
2,2,_,0,"[(' skip', 0.3496493101119995)]",[1],[0.3496493101119995],0,src,"(8, 9)","['NN', 'module', 'function_definition', 'ident...",,"[NN, module, function_definition, identifier]","{'nl_semantic': 1, 'unknown': 1, 'sc_semantic'..."
3,3,on,0,"[('_', 0.001134732156060636), (' skip', 0.0059...","[2, 1, 0]","[0.001134732156060636, 0.0059091681614518166, ...",0,src,"(9, 11)","['NN', 'module', 'function_definition', 'ident...",,"[NN, module, function_definition, identifier]","{'nl_semantic': 1, 'unknown': 1, 'sc_semantic'..."
4,4,_,0,"[('on', 0.2272750586271286)]",[3],[0.2272750586271286],0,src,"(11, 12)","['NN', 'module', 'function_definition', 'ident...",,"[NN, module, function_definition, identifier]","{'nl_semantic': 1, 'unknown': 1, 'sc_semantic'..."


In [143]:
tagged_results['tag_semantic_array'] = tagged_results['tag_array'].apply(lambda x: tag_classification(x))
tagged_results['tag_count'] = tagged_results['tag_array'].apply(lambda x: len(x))

In [125]:
def sum_counters(counter_list):
    result_counter = Counter()
    for counter in counter_list:
        result_counter += counter
    return result_counter

In [130]:
taggeg_grouped = tagged_results.groupby('from_seq_id')['tag_semantic_array'].apply(sum_counters).reset_index()

In [131]:
taggeg_grouped

Unnamed: 0,from_seq_id,level_1,tag_semantic_array
0,0,unknown,46.0
1,0,sc_semantic,94.0
2,0,nl_semantic,20.0
3,0,sc_nl,21.0
4,0,sc_not_semantic,51.0
...,...,...,...
695,99,nl_semantic,26.0
696,99,sc_nl,26.0
697,99,sc_not_semantic,8.0
698,99,nl_not_semantic,5.0


In [134]:
tagged_grouped = taggeg_grouped.pivot(index='from_seq_id', columns='level_1', values='tag_semantic_array').reset_index()


In [145]:
tagged_total_counts = tagged_results.groupby('from_seq_id')['tag_count'].sum().reset_index()

In [138]:
selected_galeras = galeras_dataset.head(100)

In [147]:
result_df = pd.concat([selected_galeras,tagged_grouped, tagged_total_counts.iloc[:,1] ], axis=1)

In [151]:
result_df

Unnamed: 0,id,commit_id,repo,path,file_name,fun_name,commit_message,code,url,language,...,prompt,from_seq_id,nl_not_semantic,nl_semantic,sc_errors,sc_nl,sc_not_semantic,sc_semantic,unknown,tag_count
0,159555,f9437064b3501869f5f56fb9e6d345d81ffeec5e,rasa,tests/core/test_evaluation.py,test_evaluation.py,skip_on_CI,Skip e2e tests on CI: these tests take too lon...,def skip_on_CI() -> bool:\n \n return os...,https://github.com/RasaHQ/rasa.git,Python,...,def skip_on_CI() -> bool:\n \n return os...,0,6.0,20.0,9.0,21.0,51.0,94.0,46.0,240
1,276430,84afc5193d38057e2e2badf9c889ea87d80d8fbf,keras,keras/tests/custom_training_loop_test.py,custom_training_loop_test.py,call,Reformatting the codebase with black.\n\nPiper...,"def call(self, inputs):\n self.add_loss...",https://github.com/keras-team/keras.git,Python,...,"def call(self, inputs):\n self.add_loss...",1,14.0,22.0,44.0,22.0,76.0,47.0,48.0,258
2,210956,39531637b5675a36409c303db022bfab90939896,PaddleDetection,deploy/python/det_keypoint_unite_infer.py,det_keypoint_unite_infer.py,smoothing_factor,one euro filter and ema smoothing for keypoint...,"def smoothing_factor(self, te, fc):\n r...",https://github.com/PaddlePaddle/PaddleDetectio...,Python,...,"def smoothing_factor(self, te, fc):\n r...",2,9.0,20.0,44.0,20.0,41.0,63.0,53.0,240
3,190542,4fc3616712edb19179b17dd270ad6cf63abf99c2,DeOldify,fastai/vision/image.py,image.py,data,Upgrading to support latest Pytorch version,"def data(self)->TensorImage:\n ""Return ...",https://github.com/jantic/DeOldify.git,Python,...,"def data(self)->TensorImage:\n ""Return ...",3,8.0,20.0,19.0,24.0,14.0,61.0,51.0,189
4,189667,e040bcacd38378386749db18aeba575b93f4ebca,manim,manim/mobject/geometry/arc.py,arc.py,stop_angle,Improved structure of the :mod:`.mobject` modu...,def stop_angle(self):\n return angle_of...,https://github.com/ManimCommunity/manim.git,Python,...,def stop_angle(self):\n return angle_of...,4,8.0,23.0,5.0,23.0,63.0,139.0,46.0,298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,158193,b64b41d8c1ac23c43f7a4e3f9f6339d6f0012ab2,d2l-zh,d2l/mxnet.py,mxnet.py,forward,[PaddlePaddle] Merge master into Paddle branch...,"def forward(self, X):\n # `X` shape: (b...",https://github.com/d2l-ai/d2l-zh.git,Python,...,"def forward(self, X):\n # `X` shape: (b...",95,15.0,20.0,44.0,28.0,14.0,7.0,48.0,163
96,289189,bbe63bca4733b31b6f6cf29270cdd62765309fdf,core,homeassistant/components/switchbot/sensor.py,sensor.py,native_value,Bump pySwitchbot to 0.20.0 for bleak 0.19 chan...,def native_value(self) -> str | int | None:\n ...,https://github.com/home-assistant/core.git,Python,...,def native_value(self) -> str | int | None:\n ...,96,10.0,17.0,5.0,17.0,56.0,110.0,48.0,252
97,189952,ce9acddd41f39929d499b45e52621a2849b8333e,manim,tests/test_graphical_units/test_threed.py,test_threed.py,test_Cone,Improved performance of ``test_threed.py`` (#2...,def test_Cone(scene):\n scene.add(Cone(reso...,https://github.com/ManimCommunity/manim.git,Python,...,def test_Cone(scene):\n scene.add(Cone(reso...,97,8.0,23.0,22.0,23.0,43.0,66.0,69.0,245
98,161543,c80ad26d872efe19a8b8eca022489c241f10b9e2,rich,benchmarks/benchmarks.py,benchmarks.py,time_render_unicode_heavy,Add initial benchmark suite,def time_render_unicode_heavy(self):\n ...,https://github.com/Textualize/rich.git,Python,...,def time_render_unicode_heavy(self):\n ...,98,10.0,27.0,1.0,27.0,64.0,168.0,46.0,332


In [150]:
result_df.describe()

Unnamed: 0,id,n_ast_errors,ast_levels,n_whitespaces,n_words,vocab_size,complexity,nloc,token_counts,n_ast_nodes,n_identifiers,from_seq_id,nl_not_semantic,nl_semantic,sc_errors,sc_nl,sc_not_semantic,sc_semantic,unknown,tag_count
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,71.0,100.0,100.0,100.0,100.0,100.0
mean,182748.12,0.05,8.46,19.03,7.81,7.58,1.27,4.87,26.12,33.64,5.12,49.5,8.82,21.2,15.788732,23.37,29.02,94.33,55.0,233.5
std,97359.286552,0.261116,1.209558,6.653707,2.762282,2.391821,1.052702,6.559217,35.117201,7.967763,1.320239,29.011492,2.637645,3.874287,14.224642,5.892394,14.49206,35.740664,13.37569,37.334821
min,284.0,0.0,5.0,5.0,3.0,3.0,1.0,2.0,8.0,17.0,2.0,0.0,2.0,14.0,1.0,14.0,6.0,7.0,5.0,140.0
25%,118701.75,0.0,8.0,14.0,6.0,6.0,1.0,2.0,15.0,28.75,4.0,24.75,7.0,19.0,5.5,20.0,18.75,66.75,48.0,210.75
50%,188258.5,0.0,8.0,20.0,7.5,7.0,1.0,3.0,18.0,32.5,5.0,49.5,9.0,21.0,12.0,23.0,26.5,93.5,50.0,233.0
75%,277977.25,0.0,9.0,23.0,9.0,9.0,1.0,4.0,24.0,39.0,6.0,74.25,10.0,24.0,18.5,26.0,35.25,121.5,57.5,254.25
max,334797.0,2.0,12.0,36.0,19.0,16.0,8.0,34.0,274.0,64.0,9.0,99.0,17.0,32.0,62.0,55.0,76.0,186.0,99.0,332.0
