# Local Aggregations Module

In [1]:
def param_default():
    return {
        'model_name' : '/workspaces/code-rationales/data/codeparrot-small/checkpoints/checkpoint-29000', 
        'cache_dir': '/workspaces/code-rationales/datax/df_cache_dir',
        'delimiter_sequence': ''
    }
prompts = [
    """def bubbleSort(arr):
    n = len(arr)
    # optimize code, so if the array is already sorted, it doesn't need
    # to go through the entire process
    swapped = False
    # Traverse through all array elements
    for i in range(n-1):
        # range(n) also work but outer loop will
        # repeat one time more than needed.
        # Last i elements are already in place
        for j in range(0, n-i-1):
 
            # traverse the array from 0 to n-i-1
            # Swap if the element found is greater
            # than the next element
            if arr[j] > arr[j + 1]:
                swapped = True
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
         
        if not swapped:
            # if we haven't needed to make a single swap, we
            # can just exit the main loop.
            return"""
]

In [2]:
params = param_default()

## CORE

### Imports

In [3]:
from pathlib import Path
import csv
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import functools
import json
import nltk

pd.options.display.float_format = '{:.2f}'.format

In [4]:
from code_rationales.loader import download_grammars
from tree_sitter import Language, Parser
import code_rationales

In [5]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

2023-09-05 00:43:31.998833: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-05 00:43:32.199699: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
import warnings
import importlib
from matplotlib import colors
import os

In [7]:
import sys
sys.path.insert(1, '/workspaces/code-rationales/sequential-rationales/huggingface')
from rationalization import rationalize_lm

In [8]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

### Setup

In [9]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

### Taxonomies

In [10]:
#Programming Language Taxonomy
def pl_taxonomy_python() -> dict:
    return {
  "punctuation": ['{', '}', '[', ']', '(', ')','\"', ',', '.', '...', ';', ':'], 
  "exceptions": ['raise_statement','catch', 'try', 'finally', 'throw', 'throws', 'except'],
  "oop": ['def','class','instanceof','interface','private','protected','public','abstract','extends','package','this','implements','import','new','super'],
  "asserts": ['assert'],
  "types": ['tuple','set','list','pair','subscript','type','none','dictionary','integer','native','static','synchronized','transient','volatile','void','final','enum','byte','char','float','boolean','double','int','long','short','strictfp'],
  "conditionals": ['else', 'if', 'switch', 'case', 'default'],
  "loops": ['break', 'do', 'for', 'while', 'continue'],
  "operators": ['as','yield','is','@','in','and','or','not','**','slice','%','+','<','>','=','+','-','*','/','%','++','--','!','==','!=','>=','<=','&&','||','?',':','~','<<','>>','>>>','&','^','|','//'],
  "indentation": ['\n','\t'],
  "bool": ['true', 'false'], 
  "functional":['lambda','lambda_parameters'],
  "with" : ['with','with_item','with_statement','with_clause'], 
  "return" :['return'],
  "structural" : ['attribute','module', 'argument_list','parenthesized_expression','pattern_list','class_definition','function_definition','block'],
  "statements" : ['return_statement','break_statement','assignment','while_statement','expression_statement','assert_statement'],
  "expression": ['call','exec','async','ellipsis','unary_operator','binary_operator','as_pattern_target','boolean_operator','as_pattern','comparison_operator','conditional_expression','named_expression','not_operator','primary_expression','as_pattern'],
  "errors": ["ERROR"],
  "identifier":["identifier"],  
  "comment":["comment"],
  "string": ['string','interpolation','string_content','string_end','string_start','escape_sequence'], 
  "unknown": []
}

In [11]:
def nl_pos_taxonomy() -> dict: return {
    "nl_verb" : ['VBN', 'VBG', 'VBZ', 'VBP', 'VBD', 'VB'],
    "nl_noun" : ['NN', 'NNPS', 'NNS', 'NNP'],
    "nl_pronoun" : ['WP', 'PRP', 'PRP$', 'WP','WP$'], 
    "nl_adverb" : ['RBS','RBR', 'RB', 'WRB'], 
    "nl_adjetive" : ['JJR', 'JJS', 'JJ'], 
    "nl_determier" : ['DT','WDT','PDT'], 
    "nl_preposition" : ['IN', 'TO'],
    "nl_particle" : ['RP'],
    "nl_modal" : ['MD'],
    "nl_conjunction" : ['CC'],
    "nl_cardinal" : ['CD'],
    "nl_list": ['LS'],
    "nl_other" : ['FW', 'EX', 'SYM' , 'UH', 'POS', "''", '--',':', '(', ')', '.', ',', '``', '$']
}

### AST Mapping

In [12]:
def unroll_node_types(
    nested_node_types: dict  # node_types from tree-sitter
) -> list: # list of node types
    def iterate_and_unroll_dict(nested_node_types: dict, all_node_types: set):
        for key, value in nested_node_types.items():
            if key == 'type' and type(value) == str:
                all_node_types.add(value)
            if type(value) == dict:
                iterate_and_unroll_dict(value, all_node_types)
            if type(value) == list:
                for element in value:
                    iterate_and_unroll_dict(element, all_node_types) 
    all_node_types = set()
    for dictionary in nested_node_types:
        iterate_and_unroll_dict(dictionary, all_node_types)
    all_node_types.add('ERROR')
    return list(all_node_types)

In [13]:
def create_parser(lang: str):
    # Grab the node types from the tree-sitter language
    language = Language(f"{code_rationales.__path__[0]}/grammars/tree-sitter-languages.so", lang)
    node_path = f"{code_rationales.__path__[0]}/grammars/tree-sitter-{lang}/src/node-types.json"
    with open(node_path) as f:
            node_types = json.load(f)
    node_types = unroll_node_types(node_types)
    # Create a parser for the language
    parser = Parser()
    parser.set_language(language)
    return parser, node_types

In [14]:
def traverse(
    node,       # tree-sitter node
) -> None:
    """Traverse in a recursive way, a tree-sitter node and append results to a list."""
    results = []
    def traverse_tree(node, results):
        if node.type == 'string':
            results.append(node)
            return
        for n in node.children:
            traverse_tree(n, results)
        if not node.children:
            results.append(node)
    traverse_tree(node, results)
    return results

In [15]:
def convert_to_offset(
    point,              #point to convert
    lines: list         #list of lines in the source code
    ):
        """Convert the point to an offset"""
        row, column = point
        chars_in_rows = sum(map(len, lines[:row])) + row
        chars_in_columns = len(lines[row][:column])
        offset = chars_in_rows + chars_in_columns
        return offset

In [16]:
def get_node_span(node, lines):
    """Get the span position of the node in the code string"""
    start_span = convert_to_offset(node.start_point, lines)
    end_span = convert_to_offset(node.end_point, lines)
    return start_span, end_span
    

In [17]:
def is_token_span_in_node_span(tok_span, node_span):
    return node_span[0] <= tok_span[0] and tok_span[1] <= node_span[1]

In [18]:
def get_token_type(
    tok_span: tuple, # (start, end) position of a token in tokenizer
    nodes: list,     # list of tree-sitter nodes
    lines: list,     # list of lines in the code
) -> tuple: # (parent_type, token_type) of the token
    """Get the parent AST type and token AST type of a token."""
    node_spans = [get_node_span(node, lines) for node in nodes]
    for i, span in enumerate(node_spans):
        if is_token_span_in_node_span(tok_span, span):
            return nodes[i].parent.type, nodes[i].type

In [19]:
def get_token_nodes(
    tok_span: tuple, # (start, end) position of a token in tokenizer
    node,            # tree-sitter node
    lines: list,     # list of lines in the code
) -> list: 
    """Get all AST types for the given token span"""
    results = []
    def traverse_and_get_types(tok_span, node, lines, results) -> None:
        node_span = get_node_span(node, lines)
        if is_token_span_in_node_span(tok_span, node_span):
            results.append(node)
        for n in node.children:
            traverse_and_get_types(tok_span, n, lines, results)
    traverse_and_get_types(tok_span, node, lines, results)
    return results

In [20]:
def get_nodes_by_type(
    node, 
    node_types: list
) -> list :
    def traverse_and_search(node, node_types, results):
        if node.type in node_types:
            results.append(node)
            return
        for n in node.children:
            traverse_and_search(n, node_types ,results)
    results = []
    traverse_and_search(node, node_types, results)
    return results

### Taxonomy Mapping

In [21]:
def clean_results(global_results):
    def clean_dictonary(result_dict):
        clean_dict = result_dict.copy()
        for key, value in result_dict.items():
            if not value or not value['values']: 
                clean_dict.pop(key)
        return clean_dict
    for key, value in global_results.items():
        global_results[key] = clean_dictonary(value)
    return global_results

In [22]:
def search_category_by_token(taxonomy_dict: dict, token_type: str):
    for key, value in taxonomy_dict.items():
        if token_type in value:
            return key
    return 'unknown'

In [23]:
def map_to_taxonomy(taxonomy_dict: dict, result_dict: dict):
    result_dict = result_dict.copy()
    mappings = {token: {category : {'values': [], 'rationales': []} for category in taxonomy_dict.keys()} for token in result_dict.keys()}
    for target_token, value in result_dict.items():
        for source_token, props in value.items():
            mappings[target_token][search_category_by_token(taxonomy_dict, source_token)]['values'].append(props['values'])
            mappings[target_token][search_category_by_token(taxonomy_dict, source_token)]['rationales'].append(props['rationales'])
    return clean_results(mappings)

In [24]:
def map_local_results_to_taxonomy(taxonomy_dict:dict, local_results: dict):
    return dict(zip(local_results.keys(), map(lambda aggegrations: map_to_taxonomy(taxonomy_dict, aggegrations), local_results.values())))

### Model Sampling Generation

In [25]:
def df_sampled_generation(
        df_sampled_code, 
        model,
        tokenizer,
        number_samples_generation = 1,
        max_gen_tok = 100, 
        top_k = 0
    ):
    dict_generated_code = {i: [] for i in range(number_samples_generation)}
    for idx_prompt, prompt in enumerate(df_sampled_code['prompt']):
        input = tokenizer([prompt], return_tensors="pt")
        input.to(model.device)
        outputs = model.generate(**input, do_sample=True,
                                 max_length=len(df_sampled_code['input_ids'][idx_prompt]), ##Force rationalization
                                 top_k=top_k, 
                                 num_return_sequences=number_samples_generation, 
                                 pad_token_id=tokenizer.eos_token_id)
        for index, output in enumerate(outputs):
            dict_generated_code[index].append(output.tolist())
    df_temp = pd.DataFrame().from_dict(data=dict_generated_code) # DataFrame from Generation
    df_temp = pd.concat([df_sampled_code.reset_index(), df_temp ], axis=1) #Index before concating
    return df_temp

### Running Rationales

In [26]:
#If the model is not fine-tuned or compatible, it will rise an error
#This function works for one tensor of source token and one tensor of target tokens
def rationalize_model(model, tokenizer, input_ids, max_token_size: int, verbose=True):
    torch.cuda.empty_cache() #Cleaning Cache
    all_rationales, log = rationalize_lm(
        model = model,
        input_ids = input_ids[:max_token_size],
        tokenizer = tokenizer,
        verbose = verbose,
        max_steps=1024 #Max number of steps for greedy rationalization
    )
    return all_rationales, log 

In [27]:
def run_multiple_rational(
    model,
    tokenizer, 
    arr_target_tokens, 
    seq_id, #mapping sequence id
    max_token_size,
    verbose=True
):
    arr_log = []
    for index, val in enumerate(arr_target_tokens):
        all_rationales, log = rationalize_model(
            model=model, 
            tokenizer=tokenizer, 
            input_ids=val,
            max_token_size=max_token_size,
            verbose=False
        )
        arr_log.append(log)
    arr_code_rationales = [ log['rationalization'] for log in arr_log ] #extracting just rationalizations
    arr_from_sentence = [ list(np.full( len(val), seq_id[arr_i] )) #arr_i maps to the real sequence id
                            for arr_i, val in enumerate(arr_code_rationales)]
    arr_code_rationales = sum( arr_code_rationales, [] ) #flatting
    arr_from_sentence = sum( arr_from_sentence, [] ) #flatting
    return arr_code_rationales, arr_from_sentence

In [28]:
def pandas_rationales( arr_code_rationales, arr_from_sentence ):
    #Creating pandas_1 {p_rationale}
    rational = lambda list_log,typeset: [ (dict_tok['added_token_text'],round(dict_tok['true_token_prob'],6)) for dict_tok in list_log if dict_tok['from']==typeset]
    log = lambda log_row: [(log_dict['added_token_text'],log_dict['true_token_prob']) for log_dict in log_row] #Typeset

    log_position = lambda log_row: [log_dict['added_token_position'] for log_dict in log_row] #Position of the Rationale
    log_prediction = lambda log_row: [log_dict['true_token_prob'] for log_dict in log_row] #Rationale Prob

    p_rationale = pd.DataFrame()

    p_rationale['goal_token'] = [dict_token['goal_word'] for dict_token in arr_code_rationales]
    p_rationale['from_seq_id'] = arr_from_sentence

    p_rationale['typesets_tgt'] = [ log(log_row) for log_row in [dict_token['log'] for dict_token in arr_code_rationales]]
    
    p_rationale['rationale_pos_tgt'] = [ log_position(log_row) for log_row in [dict_token['log'] for dict_token in arr_code_rationales]]
    p_rationale['rationale_prob_tgt'] = [ log_prediction(log_row) for log_row in [dict_token['log'] for dict_token in arr_code_rationales]]


    return p_rationale

In [29]:
#Running Rationalization
def run_code_rational( 
        df_generated_input,
        tensor_size, #Control the size of the experiment, 
        model,
        tokenizer,
        experiment = '5',
        batch_size = 100, 
        max_token_size = 44,
        verbose = True 
    ):

    arr_rationals = []
    arr_from_seq = []

    for i in range( 0 , tensor_size , batch_size ):
        print('************************' + str(i) + '************************')
        t_generated_input = df_generated_input[experiment].values[i:i+batch_size]
        t_generated_input = [ torch.tensor(s).to(model.device) for s in t_generated_input]

        t_arr_rationals,t_arr_from_seq = run_multiple_rational(
            model = model,
            tokenizer = tokenizer,
            arr_target_tokens =  t_generated_input, 
            seq_id = list(range(i,i+batch_size)),
            max_token_size = len(t_generated_input[0]),
            verbose = verbose
        )

        arr_rationals = arr_rationals + t_arr_rationals
        arr_from_seq = arr_from_seq + t_arr_from_seq

        torch.cuda.empty_cache() #Cleaning Cache
        
    print("Experiment Finished: " + str(experiment))
    return pandas_rationales( arr_rationals, arr_from_seq )

In [30]:
def run_code_rational_all_set(exp, df_generated_input, model, tokenizer, tensor_n = 100, BATCH = 10): #When Tensor_n and batch differs then 'from_seq_id' is lost
    torch.cuda.empty_cache() #Cleaning Cache
    EXP = exp
    test_arr_rationals = run_code_rational( 
            df_generated_input,
            tensor_n,
            model, 
            tokenizer,
            experiment = EXP,
            batch_size = BATCH,
            verbose = False 
        )
    #Saving process
    return test_arr_rationals


### Rationales Tagging

In [31]:
calculate_right_span = lambda start_idx, end_idx, initial_token, df : len(initial_token + ''.join(map(str, df.loc[start_idx:end_idx, 'goal_token'].tolist())))
calculate_span = lambda right_span, token : (right_span-len(str(token)), right_span)

In [32]:
def add_auxiliary_columns_to_experiment_result(df, delimiter_sequence: str):
    initial_token = df['typesets_tgt'][0][0][0]
    ### TOKEN TYPE COLUMN
    token_type_column = ['src'] * len(df)
    sequence = initial_token
    for idx, goal_token in enumerate(df['goal_token']):
        if delimiter_sequence not in sequence:
            token_type_column[idx] = 'nl'
            sequence+=goal_token
    df['token_type'] = token_type_column
    ### TOKEN SPAN COLUMN - CHECK FOR DATASETS
    src_initial_token_idx = df[df['token_type']== 'src'].first_valid_index()
    df['span'] = [None] * len(df[:src_initial_token_idx]) + [calculate_span(calculate_right_span(src_initial_token_idx, index, initial_token, df), token) for index, token in df[src_initial_token_idx:]['goal_token'].items()]


In [33]:
def fill_nl_tags_in_experiment_result(df, nl_ast_types, nl_pos_types, parser):
    initial_token = df['typesets_tgt'][0][0][0]
    ##### POS TAGS FOR NL PART
    target_nl = initial_token + ''.join(df[df['token_type'] == 'nl']['goal_token'].map(lambda value: str(value)))
    pos_tags = nltk.pos_tag(nltk.word_tokenize(target_nl))
    for idx in range(df[df['token_type']== 'src'].first_valid_index()):
        nl_tags = list(map(lambda tag: tag[1] if tag[1] in nl_pos_types else None, filter(lambda tag: tag[0] in str(df['goal_token'][idx]), pos_tags)))
        if nl_tags: df.at[idx, 'tags'] = df['tags'][idx] + [nl_tags[-1]]
    ##### POS TAGS FOR CODE PART
    target_code = ''.join(df[df['token_type'] == 'src']['goal_token'].map(lambda value: str(value)))
    nl_target_nodes = get_nodes_by_type(parser.parse(bytes(target_code, 'utf8')).root_node, nl_ast_types)
    for token_idx in range(df[df['token_type'] == 'src'].first_valid_index(), len(df['span'])):
                for nl_target_node in nl_target_nodes:
                    if is_token_span_in_node_span(df['span'][token_idx], get_node_span(nl_target_node, target_code.split("\n"))) and \
                            str(df['goal_token'][token_idx]) in nl_target_node.text.decode('utf-8'):
                            tagged_token_list = list(filter(lambda tagged_token: tagged_token[0] in str(df['goal_token'][token_idx]), \
                                                        nltk.pos_tag( nltk.word_tokenize(nl_target_node.text.decode('utf-8')))))
                            if len(tagged_token_list)>0 and tagged_token_list[0][1] in nl_pos_types and tagged_token_list[0][1] not in df['tags'][token_idx]: df.at[token_idx, 'tags'] = df['tags'][token_idx] + [tagged_token_list[0][1]]

In [34]:
def fill_ast_tags_in_experiment_result(df, parser):
    initial_token = df['typesets_tgt'][0][0][0] if df[df['token_type'] == 'src'].first_valid_index() == 0 else ''
    target_code = initial_token + ''.join(df[df['token_type'] == 'src']['goal_token'].map(lambda value: str(value)))
    src_initial_token_idx = df[df['token_type'] == 'src'].first_valid_index()
    target_ast = parser.parse(bytes(target_code, 'utf8')).root_node
    for token_idx in range(src_initial_token_idx, len(df)):
        df.at[token_idx, 'tags'] = df['tags'][token_idx] + list(map(lambda node: node.type, get_token_nodes(df['span'][token_idx], target_ast, target_code.split("\n"))))

In [35]:
def tag_rationals(experiment_results: list, nl_ast_types: list, nl_pos_types: list, delimiter_sequence: str, parser):
    experiments = {}
    for exp_idx, df_experiment in enumerate(experiment_results):
        experiment_results = []
        experiment_rational_results = [df_experiment[(df_experiment['from_seq_id'] == sample_idx) | \
                                                     (df_experiment['from_seq_id'] == str(sample_idx))].reset_index() \
                                                    for sample_idx in range(len(prompts))]
        print('*'*10 +'Tagging rationals for exp: ' +str(exp_idx) + '*'*10)
        for experiment_rational_result in experiment_rational_results:
            add_auxiliary_columns_to_experiment_result(experiment_rational_result, delimiter_sequence)
            experiment_rational_result['tags'] = [[]]*len(experiment_rational_result)
            fill_nl_tags_in_experiment_result(experiment_rational_result, nl_ast_types, nl_pos_types, parser)
            fill_ast_tags_in_experiment_result(experiment_rational_result, parser)
            experiment_results.append(experiment_rational_result)
        experiments[exp_idx] = experiment_results
    return experiments
            

### Rationales Aggregation

In [36]:
def aggregate_rationals(global_tagged_results: dict, ast_node_types: list, nl_pos_types: list, number_samples: int):
    aggregation_results = {sample_id: None  for sample_id in range(number_samples)}
    for exp_idx, experiment_results in global_tagged_results.items():
        print('*'*10 +'Aggregrating rationals for exp: ' +str(exp_idx) + '*'*10)
        for experiment_result in experiment_results:
            sample_results = {str(pos)+'['+str(token)+']' : {node_type : {'values': [], 'rationales': []} for node_type in ast_node_types + nl_pos_types} for pos, token in enumerate(experiment_result['goal_token'].tolist())}
            for target_idx, target_token in enumerate(experiment_result['goal_token'].tolist()):
                for rational_idx, rational_pos in enumerate(experiment_result['rationale_pos_tgt'][target_idx]):
                    for rational_tag in experiment_result['tags'][rational_pos]:
                        if rational_tag:
                            try:
                                sample_results[str(target_idx)+'['+str(target_token)+']'][rational_tag]['values'].append(experiment_result['rationale_prob_tgt'][target_idx][rational_idx])
                                sample_results[str(target_idx)+'['+str(target_token)+']'][rational_tag]['rationales'].append(str(rational_pos)+'['+str(experiment_result['goal_token'][rational_pos])+']')
                            except Exception as e:
                                print('An Error Occurred')
            aggregation_results[experiment_result['from_seq_id'].unique()[0]] = clean_results(sample_results)
    return aggregation_results

## LOCAL EXPERIMENT

### Parsing

In [37]:
### Define parser
parser, node_types = create_parser('python')
### Defines pos tags 
pos_types = list(nltk.data.load('help/tagsets/upenn_tagset.pickle'))

### Model, Tokenizer Loading

In [38]:
model = AutoModelForCausalLM.from_pretrained(params['model_name'], cache_dir=params['cache_dir'])
tokenizer = AutoTokenizer.from_pretrained(params['model_name'])
model.to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(32768, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


### Local Dataset Creation

In [39]:
df_sampled_code = pd.DataFrame(prompts, columns=['prompt'])
df_sampled_code['input_ids'] = tokenizer(df_sampled_code['prompt'].tolist())['input_ids']

### Execute

In [40]:
### SAMPLING GENERATION 
df_generated_input = df_sampled_generation(
    df_sampled_code=df_sampled_code, 
    model=model,
    tokenizer=tokenizer, 
    number_samples_generation=1,
    max_gen_tok=0)

Input length of input_ids is 205, but ``max_length`` is set to 205.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


In [41]:
### GET RATIONALES
experiment_results = []
for i in df_generated_input.columns[3:]: #Only Generated Sequences 
    experiment_result = run_code_rational_all_set(df_generated_input=df_generated_input, exp=i, tensor_n=df_generated_input.shape[0],model=model, tokenizer=tokenizer, BATCH=10)
    experiment_result['exp'] = i
    experiment_results.append(experiment_result)
df_experiment_results = pd.concat(experiment_results)

************************0************************
Experiment Finished: 0


In [42]:
###TAG EXPERIMENTS RESULTS - TAKES TIME
nl_ast_types = ['comment','identifier','string']
tagged_results = tag_rationals([df_experiment_results], nl_ast_types, pos_types, params['delimiter_sequence'], parser)

**********Tagging rationals for exp: 0**********


In [47]:
tagged_results[0][0].head(30)

Unnamed: 0,index,goal_token,from_seq_id,typesets_tgt,rationale_pos_tgt,rationale_prob_tgt,exp,token_type,span,tags
0,0,b,0,"[(def, 0.0011881806422024965)]",[0],[0.0011881806422024965],0,src,"(3, 5)","[module, function_definition]"
1,1,ubble,0,"[( b, 0.00011519427062012255), (def, 0.0047983...","[1, 0]","[0.00011519427062012255, 0.0047983708791434765]",0,src,"(5, 10)","[module, function_definition, identifier]"
2,2,Sort,0,"[(ubble, 2.473087442922406e-05), (def, 5.48147...","[2, 0, 1]","[2.473087442922406e-05, 5.4814721806906164e-05...",0,src,"(10, 14)","[module, function_definition, identifier]"
3,3,(,0,"[(Sort, 0.08000067621469498), (def, 0.17919571...","[3, 0]","[0.08000067621469498, 0.17919571697711945]",0,src,"(14, 15)","[module, function_definition, parameters, (]"
4,4,arr,0,"[((, 0.00025080275372602046), ( b, 0.001643841...","[4, 1, 0, 3, 2]","[0.00025080275372602046, 0.0016438415041193366...",0,src,"(15, 18)","[module, function_definition, parameters, iden..."
5,5,):,0,"[(arr, 0.0283211600035429), (def, 0.2197402417...","[5, 0, 4, 1, 3, 2]","[0.0283211600035429, 0.21974024176597595, 0.37...",0,src,"(18, 20)","[module, function_definition]"
6,6,\n,0,"[():, 0.1421736180782318), (def, 0.31778594851...","[6, 0, 1]","[0.1421736180782318, 0.31778594851493835, 0.65...",0,src,"(20, 24)","[module, function_definition]"
7,7,n,0,"[(\n , 0.0015395700465887785), (arr, 0.00421...","[7, 5, 1, 2, 0, 4, 3, 6]","[0.0015395700465887785, 0.004216337576508522, ...",0,src,"(24, 26)","[module, function_definition]"
8,8,=,0,"[( n, 0.02526974305510521), ():, 0.12217878550...","[8, 6, 0]","[0.02526974305510521, 0.12217878550291061, 0.3...",0,src,"(26, 28)","[module, function_definition, block, expressio..."
9,9,len,0,"[( =, 0.0038660855498164892), ( n, 0.035826124...","[9, 8, 7, 5, 3]","[0.0038660855498164892, 0.035826124250888824, ...",0,src,"(28, 32)","[module, function_definition, block, expressio..."


In [44]:
###AGGREGATE RATIONALS - AST
local_ast_aggregated_results = aggregate_rationals(tagged_results, node_types, pos_types, len(prompts))

**********Aggregrating rationals for exp: 0**********


In [45]:
###AGGREGATE RATIONALS - TAXONOMY
taxonomy = {**pl_taxonomy_python(), **nl_pos_taxonomy()}
local_taxonomy_aggregated_results = map_local_results_to_taxonomy(taxonomy, local_ast_aggregated_results)

### Visualize - AST Aggregation 

In [49]:
#local_ast_aggregated_results[<sample_id>][<pos[token]>] -> aggregated rationales
print(local_ast_aggregated_results[0].keys()) #target tokens
print(local_ast_aggregated_results[0]['1[ubble]'].keys()) #rationales
print(local_ast_aggregated_results[0]['1[ubble]']['function_definition']['rationales']) #rationales values


dict_keys(['0[ b]', '1[ubble]', '2[Sort]', '3[(]', '4[arr]', '5[):]', '6[\n   ]', '7[ n]', '8[ =]', '9[ len]', '10[(]', '11[arr]', '12[)]', '13[\n   ]', '14[ #]', '15[ optimize]', '16[ code]', '17[,]', '18[ so]', '19[ if]', '20[ the]', '21[ array]', '22[ is]', '23[ already]', '24[ sorted]', '25[,]', '26[ it]', '27[ doesn]', "28['t]", '29[ need]', '30[\n   ]', '31[ #]', '32[ to]', '33[ go]', '34[ through]', '35[ the]', '36[ entire]', '37[ process]', '38[\n   ]', '39[ swapped]', '40[ =]', '41[ False]', '42[\n   ]', '43[ #]', '44[ Tra]', '45[verse]', '46[ through]', '47[ all]', '48[ array]', '49[ elements]', '50[\n   ]', '51[ for]', '52[ i]', '53[ in]', '54[ range]', '55[(]', '56[n]', '57[-]', '58[1]', '59[):]', '60[\n       ]', '61[ #]', '62[ range]', '63[(]', '64[n]', '65[)]', '66[ also]', '67[ work]', '68[ but]', '69[ outer]', '70[ loop]', '71[ will]', '72[\n       ]', '73[ #]', '74[ repeat]', '75[ one]', '76[ time]', '77[ more]', '78[ than]', '79[ needed]', '80[.]', '81[\n       ]', '

### Visualize - Taxonomy Aggregation 

In [51]:
#local_ast_aggregated_results[<sample_id>][<pos[token]>] -> aggregated rationales
print(local_taxonomy_aggregated_results[0].keys()) #target tokens
print(local_taxonomy_aggregated_results[0]['1[ubble]'].keys()) #rationales
print(local_taxonomy_aggregated_results[0]['1[ubble]']['identifier']) #rationales values



dict_keys(['0[ b]', '1[ubble]', '2[Sort]', '3[(]', '4[arr]', '5[):]', '6[\n   ]', '7[ n]', '8[ =]', '9[ len]', '10[(]', '11[arr]', '12[)]', '13[\n   ]', '14[ #]', '15[ optimize]', '16[ code]', '17[,]', '18[ so]', '19[ if]', '20[ the]', '21[ array]', '22[ is]', '23[ already]', '24[ sorted]', '25[,]', '26[ it]', '27[ doesn]', "28['t]", '29[ need]', '30[\n   ]', '31[ #]', '32[ to]', '33[ go]', '34[ through]', '35[ the]', '36[ entire]', '37[ process]', '38[\n   ]', '39[ swapped]', '40[ =]', '41[ False]', '42[\n   ]', '43[ #]', '44[ Tra]', '45[verse]', '46[ through]', '47[ all]', '48[ array]', '49[ elements]', '50[\n   ]', '51[ for]', '52[ i]', '53[ in]', '54[ range]', '55[(]', '56[n]', '57[-]', '58[1]', '59[):]', '60[\n       ]', '61[ #]', '62[ range]', '63[(]', '64[n]', '65[)]', '66[ also]', '67[ work]', '68[ but]', '69[ outer]', '70[ loop]', '71[ will]', '72[\n       ]', '73[ #]', '74[ repeat]', '75[ one]', '76[ time]', '77[ more]', '78[ than]', '79[ needed]', '80[.]', '81[\n       ]', '