# Rationalization @ Global Granularity
> GPT-2 based global rationalization

In [1]:
from pathlib import Path
import csv
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import functools

pd.options.display.float_format = '{:.2f}'.format

In [2]:
from sacrebleu.metrics import BLEU

In [3]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

2023-07-13 20:49:06.515351: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-13 20:49:06.720195: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
import warnings
import importlib
from matplotlib import colors
import os

In [5]:
import sys
sys.path.insert(1, '/workspaces/code-rationales/sequential-rationales/huggingface')
from rationalization import rationalize_lm

In [6]:
def param_default():
    """Return a fresh dict holding the notebook's default paths and dataset name.

    Keys
    ----
    model_name       : checkpoint path handed to ``from_pretrained`` for model and tokenizer.
    cache_dir        : value passed as ``cache_dir`` when the model is loaded.
    dataset          : name of the sampling dataset (used to build CSV file/folder names).
    sampling_results : folder the generated-sequence CSV is read from.
    rational_results : folder the rationale CSVs are written to.
    """
    # Alternative dataset names kept for reference (disabled):
    #   code_completion_random_cut_5k_30_512_tokens
    #   code_completion_docstring_random_cut_3.8k_30_150_tokens
    #   code_completion_docstring_signature_3.8k_30_150_tokens
    #   code_completion_docstring_5k_30_150_tokens
    active_dataset = 'code_completion_docstring_signature_5k_30_512_tokens'

    # NOTE(review): 'cache_dir' lives under '.../datax/...' while every other path
    # uses '.../data/...' -- confirm this is intentional.
    defaults = dict(
        model_name='/workspaces/code-rationales/data/codeparrot-small/checkpoints/checkpoint-29000',
        cache_dir='/workspaces/code-rationales/datax/df_cache_dir',
        dataset=active_dataset,
        sampling_results='/workspaces/code-rationales/data/sampling/gpt',
        rational_results='/workspaces/code-rationales/data/rationales/gpt',
    )
    return defaults

In [7]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

## Model Loading and Testing

In [8]:
# Load the causal language model from the checkpoint path configured in param_default().
model = AutoModelForCausalLM.from_pretrained(
            param_default()['model_name'],
            cache_dir=param_default()['cache_dir'])

In [9]:
# Move the model to the selected device and switch it to evaluation mode.
model.to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(32768, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


## Tokenizer Loading and Testing

In [10]:
# The tokenizer is loaded from the same checkpoint path as the model.
tokenizer = AutoTokenizer.from_pretrained(param_default()['model_name'])

## Data Loading and Testing

In [11]:
#Loading Code Generation
# Reads '<sampling_results>/<dataset>.csv'; the first CSV column is used as the DataFrame index.
df_generated_input = pd.read_csv( param_default()['sampling_results'] + '/' + param_default()['dataset'] +'.csv', index_col=0)

In [12]:
df_generated_input.columns[5:] #Tensor Columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29'],
      dtype='object')

In [13]:
df_generated_input.head()

Unnamed: 0,index,prompt,ground_truth,size,input_ids,0,1,2,3,4,...,20,21,22,23,24,25,26,27,28,29
0,0,Generate Pyhton code that Tests checkboxes but...,Tests checkboxes but also acts a regression te...,50,"[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...",...,"[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ...","[6864, 1611, 517, 265, 1233, 626, 6496, 1104, ..."
1,1,Generate Pyhton code that \n Factory me...,\n Factory method to produce an instanc...,45,"[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...",...,"[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,...","[6864, 1611, 517, 265, 1233, 626, 4960, 23616,..."
2,2,Generate Pyhton code that True if this Entry h...,True if this Entry has references from any App...,52,"[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...",...,"[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64...","[6864, 1611, 517, 265, 1233, 626, 715, 340, 64..."
3,3,Generate Pyhton code that Set packet parent.\n...,Set packet parent.\n When packet is an ...,52,"[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...",...,"[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 2494, 3644, ..."
4,4,Generate Pyhton code that Remove packet parent...,Remove packet parent.\n When packet is ...,52,"[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...",...,"[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ...","[6864, 1611, 517, 265, 1233, 626, 5852, 3644, ..."


In [14]:
df_generated_input.shape

(100, 35)

In [15]:
#tst decoding
# The CSV stores each token list as a string, so it is parsed with eval() before decoding.
# NOTE(review): eval() executes arbitrary expressions -- only use it on trusted CSV files.
decoded = tokenizer.decode(eval(df_generated_input['0'][1]))
decoded

'Generate Pyhton code that \n        Factory method to produce an instance of this class using the default kube config location\n         and signature is def from_environment(cls):\n            with open(os.environ[\'WORKER_CRAMDOOT\'], \'r\') as f:\n                f.read() \n    from_configuration = Factory({\n        \n       \'version\': u\'\'\n        })\n    from_config.from_config_path()\n\n    from_context_manager = Factory(dict(\n        uuid\n        u\'username\': u\'berry\',\n        u\'password\': u\'password\'\n    ),\n        dict(username = u\'berry\', password = u\'pass\'\n    ),\n        u\'BOOKE_API_KEY\': oxo_site_config["BOOKE_API_KEY"]\n    )\n\n    return HttpContext(options_dict(\n        filtered_services=["requests", \n                             config_manager, \n                             config_loader,\n                             config_finders,\n                             config_finds_dir,\n                            config_manager_is_imported,\n 

## Running Rationales

In [16]:
#Statistics
# Mean number of token ids per sequence in experiment column '0' (each cell is parsed with eval()).
np.mean( [len(eval(i)) for i in df_generated_input['0'].values] )

464.0

In [17]:
#TODO Run the distribution of each experiment. The mean value of tokens or size for each experiment. 
# Mean number of token ids per row in the 'input_ids' column (each cell is parsed with eval()).
np.mean( [len(eval(i)) for i in df_generated_input['input_ids'].values] )

88.12

In [18]:
# NOTE(review): cells of column '0' are stringified lists (other cells eval() them first), so this
# is the character length of that string, not the number of tokens -- confirm this is intended.
len(df_generated_input['0'].values[1])

2392

In [19]:
# Global cap on sequence length: rationalize_model slices input_ids[:MAX_TOKEN_SIZE].
MAX_TOKEN_SIZE = df_generated_input['size'].max() #Global taken from the largest value in the 'size' column (original note: hard-coded)

In [20]:
#If the model is not fine-tuned or compatible, it will raise an error
#This function works on one tensor of token ids at a time
def rationalize_model(model, tokenizer, input_ids, verbose=True, max_steps=1024):
    """Run ``rationalize_lm`` on a single token sequence.

    Parameters
    ----------
    model, tokenizer : forwarded unchanged to ``rationalize_lm``.
    input_ids : sequence of token ids; only the first ``MAX_TOKEN_SIZE`` entries
        (module-level global) are rationalized.
    verbose : forwarded to ``rationalize_lm``.
    max_steps : max number of steps for greedy rationalization. Defaults to 1024,
        the value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(all_rationales, log)`` exactly as returned by ``rationalize_lm``.
    """
    truncated_ids = input_ids[:MAX_TOKEN_SIZE]
    all_rationales, log = rationalize_lm(
        model = model,
        input_ids = truncated_ids,
        tokenizer = tokenizer,
        verbose = verbose,
        max_steps = max_steps
    )
    return all_rationales, log

In [21]:
#tst <------- Test Case 2
def tst_rationalize_model():
    """Smoke test: rationalize the first sequence of experiment column '0'.

    Nothing is returned; the call only has to finish without raising.
    """
    torch.cuda.empty_cache() #Cleaning Cache
    # The CSV cell is a stringified list of token ids, hence the eval().
    first_sequence = eval(df_generated_input['0'][0])
    sample_ids = torch.tensor(first_sequence).to(model.device)
    #WARNING TIME CONSUMING
    rationalize_model(
        model=model,
        tokenizer=tokenizer,
        input_ids=sample_ids,
        verbose=False,
    )
tst_rationalize_model()


In [22]:
def run_multiple_rational(
    model,
    tokenizer, 
    arr_target_tokens, 
    seq_id, #mapping sequence id
    verbose=True
):
    """Rationalize several token sequences and flatten the results.

    Parameters
    ----------
    model, tokenizer : forwarded to ``rationalize_model``.
    arr_target_tokens : iterable of token-id sequences, one per sample.
    seq_id : indexable; ``seq_id[k]`` is the id attached to every rationale
        produced for the k-th entry of ``arr_target_tokens``.
    verbose : forwarded to ``rationalize_model`` (previously this argument was
        accepted but ignored, and ``False`` was always used).

    Returns
    -------
    tuple of two flat lists with the same length:
        ``arr_code_rationales`` -- the concatenated ``log['rationalization']``
        entries of every sequence, and ``arr_from_sentence`` -- for each of those
        entries, the ``seq_id`` value of the sequence it came from.

    Raises
    ------
    IndexError
        If ``seq_id`` has fewer entries than ``arr_target_tokens``.
    """
    arr_code_rationales = []
    arr_from_sentence = []
    for arr_i, val in enumerate(arr_target_tokens):
        # Look the id up first so a too-short seq_id fails before the expensive call.
        current_id = seq_id[arr_i] #arr_i maps to the real sequence id
        _, log = rationalize_model(
            model=model, 
            tokenizer=tokenizer, 
            input_ids=val,
            verbose=verbose
        )
        rationalization = log['rationalization'] #extracting just rationalizations
        # extend() flattens in linear time (sum(list_of_lists, []) is quadratic).
        arr_code_rationales.extend(rationalization)
        arr_from_sentence.extend([current_id] * len(rationalization))
    return arr_code_rationales, arr_from_sentence

In [23]:
import gc

In [24]:
#tst <------- Test Case 2
def tst_run_multiple_rationa():
    """Smoke test for ``run_multiple_rational`` on the first two sequences of column '0'.

    Returns
    -------
    tuple
        ``(arr_rations, seq_id)`` as returned by ``run_multiple_rational``; the two
        sequences are labelled with ids 2 and 3.
    """
    gc.collect()
    torch.cuda.empty_cache() #Cleaning Cache
    # Only the two sequences that are actually rationalized are parsed and moved to
    # the device; the previous version converted every row of every experiment column.
    # NOTE(review): eval() on CSV content -- only safe for trusted files.
    sample_tokens = [ torch.tensor(eval(s)).to(model.device)
                      for s in df_generated_input['0'].values[:2] ]

    arr_rations, seq_id = run_multiple_rational(
        model = model,
        tokenizer = tokenizer,
        arr_target_tokens = sample_tokens,
        seq_id = list( range(2,4) ),
        verbose = False
        )
    return arr_rations, seq_id
tst_arr_rations, seq_id = tst_run_multiple_rationa()

In [25]:
def pandas_rationales( arr_code_rationales, arr_from_sentence ):
    """Build the rationale DataFrame from flattened rationalization records.

    Parameters
    ----------
    arr_code_rationales : list of dicts, each with a ``'goal_word'`` entry and a
        ``'log'`` entry. ``'log'`` is a list of dicts providing
        ``'added_token_text'``, ``'true_token_prob'`` and ``'added_token_position'``.
    arr_from_sentence : list with one sequence id per element of
        ``arr_code_rationales``.

    Returns
    -------
    pandas.DataFrame with one row per record and the columns
        goal_token         -- ``'goal_word'`` of the record
        from_seq_id        -- matching entry of ``arr_from_sentence``
        typesets_tgt       -- list of ``(added_token_text, true_token_prob)`` tuples
        rationale_pos_tgt  -- list of ``added_token_position`` values
        rationale_prob_tgt -- list of ``true_token_prob`` values
    """
    goal_tokens = []
    typesets = []
    positions = []
    probabilities = []

    for dict_token in arr_code_rationales:
        log_row = dict_token['log']  # looked up once per record
        goal_tokens.append(dict_token['goal_word'])
        typesets.append([(log_dict['added_token_text'], log_dict['true_token_prob']) for log_dict in log_row]) #Typeset
        positions.append([log_dict['added_token_position'] for log_dict in log_row]) #Position of the Rationale
        probabilities.append([log_dict['true_token_prob'] for log_dict in log_row]) #Rationale Prob

    p_rationale = pd.DataFrame()
    p_rationale['goal_token'] = goal_tokens
    p_rationale['from_seq_id'] = arr_from_sentence
    p_rationale['typesets_tgt'] = typesets
    p_rationale['rationale_pos_tgt'] = positions
    p_rationale['rationale_prob_tgt'] = probabilities
    return p_rationale

In [26]:
#Running Rationalization
def run_code_rational( 
        df_generated_input,
        tensor_size = None, #Control the size of the experiment
        experiment = '5',
        batch_size = 100, 
        model = model, 
        verbose = True 
    ):
    """Rationalize the sequences of one experiment column, batch by batch.

    Parameters
    ----------
    df_generated_input : DataFrame whose ``experiment`` column holds stringified
        lists of token ids.
    tensor_size : number of leading rows to process. ``None`` (default) or a value
        larger than the frame means "all rows".
    experiment : name of the column to rationalize.
    batch_size : number of rows handed to ``run_multiple_rational`` per iteration.
    model : model used for rationalization and for picking the target device.
    verbose : forwarded to ``run_multiple_rational``.

    Returns
    -------
    pandas.DataFrame built by ``pandas_rationales``. ``from_seq_id`` holds the row
    *position* inside ``df_generated_input`` (0-based), not the index label.

    Notes
    -----
    * The last batch is clipped to ``tensor_size``; previously, when ``tensor_size``
      was not a multiple of ``batch_size``, rows beyond ``tensor_size`` were processed.
    * The tokenizer is taken from the notebook-level ``tokenizer`` variable.
    """
    n_rows = len(df_generated_input)
    tensor_size = n_rows if tensor_size is None else min(tensor_size, n_rows)

    sequences = df_generated_input[experiment].values
    arr_rationals = []
    arr_from_seq = []

    for i in range( 0 , tensor_size , batch_size ):
        print('************************' + str(i) + '************************')
        batch_end = min(i + batch_size, tensor_size)
        # NOTE(review): eval() on CSV content -- only safe for trusted files.
        t_generated_input = [ torch.tensor(eval(s)).to(model.device) for s in sequences[i:batch_end] ]

        t_arr_rationals, t_arr_from_seq = run_multiple_rational(
            model = model,
            tokenizer = tokenizer,
            arr_target_tokens = t_generated_input, 
            seq_id = list(range(i, batch_end)),
            verbose = verbose
        )

        # extend() avoids re-copying the accumulated lists on every batch.
        arr_rationals.extend(t_arr_rationals)
        arr_from_seq.extend(t_arr_from_seq)

        gc.collect()
        torch.cuda.empty_cache() #Cleaning Cache

    print("Experiment Finished: " + str(experiment))
    return pandas_rationales( arr_rationals, arr_from_seq )

In [27]:
#tst
def tst_run_code_rational_sampling_set(exp='0'):
    """Smoke test: rationalize three randomly sampled rows of one experiment column.

    The rows are drawn without replacement with ``random_state=2`` and processed
    one per batch. Returns the DataFrame produced by ``run_code_rational``.
    """
    gc.collect()
    torch.cuda.empty_cache()
    sample_size = 3 #df_generated_input.shape[0]
    sampled_rows = df_generated_input.sample(
        n = sample_size,
        replace = False,
        random_state = 2,
    )
    return run_code_rational(
        df_generated_input = sampled_rows,
        tensor_size = sample_size,
        experiment = exp,
        batch_size = 1,
        model = model,
        verbose = False,
    )
df_test_run = tst_run_code_rational_sampling_set()

************************0************************


************************1************************
************************2************************
Experiment Finished: 0


In [28]:
#tst
df_test_run[ df_test_run['from_seq_id'] == 1]

Unnamed: 0,goal_token,from_seq_id,typesets_tgt,rationale_pos_tgt,rationale_prob_tgt
463,Py,1,"[(Generate, 7.423743954859674e-05)]",[0],[7.423743954859674e-05]
464,ht,1,"[( Py, 0.00010367632057750598), (Generate, 2.5...","[1, 0]","[0.00010367632057750598, 2.5881863621179946e-05]"
465,on,1,"[(ht, 0.003939895424991846), (Generate, 0.0129...","[2, 0, 1]","[0.003939895424991846, 0.012913737446069717, 0..."
466,code,1,"[(on, 8.253633131971583e-05), ( Py, 0.00018459...","[3, 1, 0, 2]","[8.253633131971583e-05, 0.00018459450802765787..."
467,that,1,"[( code, 0.0016732582589611411), (Generate, 0....","[4, 0, 1, 3, 2]","[0.0016732582589611411, 0.018313312903046608, ..."
...,...,...,...,...,...
921,_,1,"[(lower, 0.015630951151251793), (METRICS, 0.31...","[458, 443]","[0.015630951151251793, 0.31624388694763184]"
922,promote,1,"[(_, 1.509848516434431e-06), (promote, 0.00830...","[459, 122, 451]","[1.509848516434431e-06, 0.008305856958031654, ..."
923,(),1,"[(promote, 0.04420287907123566), ((), 0.108644...","[460, 346, 272, 74]","[0.04420287907123566, 0.10864423960447311, 0.1..."
924,\n,1,"[((), 0.3412705957889557)]",[461],[0.3412705957889557]


In [29]:
def run_code_rational_all_set(exp, tensor_n = 100, BATCH = 10): #When Tensor_n and batch differs then 'from_seq_id' is lost
    """Rationalize one experiment column of ``df_generated_input`` and save it as CSV.

    Parameters
    ----------
    exp : name of the experiment column to rationalize.
    tensor_n : number of rows to process (passed as ``tensor_size``).
    BATCH : batch size passed to ``run_code_rational``.

    Returns
    -------
    pandas.DataFrame returned by ``run_code_rational``. The same frame is written to
    ``<rational_results>/<dataset>/[t_<tensor_n>]_[max_tgt_<MAX_TOKEN_SIZE>]_[exp:<exp>].csv``.

    Notes
    -----
    The output folder is created *before* the (long) rationalization starts, so a
    missing or unwritable folder fails immediately instead of after the run.
    NOTE(review): the file name contains ':' which is not a valid file-name
    character on every platform; it is kept unchanged for existing consumers.
    """
    gc.collect()
    torch.cuda.empty_cache()

    params = param_default()
    output_dir = params['rational_results'] + '/' + params['dataset']
    os.makedirs(output_dir, exist_ok=True)

    test_arr_rationals = run_code_rational( 
            df_generated_input = df_generated_input,
            tensor_size = tensor_n,
            experiment = exp,
            batch_size = BATCH, 
            model = model, 
            verbose = False 
        )
    #Saving process
    print('Saving process')
    file_name = '[t_'+str(tensor_n)+']_[max_tgt_'+str(MAX_TOKEN_SIZE)+']_[exp:' + str(exp) +'].csv'
    test_arr_rationals.to_csv(output_dir + '/' + file_name)
    return test_arr_rationals


In [30]:
#tst
#df_test_run = run_code_rational_all_set(exp='0')

In [31]:
# Rationalize every generated-sequence column; each call also writes its own CSV.
# df_test_run is rebound on every iteration, so only the last experiment's frame remains afterwards.
for i in df_generated_input.columns[5:]: #Only Generated Sequences 
    df_test_run = run_code_rational_all_set(exp=i, tensor_n=df_generated_input.shape[0])

************************0************************
************************10************************


In [None]:
df_test_run.head(1)

Unnamed: 0,goal_token,from_seq_id,typesets_tgt,rationale_pos_tgt,rationale_prob_tgt
0,skip,0,"[(def, 0.00029721998726017773)]",[0],[0.00029721998726017773]


In [None]:
#Running all Experiments
def exp_run_all_rationales():
    """Run ``run_code_rational`` over every generated-sequence column.

    Returns
    -------
    dict
        Maps each column name in ``df_generated_input.columns[5:]`` to the DataFrame
        returned by ``run_code_rational`` for that column (all rows, batches of 10).
    """
    # tensor_size must be supplied explicitly: it is a required argument of
    # run_code_rational, and omitting it raised a TypeError.
    n_rows = df_generated_input.shape[0]
    dict_arr_rations = { key : run_code_rational(
        df_generated_input = df_generated_input,
        tensor_size = n_rows,
        experiment = key,
        batch_size = 10, 
        model = model, 
        verbose = False 
    ) for key in df_generated_input.columns[5:] }
    return dict_arr_rations

In [None]:
#arr_df_rationale = [pandas_rationales(dict_arr_rations[key]) for key in dict_arr_rations.keys()]