In [14]:
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt

## Token Count and Dataset Processing
This notebook shows how we obtained the token counts of the programs in the original (pre-cut-off) dataset from Pan et al., and how we filter out the entries whose token count is not below the cut-off for every model.

In [15]:
# Model identifiers handed to `AutoTokenizer.from_pretrained`; the tokenizer of
# each model is used to count the tokens of every program in the dataset.
model_list = [
    'WizardLM/WizardCoder-1B-V1.0',
    'WizardLM/WizardCoder-3B-V1.0',
    'WizardLM/WizardCoder-Python-7B-V1.0',
    'WizardLM/WizardCoder-Python-13B-V1.0',
    'WizardLM/WizardCoder-Python-34B-V1.0',
    'ise-uiuc/Magicoder-S-CL-7B',
    'ise-uiuc/Magicoder-CL-7B',
    'mistralai/Mixtral-8x7B-Instruct-v0.1',
    'codellama/CodeLlama-7b-Instruct-hf',
    'codellama/CodeLlama-13b-Instruct-hf',
    'codellama/CodeLlama-34b-Instruct-hf'
]

In [16]:
# Load the original dataset, i.e. before the token cut-off filtering and the
# balancing steps performed later in this notebook.
original_df = pd.read_csv('./original_dataset_before_processing.csv')

In [17]:
original_df.shape

(4000, 6)

In [18]:
original_df.head()

Unnamed: 0,sample_id,input_code,input_lang,desired_output_lang,test_case_input_data,test_case_expected_output
0,58618,#include <iostream>\n#include <cstdio>\n\nusin...,C++,Java,8 10\n,00000000000000000000000000001000\n000000000000...
1,58619,#include <iostream>\n#include <vector>\n\nusin...,C++,Java,3 6\n3 4 5\n,2\n
2,58620,#include <iostream>\n#include<algorithm>\n#inc...,C++,Java,3\n10 2 5\n6 3 4\n,5\n
3,58621,#include<iostream>\n#include<iomanip>\n#includ...,C++,Java,10\n,1\n2\n3\n4\n5\n6\n7\n8\n9\n19\n
4,58622,#include <iostream>\n#include <algorithm>\nusi...,C++,Java,4\nwest\neast\nwait\n,3\n


In [19]:
def keep_samples_below_cutoff(df, models_tokens_samples, cutoff):
    """Return the rows of ``df`` whose token count is below ``cutoff`` for every model.

    Args:
        df: DataFrame with a ``sample_id`` column.
        models_tokens_samples: Collection with one entry per model; each entry
            maps a ``sample_id`` to the token count of that sample.
        cutoff: Exclusive upper bound. A sample is kept only when its token
            count is strictly less than ``cutoff`` in every mapping.

    Returns:
        A new DataFrame holding the qualifying rows. The original index,
        columns and dtypes are preserved, including when no row qualifies
        (the result is then an empty frame with the same columns as ``df``).

    Raises:
        KeyError: If a ``sample_id`` that is looked up is missing from a mapping.
    """
    token_maps = list(models_tokens_samples)

    keep = [
        all(token_map[sample_id] < cutoff for token_map in token_maps)
        for sample_id in df['sample_id']
    ]

    # Select with a positional boolean array rather than rebuilding the frame
    # from individual rows: this keeps columns/dtypes intact and does not rely
    # on index alignment, so a non-unique index cannot break the selection.
    return df.loc[np.asarray(keep, dtype=bool)].copy()

In [20]:
def count_tokens_for_model(model, df):
    """Count the tokens of every program in ``df`` with the tokenizer of ``model``.

    Args:
        model: Identifier passed to ``AutoTokenizer.from_pretrained``.
        df: DataFrame providing ``sample_id`` and ``input_code`` columns.

    Returns:
        dict mapping each ``sample_id`` to the number of input ids the
        tokenizer produces for the corresponding ``input_code``. Padding and
        special tokens are disabled, so only the program text is counted.
    """
    # NOTE(review): use_auth_token=True presumably requires Hugging Face
    # credentials to be configured, and trust_remote_code=True presumably
    # allows tokenizer code from the model repository to run - confirm both
    # against the installed transformers version.
    code_tokenizer = AutoTokenizer.from_pretrained(
        model,
        use_auth_token=True,
        use_fast=True,
        trust_remote_code=True,
    )

    def token_count(source_code):
        encoded = code_tokenizer(
            source_code,
            padding=False,
            add_special_tokens=False,
        )
        return len(encoded.input_ids)

    # A later row with a repeated sample_id overwrites the earlier entry.
    return {
        record.sample_id: token_count(record.input_code)
        for _, record in df.iterrows()
    }

In [21]:
def count_tokens_for_all_models(df, model_list):
    """Run ``count_tokens_for_model`` on ``df`` once per model.

    Args:
        df: DataFrame forwarded unchanged to ``count_tokens_for_model``.
        model_list: Iterable of model identifiers.

    Returns:
        list with one result of ``count_tokens_for_model`` per model, in the
        same order as ``model_list``.
    """
    return [count_tokens_for_model(model_name, df) for model_name in model_list]
    

In [37]:
def print_distribution(data, bin_size):
    """Print a frequency table of ``data`` grouped into fixed-width bins.

    Bins start at 0 and are ``bin_size`` wide; each bin includes its lower edge
    and excludes its upper edge (``[0, bin_size)``, ``[bin_size, 2 * bin_size)``,
    ...). One row is printed per bin with its inclusive integer range, the
    number of values in it, and its percentage of all binned values.

    Args:
        data: Flat sequence of non-negative numbers (e.g. token counts).
        bin_size: Positive integer width of each bin.

    Raises:
        ValueError: If ``data`` is empty or ``bin_size`` is not positive.
    """
    values = list(data)
    if not values:
        raise ValueError("data must contain at least one value")
    if bin_size <= 0:
        raise ValueError("bin_size must be a positive integer")

    series = pd.Series(values, name='Values')

    # The top edge has to lie strictly above the maximum. Bins are half-open,
    # so a maximum that is an exact multiple of bin_size would otherwise sit on
    # the last edge, belong to no bin, and silently vanish from the table.
    n_bins = max(int(series.max()) // bin_size, 0) + 1
    bin_edges = [i * bin_size for i in range(n_bins + 1)]

    binned = pd.cut(series, bins=bin_edges, include_lowest=True, right=False)
    # value_counts() on the categorical result reports every bin (also empty
    # ones); sort_index() puts them back in ascending bin order.
    bin_counts = binned.value_counts().sort_index().values
    total_count = sum(bin_counts)

    # Calculate the percentage of each bin (guarding against an empty total).
    percentages = [
        (count / total_count) * 100 if total_count else 0.0
        for count in bin_counts
    ]

    # Display the distribution
    print("Bin Edges\tFrequency\tPercentage")
    for i in range(len(bin_edges) - 1):
        print(f"{bin_edges[i]}-{bin_edges[i+1]-1}\t{bin_counts[i]}\t\t{percentages[i]:.2f}%")


### Process the original dataset 

In [23]:
# Tokenize every program of the original dataset with each model's tokenizer.
# The result holds one {sample_id: token count} dict per entry of model_list.
models_results = count_tokens_for_all_models(original_df, model_list)

Token indices sequence length is longer than the specified maximum sequence length for this model (9422 > 8192). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9422 > 8192). Running this sequence through the model will result in indexing errors
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Examine the distribution before-cutoff

In [24]:
# Flatten the per-model dicts into a single list of token counts
# (one value per model/sample pair) and report how many there are.
all_data_points = [value for samples_length in models_results for value in samples_length.values()]
print(len(all_data_points))

44000


In [25]:
# Token-count distribution over all model/sample pairs, in bins of 1024 tokens.
bin_size = 1024
print_distribution(all_data_points, bin_size)

Bin Edges	Frequency	Percentage
0-1023	40248		91.47%
1024-2047	2148		4.88%
2048-3071	676		1.54%
3072-4095	504		1.15%
4096-5119	132		0.30%
5120-6143	72		0.16%
6144-7167	16		0.04%
7168-8191	80		0.18%
8192-9215	36		0.08%
9216-10239	44		0.10%
10240-11263	0		0.00%
11264-12287	0		0.00%
12288-13311	4		0.01%
13312-14335	40		0.09%


In [26]:
# Keep only the samples whose token count is strictly below the cutoff
# for every model in model_list.
cutoff = 3072
df = keep_samples_below_cutoff(original_df, models_results, cutoff)

In [27]:
df.shape

(3912, 6)

In [28]:
df.head()

Unnamed: 0,sample_id,input_code,input_lang,desired_output_lang,test_case_input_data,test_case_expected_output
0,58618,#include <iostream>\n#include <cstdio>\n\nusin...,C++,Java,8 10\n,00000000000000000000000000001000\n000000000000...
1,58619,#include <iostream>\n#include <vector>\n\nusin...,C++,Java,3 6\n3 4 5\n,2\n
2,58620,#include <iostream>\n#include<algorithm>\n#inc...,C++,Java,3\n10 2 5\n6 3 4\n,5\n
3,58621,#include<iostream>\n#include<iomanip>\n#includ...,C++,Java,10\n,1\n2\n3\n4\n5\n6\n7\n8\n9\n19\n
4,58622,#include <iostream>\n#include <algorithm>\nusi...,C++,Java,4\nwest\neast\nwait\n,3\n


After the cut-off, 3,912 samples remain. We then balance the dataset by randomly sampling 191 samples per input-output language combination.


### Balance the dataset

In [29]:
def sample_group(group, n=191, random_state=1):
    """Randomly draw up to ``n`` rows from ``group`` without replacement.

    Args:
        group: DataFrame to sample from (e.g. one group of a ``groupby``).
        n: Maximum number of rows to draw. Groups with fewer than ``n`` rows
            are returned in full (in shuffled order).
        random_state: Seed forwarded to ``DataFrame.sample`` so that repeated
            calls on the same group give the same rows.

    Returns:
        DataFrame with ``min(len(group), n)`` rows of ``group``.
    """
    return group.sample(min(len(group), n), replace=False, random_state=random_state)

In [30]:
# Sample each (input_lang, desired_output_lang) group explicitly rather than via
# groupby(...).apply(...): newer pandas versions deprecate passing the grouping
# columns to the applied function, and once they are excluded the subsequent
# reset_index(drop=True) would discard both language columns from the result.
# Iterating over the groups yields complete sub-frames on every pandas version.
sampled_df = pd.concat(
    [sample_group(group) for _, group in df.groupby(['input_lang', 'desired_output_lang'])]
).reset_index(drop=True)

In [31]:
sampled_df.shape

(3820, 6)

> **Note**: ```sampled_df``` above was not generated with the same random seed that we used in the study, so its contents may differ from ```dataset_after_processing.csv```. For consistency, use the provided balanced dataset ```dataset_after_processing.csv``` (and not the dataset generated above) as input when replicating the next sections. Thank you.

### Explore the distribution after the dataset is balanced

In [32]:
# Load the balanced dataset provided with the study; it serves as the
# reference for the distribution analysis in the following cells.
reference_df = pd.read_csv('./dataset_after_processing.csv')

In [33]:
reference_df.shape

(3820, 6)

In [34]:
# Tokenize every program of the reference dataset with each model's tokenizer
# (one {sample_id: token count} dict per entry of model_list).
reference_models_results = count_tokens_for_all_models(reference_df, model_list)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [35]:
# Flatten the per-model dicts of the reference dataset into a single list of
# token counts (one value per model/sample pair) and report its length.
reference_data_points = [value for samples_length in reference_models_results for value in samples_length.values()]
print(len(reference_data_points))

42020


In [38]:
# Token-count distribution of the reference dataset, in bins of 128 tokens.
bin_size = 128
print_distribution(reference_data_points, bin_size)

Bin Edges	Frequency	Percentage
0-127	13603		32.37%
128-255	11527		27.43%
256-383	5589		13.30%
384-511	3288		7.82%
512-639	1979		4.71%
640-767	1537		3.66%
768-895	1138		2.71%
896-1023	627		1.49%
1024-1151	547		1.30%
1152-1279	282		0.67%
1280-1407	572		1.36%
1408-1535	164		0.39%
1536-1663	212		0.50%
1664-1791	56		0.13%
1792-1919	176		0.42%
1920-2047	111		0.26%
2048-2175	88		0.21%
2176-2303	185		0.44%
2304-2431	48		0.11%
2432-2559	136		0.32%
2560-2687	47		0.11%
2688-2815	72		0.17%
2816-2943	36		0.09%
