In [1]:
# !pip install gdown

In [1]:
import os
import SATD_tool
import importlib
import gzip
importlib.reload(SATD_tool)
from SATD_tool import *

# Download Datasets

In [2]:
import gdown

def download_from_google_drive(fileId, filePath):
    """Download a Google Drive file to a local path using gdown.

    Args:
        fileId: Google Drive file id; it is inserted into the ``uc?id=`` URL.
        filePath: Destination path handed to ``gdown.download``.
    """
    gdown.download(f"https://drive.google.com/uc?id={fileId}", filePath, quiet=False)

In [3]:
# Download Python dataset

# Make sure the target directory exists before anything is downloaded into it.
os.makedirs("Datasets/Python", exist_ok=True)

# Maps each Google Drive file id to the local path where that file is stored.
fileId_filePath = {
    '1aQ5Rpe-GI-Vqb5BCg8cz8bXXEKzKmguX': 'Datasets/Python/df3.pkl.gz',
    '1obp8vk7wBSguKy72UPPCLHOcVRTGYGuP': 'Datasets/Python/df3_train.pkl.gz',
    '1ItMGQfbZHMbnvhljbJqr2dP4WFLbjQky': 'Datasets/Python/df3_test.pkl.gz',
    '1ePmAuajVjYiKfNKLPhNsCIp9XK0oTx-Q': 'Datasets/Python/df3_dev.pkl.gz',
}

# Download only the files that are missing locally, so re-running the cell does not fetch them again.
for fileId, filePath in fileId_filePath.items():
    if not os.path.exists(filePath):
        download_from_google_drive(fileId, filePath)
    else:
        print(filePath, 'already exists.')

Datasets/Python/df3.pkl.gz already exists.
Datasets/Python/df3_train.pkl.gz already exists.
Datasets/Python/df3_test.pkl.gz already exists.
Datasets/Python/df3_dev.pkl.gz already exists.


In [4]:
# Download Java dataset

# Make sure the target directory exists before anything is downloaded into it.
os.makedirs("Datasets/Java", exist_ok=True)

# Maps each Google Drive file id to the local path where that file is stored.
fileId_filePath = {
    '1t5Pf0f8NSygdNBgTxtdsPPmvGbmFlbFe': 'Datasets/Java/df3.pkl.gz',
    '16K6kc_9_sOsFDw9z2sOp-33AE1BREe2P': 'Datasets/Java/df3_train.pkl.gz',
    '1ecb3uLQ-DeT1FgI0p8Yxk6SCfqluTkLI': 'Datasets/Java/df3_test.pkl.gz',
    '1KJq4RNupQW8GH9wuE48tbZtHOCCbFGrv': 'Datasets/Java/df3_dev.pkl.gz',
}

# Download only the files that are missing locally, so re-running the cell does not fetch them again.
for fileId, filePath in fileId_filePath.items():
    if not os.path.exists(filePath):
        download_from_google_drive(fileId, filePath)
    else:
        print(filePath, 'already exists.')

Datasets/Java/df3.pkl.gz already exists.
Datasets/Java/df3_train.pkl.gz already exists.
Datasets/Java/df3_test.pkl.gz already exists.
Datasets/Java/df3_dev.pkl.gz already exists.


In [5]:
# Download Mastropaolo dataset

# Make sure the target directory exists before anything is downloaded into it.
os.makedirs("Datasets/Mastropaolo", exist_ok=True)

# Maps the Google Drive file id to the local path where the file is stored.
fileId_filePath = {
    '1PgeYVvyk1yu0s8AxEcFrK5CjBlUPvKAq': 'Datasets/Mastropaolo/mastropaolo_with_filter_columns.pkl.gz',
}

# Download only if the file is missing locally, so re-running the cell does not fetch it again.
for fileId, filePath in fileId_filePath.items():
    if not os.path.exists(filePath):
        download_from_google_drive(fileId, filePath)
    else:
        print(filePath, 'already exists.')

Datasets/Mastropaolo/mastropaolo_with_filter_columns.pkl.gz already exists.


# Load Dataset

In [13]:
# RQ2: uses the first 1000 items in the dataset for evaluating models by prompt engineering.
# Note that each item in the dataset has a randomly assigned index (rand_index). As the dataset is now sorted by the rand_index,
# taking the first 1000 rows means that we randomly select 1000 items from the dataset.

# RQ3: uses the Mastropaolo dataset

# RQ4: uses the whole dataset, which is split by repository into train, dev, and test.

RQ = 'RQ2' # RQ2 or RQ3 or RQ4 
LANGUAGE = 'Java' # Python or Java (no need to set for RQ3)

# DATASET is the key used further down to look up the models, prompts and answer files to evaluate.
if RQ=='RQ2':
    DATASET = LANGUAGE
elif RQ=='RQ3':
    # RQ3 always overrides LANGUAGE with 'Java', whatever was set above.
    LANGUAGE = 'Java'
    DATASET = 'Mastropaolo'
elif RQ=='RQ4':
    DATASET = LANGUAGE + '_test'
else:
    raise ValueError("Invalid value for RQ:", RQ)

print(f"DATASET: {DATASET}, LANGUAGE:{LANGUAGE}")

DATASET: Java, LANGUAGE:Java


In [14]:
import pandas as pd
DATASETS_DIR = 'Datasets'

# pd.read_pickle infers gzip compression from the '.gz' suffix, so the files are read by path
# and no gzip handle is left open. Filtered frames are copied because later cells add columns
# to df; assigning to a slice of the loaded frame would raise SettingWithCopyWarning.
# NOTE: unpickling executes code from the file; only load pickles from a trusted source.
if RQ=='RQ2':
    df = pd.read_pickle(f'{DATASETS_DIR}/{LANGUAGE}/df3.pkl.gz')
    df = df[df['is_repayment_llama3'] == 'yes'] # we need to apply the last filter
    df = df.head(1000).copy()
elif RQ=='RQ3':
    df = pd.read_pickle(f'{DATASETS_DIR}/Mastropaolo/mastropaolo_with_filter_columns.pkl.gz')
    df = df[df['data_split']=='test']
    # apply our two new filters
    df = df[(df["SATD_count_before_repayment"] == 1) & (df["SATD_count_after_repayment"] == 0)]
    df = df[df['is_repayment_llama3']=='yes'].copy()
elif RQ=='RQ4':
    df = pd.read_pickle(f'{DATASETS_DIR}/{LANGUAGE}/df3_test.pkl.gz')
    # Read the train split from the downloaded archive instead of a machine-specific absolute path.
    df_train = pd.read_pickle(f'{DATASETS_DIR}/{LANGUAGE}/df3_train.pkl.gz')
print(df.shape)
df.columns

(1000, 26)


Index(['rand_index', 'user', 'project', 'created_in_file',
       'last_appeared_in_file', 'created_in_line', 'last_appeared_in_line',
       'created_in_commit', 'deleted_in_commit', 'created_at_date',
       'deleted_at_date', 'content', 'deleted_in_lines', 'created_in_lines',
       'updated_in_commits', 'last_content', 'SATD_comment',
       'containing_method_applied_approach',
       'containing_method_before_repayment',
       'containing_method_after_repayment', 'method_is_updated',
       'SATD_count_before_repayment', 'SATD_count_after_repayment',
       'method_tokens_before_repayment', 'method_tokens_after_repayment',
       'is_repayment_llama3'],
      dtype='object')

In [15]:
# add deletion_commit_url
def get_deletion_commit_url(row):
    """Build the GitHub URL of the deletion commit recorded in a row.

    Args:
        row: Mapping-like record providing 'user', 'project' and 'deleted_in_commit'.

    Returns:
        str: 'www.github.com/<user>/<project>/commit/<deleted_in_commit>' (no URL scheme).
    """
    owner, repository, commit = row['user'], row['project'], row['deleted_in_commit']
    return f"www.github.com/{owner}/{repository}/commit/{commit}"

# The column is only computed here for RQ2.
# NOTE(review): the manual "check one answer" cell reads 'deletion_commit_url' without checking RQ;
# presumably the RQ3/RQ4 data either already has this column or that cell is only used with RQ2 - confirm.
if RQ=='RQ2':
    df['deletion_commit_url'] = df.apply(get_deletion_commit_url, axis=1)

In [16]:
def get_number_of_deleted_and_inserted_lines(row):
    """Count the deleted and inserted lines between the method before and after repayment.

    Args:
        row: Record providing 'containing_method_before_repayment' and
            'containing_method_after_repayment'.

    Returns:
        tuple: (number of deleted lines, number of inserted lines), as reported by
        ``get_deleted_and_inserted_lines``.
    """
    method_before = row['containing_method_before_repayment']
    method_after = row['containing_method_after_repayment']
    deleted_lines, inserted_lines = get_deleted_and_inserted_lines(method_before, method_after)
    return len(deleted_lines), len(inserted_lines)

# Add two columns with the per-row counts returned by get_number_of_deleted_and_inserted_lines.
df[['number_of_deleted_lines', 'number_of_inserted_lines']] = df.apply(
    lambda row: pd.Series(get_number_of_deleted_and_inserted_lines(row)), axis=1
)

# Threshold on inserted lines: rows with at most i inserted lines form the "Easy" group, the rest the "Hard" group.
i=2
count = len(df[df['number_of_inserted_lines']<=i])
print(f'Number of items that {i} or less new lines are added or updated (Easy group): {count} (%{100*count/len(df):.1f})')
count = len(df[df['number_of_inserted_lines']>i])
print(f'Number of items that {i+1} or more new lines are added or updated (Hard group): {count} (%{100*count/len(df):.1f})')

Number of items that 2 or less new lines are added or updated (Easy group): 492 (%49.2)
Number of items that 3 or more new lines are added or updated (Hard group): 508 (%50.8)


**Add the prompt column to the dataframe**

In [17]:
PROMPT_TEMPLATE = 'NoExplain' # 'Mastropaolo-T2' or 'NoExplain' or 'CoT1' or 'CoT2'

def get_prompt(row, template=None):
    """Build the prompt text for one dataset row.

    Args:
        row: Record providing 'SATD_comment' and 'containing_method_before_repayment'.
        template: Name of the prompt template ('Mastropaolo-T2', 'NoExplain', 'CoT1' or
            'CoT2'). When omitted, the module-level PROMPT_TEMPLATE is used, so
            ``df.apply(get_prompt, axis=1)`` keeps working unchanged.

    Returns:
        str: The prompt for the selected template.

    Raises:
        ValueError: If the template name is not one of the supported templates. Without this
            check an unknown name ended in an UnboundLocalError on the return statement.
    """
    if template is None:
        template = PROMPT_TEMPLATE

    if template == 'Mastropaolo-T2':
        return f"""Perform removal of this SATD: "{row['SATD_comment']}" from this code:

{row['containing_method_before_repayment']}
"""

    if template == 'NoExplain':
        return f"""How to update the following code to resolve the SATD? No need to explain. Just provide the updated code.

### Code:
{row['containing_method_before_repayment']}

### SATD comment:
{row['SATD_comment']}

### Updated code after SATD repayment:"""

    if template == 'CoT1':
        return f"""How to update the following code to resolve the SATD?

### Code:
{row['containing_method_before_repayment']}

### SATD comment:
{row['SATD_comment']}

### Consider the following questions in your answer:
Shortly explain how to resolve the SATD.
Provide the updated code."""

    if template == 'CoT2':
        return f"""How can the following code be updated to resolve the Self-Admitted Technical Debt (SATD)?

### Code:
{row['containing_method_before_repayment']}

### SATD comment:
{row['SATD_comment']}

### Consider the following questions in your answer:
1. Briefly explain how to resolve the SATD.
2. Provide the updated code."""

    raise ValueError(f"Unknown prompt template: {template!r}")


# Store the prompt of every row in the 'prompt2' column, which the request-building cells read later.
df['prompt2'] = df.apply(get_prompt, axis=1)
print(f'PROMPT_TEMPLATE: {PROMPT_TEMPLATE}\n')
# Show the prompt of the first row as a sanity check.
print(df.iloc[0]['prompt2'])

PROMPT_TEMPLATE: NoExplain

How to update the following code to resolve the SATD? No need to explain. Just provide the updated code.

### Code:
public Connection readConnection(URI connectionURI) throws NoSuchConnectionException {
    logger.debug(MessageFormat.format("need-facing: READ_CONNECTION called for connection {0}", connectionURI));

    //TODO: Convert readConnectionContent(connectionURI)
    return null;
}

### SATD comment:
TODO: Convert readConnectionContent(connectionURI)

### Updated code after SATD repayment:


# Use OpenAI - GPT4o-mini

This part reproduces the outputs generated by GPT-4o-mini. You can skip to the next section if you don't want to reproduce them.

In [None]:
from openai import OpenAI
# NOTE: "xxx" is a placeholder. Replace it with your own key and do not commit a real key with the notebook.
client = OpenAI(api_key="xxx") # provide your OpenAI API key

# test it
# Send one small chat request to confirm that the key and the model name work.
response = client.chat.completions.create(
  model="gpt-4o-mini",
  temperature=0.0,
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"}
  ]
)
# Last expression of the cell: shows the text of the first returned choice.
response.choices[0].message.content

In [19]:
import json
def create_batch_file(df, start_row, end_row, prompt_template, openai_model, dataset):
    """Write one chat-completion request per row to a JSONL batch file.

    Args:
        df: DataFrame with the columns 'rand_index' and 'prompt2'.
        start_row: Position of the first row to include.
        end_row: Position one past the last row to include.
        prompt_template: Prompt template name; only used in the file name.
        openai_model: Model name written into each request and into the file name.
        dataset: Dataset name; only used in the file name.

    Returns:
        str: Name of the written file,
        '<dataset>_<prompt_template>_<openai_model>_batch<start_row>to<end_row>.jsonl'.
    """
    row_span = 'batch' + str(start_row) + 'to' + str(end_row)
    filename = '_'.join([dataset, prompt_template, openai_model, row_span]) + '.jsonl'
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    with open(filename, 'w') as batch_file:
        for position in range(start_row, end_row):
            row = df.iloc[position]
            # rand_index doubles as the request id, so answers can be mapped back to rows.
            request = {
                'custom_id': str(row['rand_index']),
                'method': 'POST',
                'url': '/v1/chat/completions',
                'body': {
                    "model": openai_model,
                    'temperature': 0.0,
                    "messages": [system_message, {"role": "user", "content": row['prompt2']}],
                    "max_tokens": 2048,
                },
            }
            batch_file.write(json.dumps(request) + '\n')
    return filename

In [20]:
# create batch file
openai_model = "gpt-4o-mini" # gpt-4o-mini (current: gpt-4o-mini-2024-07-18)
# Rows 0 .. len(df)-1 are written, i.e. one request for every row of df.
filename = create_batch_file(df, 0, len(df), PROMPT_TEMPLATE, openai_model, DATASET)
print('Batch file saved in', filename)

Batch file saved in Java_test_Mastropaolo-T2_gpt-4o-mini_batch0to9013.jsonl


In [None]:
# Upload the file to OpenAI
# The file is opened in a context manager so the handle is closed once the upload has finished.
with open(filename, "rb") as batch_file:
    batch_input_file = client.files.create(
      file=batch_file,
      purpose="batch"
    )

In [None]:
# create the batch (it starts the process)
# NOTE: make sure you don't run it twice, or you will be charged twice

# Id of the file uploaded in the previous cell; the batch reads its requests from that file.
batch_input_file_id = batch_input_file.id
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": filename.replace('.jsonl','')  # NOTE: it uses the file name as description
    }
)

In [None]:
# Cancelling a batch
# client.batches.cancel("batch_abc123")

In [None]:
# List batches; with limit=1 at most one batch is requested - raise the limit to see more.
client.batches.list(limit=1)

In [None]:
# Check the status of a batch
# NOTE: the batch id below is hard-coded; replace it with the id of the batch you created.
client.batches.retrieve("batch_676c3b98c7a0819085cf8a89ed7fdc67")

In [None]:
# Retrieving the batch results:
# NOTE: the file id below is hard-coded; replace it with the output file id of your own batch.
file_response = client.files.content("file-EqjRGjeDK7GZJ236nDbgdq")

In [None]:
# convert the output to randIndex_answer dictionary
# Each output line is one JSON object; 'custom_id' carries the rand_index written into the batch file.
randIndex_answer = {}
for line in file_response.text.splitlines():
    item = json.loads(line)
    randIndex = int(item['custom_id'])
    answer = item['response']['body']['choices'][0]['message']['content']
    randIndex_answer[randIndex] = answer
print(len(randIndex_answer))
# Show one answer as a sanity check. Item 4130 is kept as the preferred sample, but it is not
# part of every dataset, so fall back to the first parsed answer instead of raising KeyError.
sample_randIndex = 4130 if 4130 in randIndex_answer else next(iter(randIndex_answer), None)
if sample_randIndex is not None:
    print(randIndex_answer[sample_randIndex])

In [None]:
# save the OpenAI answers
# NOTE(review): the file is written to the working directory and always uses the 'answers_test_' prefix,
# while the evaluation cells read '<OUTPUT_DIR>/<DATASET>/<model>/answers_<part>_<template>.json' -
# presumably the file is moved/renamed by hand afterwards; confirm.
save_to_json(randIndex_answer, f'answers_test_{PROMPT_TEMPLATE}.json')

# Use Open LLMs

This part reproduces the outputs generated by the open LLMs (you can skip to the next section if you don't want to reproduce them). Before running the cells in this section, you need to serve the model with vLLM using one of the following commands:

```
# you may need to run "unset VLLM_ATTENTION_BACKEND"
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --port 8000  \
    --gpu-memory-utilization 0.95 \
    --max-model-len 4096 \
    --download-dir /tmp/vllm_models \
    --api-key 123

# you may need to run "export VLLM_ATTENTION_BACKEND=FLASHINFER" first.
python -m vllm.entrypoints.openai.api_server \
    --model google/gemma-2-9b-it \
    --port 8000  \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --download-dir /tmp/vllm_models \
    --api-key 123

python -m vllm.entrypoints.openai.api_server \
    --model deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \
    --port 8000  \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --download-dir /tmp/vllm_models \
    --trust-remote-code \
    --api-key 123

python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3.1-70B-Instruct \
    --port 8000  \
    --gpu-memory-utilization 0.90 \
    --max-model-len 4096 \
    --download-dir /tmp/vllm_models \
    --tensor-parallel-size 4 \
    --api-key 123
```

In [None]:
import os
# "123" matches the --api-key value the local vLLM server is started with; it is not a real OpenAI key.
os.environ["OPENAI_API_KEY"] = "123"

# MODEL is the short name matched by the prompt-format checks and used in the output path;
# MODEL_FullName is the model id sent with each completion request.
# Uncomment exactly one MODEL / MODEL_FullName pair.
MODEL = 'Llama3.1-8B'
MODEL_FullName = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

# MODEL = 'Llama3.1-70B'
# MODEL_FullName = 'meta-llama/Meta-Llama-3.1-70B-Instruct'

# MODEL = 'Gemma-2-9B'
# MODEL_FullName = 'google/gemma-2-9b-it'

# MODEL = 'DeepSeek-Coder-V2-Lite-Instruct'
# MODEL_FullName = 'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct'

In [None]:
# test vllm to make sure it works
from openai import OpenAI

# The OpenAI client is pointed at the local vLLM server instead of the OpenAI service.
client = OpenAI(
    api_key="123",
    base_url="http://localhost:8000/v1",
)

prompts = ['300+30=','why is the sky blue?']

# correct the format of prompts
# The plain completions endpoint is used below, so the chat markup of each model family is added by hand.
# The family is picked by substring match on the lower-cased MODEL name; the first matching branch wins.
if 'llama3' in MODEL.lower():
    prompts = [f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" for prompt in prompts]
elif 'deepseek' in MODEL.lower():
    prompts = [f"<｜begin▁of▁sentence｜>User: {prompt}\n\nAssistant:" for prompt in prompts]
elif 'openchat' in MODEL.lower() or 'starling' in MODEL.lower():
    prompts = [f"<s>GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:" for prompt in prompts]
elif 'gemma' in MODEL.lower():
    prompts = [f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n" for prompt in prompts]
elif 'qwen' in MODEL.lower():
    prompts = [f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" for prompt in prompts]
elif 'mistral-nemo' in MODEL.lower():
    prompts = [f"<s>[INST]{prompt}[/INST]" for prompt in prompts]
#     prompts = [f"<s>[INST]You are a helpful assistant.\n\n{prompt}[/INST]" for prompt in prompts]
elif 'phi' in MODEL.lower():
    prompts = [f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n" for prompt in prompts]
#     prompts = [f"<|system|>\nYou are a helpful assistant.<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n" for prompt in prompts]
else:
    raise ValueError('Model not supported')    

# NOTE(review): the stop string "<|eot_id|>" only occurs in the Llama 3 markup above; presumably the
# other model families stop on their own end-of-sequence token - confirm.
completion = client.completions.create(model=MODEL_FullName, prompt=prompts, max_tokens=2048, temperature=0.0, stop=["<|eot_id|>"]) # we may need to set temperature to 0.01 rather 0.0
# completion = client.completions.create(model=MODEL_FullName, prompt=prompts, max_tokens=2048, top_p=1, stop=["<|eot_id|>"]) # it doesn't generate deterministic output

# One choice is printed per prompt, separated by a dashed line.
for text in completion.choices:
    print(text.text)
    print('------------------------')

In [None]:
# Root directory for answers generated in this section. The evaluation section further down
# rebinds OUTPUT_DIR to 'generated_outputs', so newly generated answers are not evaluated by default.
OUTPUT_DIR = 'generated_outputs_new'

In [None]:
# Run the open model on the dataset. It splits the data to batches of size 100 and send them to the running model in vLLM.

from openai import OpenAI
import time

# The OpenAI client is pointed at the local vLLM server.
client = OpenAI(
    api_key="123",
    base_url="http://localhost:8000/v1",
)

batch_size = 100
batch_prompts = []
batch_randIndex = []

# Slice prompts and their rand_index values in parallel so answers can be mapped back to rows.
for i in range(0, len(df), batch_size):
    batch_prompts.append(df.iloc[i:i + batch_size]['prompt2'].tolist())
    batch_randIndex.append(df.iloc[i:i + batch_size]['rand_index'].tolist())
print('Total number of batches:', len(batch_prompts))

# Wrap every prompt in the chat markup of the selected model family (substring match on MODEL).
if 'openchat' in MODEL.lower() or 'starling' in MODEL.lower():
    batch_prompts = [[f"<s>GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:" for prompt in prompts] for prompts in batch_prompts]
elif 'deepseek' in MODEL.lower():
    batch_prompts = [[f"<｜begin▁of▁sentence｜>User: {prompt}\n\nAssistant:" for prompt in prompts] for prompts in batch_prompts]
elif 'llama3' in MODEL.lower():
    batch_prompts = [[f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" for prompt in prompts] for prompts in batch_prompts]
elif 'gemma' in MODEL.lower():
    batch_prompts = [[f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n" for prompt in prompts] for prompts in batch_prompts]
elif 'qwen' in MODEL.lower():
    batch_prompts = [[f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" for prompt in prompts] for prompts in batch_prompts]
elif 'mistral-nemo' in MODEL.lower():
    batch_prompts = [[f"<s>[INST]{prompt}[/INST]" for prompt in prompts] for prompts in batch_prompts]
    # batch_prompts = [[f"<s>[INST]You are a helpful assistant.\n\n{prompt}[/INST]" for prompt in prompts] for prompts in batch_prompts]
elif 'phi' in MODEL.lower():
    # Fixed: this branch used to format the stale flat `prompts` list and left batch_prompts unformatted.
    batch_prompts = [[f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n" for prompt in prompts] for prompts in batch_prompts]
    # batch_prompts = [[f"<|system|>\nYou are a helpful assistant.<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>\n" for prompt in prompts] for prompts in batch_prompts]
else:
    raise ValueError('Model not supported')


def _answers_filepath(chunk):
    """Return the JSON path for a chunk of answers (one fixed file for '*test*' datasets)."""
    prefix = OUTPUT_DIR + '/' + DATASET + '/' + MODEL
    if 'test' in DATASET:
        return prefix + '/answers_test_' + PROMPT_TEMPLATE + '.json'
    keys = list(chunk.keys())
    return prefix + '/answers_' + str(keys[0]) + 'to' + str(keys[-1]) + '_' + PROMPT_TEMPLATE + '.json'


def _save_answers(chunk, all_answers):
    """Write the pending answers to disk, creating the output directory if needed."""
    filepath = _answers_filepath(chunk)
    print('Saving items in', filepath)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Test datasets always use the same file name, so write everything collected so far;
    # writing only the latest chunk would overwrite the answers saved earlier.
    save_to_json(all_answers if 'test' in DATASET else chunk, filepath)


start_time=time.time()
randIndex_answer = {}
unsaved_randIndex_answer = {}
save_size = 10000
for randIndex,prompts in zip(batch_randIndex,batch_prompts):
    try:
        print('Getting answers for the items',randIndex[0],'to',randIndex[-1]) # NOTE: set temperature to 0.0
        try:
            completion = client.completions.create(model=MODEL_FullName, prompt=prompts, max_tokens=2048, temperature=0.0, stop=["<|eot_id|>"])
        except Exception as e:
            # Retry once with a smaller generation budget, and show why the first attempt failed.
            print('We failed with max_tokens=2048. Try with max_tokens=1024 ...')
            print('  Reason:', e)
            completion = client.completions.create(model=MODEL_FullName, prompt=prompts, max_tokens=1024, temperature=0.0, stop=["<|eot_id|>"])
        for rindx,text in zip(randIndex,completion.choices):
            randIndex_answer[rindx] = text.text
            unsaved_randIndex_answer[rindx] = text.text
        if len(unsaved_randIndex_answer)>=save_size:
            _save_answers(unsaved_randIndex_answer, randIndex_answer)
            unsaved_randIndex_answer = {}
    except Exception as e:
        # Best effort: a failing batch is reported and skipped so the remaining batches still run.
        print("An error occurred:", e)

# save the last part, if it is not the toy df
if len(df)>500 and len(unsaved_randIndex_answer)>0:
    _save_answers(unsaved_randIndex_answer, randIndex_answer)
    unsaved_randIndex_answer = {}
print('elapsed time:', int(time.time()-start_time), 'seconds')

# Determine the models and prompts for each dataset

In [18]:
# Determine the models and prompts for each dataset to find the corresponding filename in the output directory

# Part of the answers file name ('answers_<part>_<template>.json') for each dataset.
dataset_testpart = {
    'Python_test': 'test',
    'Java_test': 'test',
    'Mastropaolo': 'test',
    'Python': '10to2157',
    'Java': '0to2003',
}

_rq2_datasets = ('Java', 'Python')
_rq4_datasets = ('Java_test', 'Python_test')

_llm_models = ('Llama3.1-8B', 'Gemma-2-9B', 'DeepSeek-Coder-V2-Lite-Instruct', 'Llama3.1-70B', 'gpt-4o-mini-2024-07-18')
_codet5p_models = ('codeT5p-220m-py', 'codeT5p-770m-py', 'codeT5p-220m', 'codeT5p-770m')
_prompt_templates = ('Mastropaolo-T2', 'NoExplain', 'CoT1', 'CoT2')

# Models evaluated per dataset; every dataset gets its own list object.
dataset_models = {'Mastropaolo': ['gpt-4o-mini-2024-07-18', 'codeT5p-770m']}  # RQ3
for dataset in _rq2_datasets:  # RQ2
    dataset_models[dataset] = list(_llm_models)
for dataset in _rq4_datasets:  # RQ4
    dataset_models[dataset] = [*_llm_models, *_codet5p_models]

# Prompt templates (or run names) evaluated per dataset.
dataset_prompts = {'Mastropaolo': ['NoExplain', 'TrainByControlled_TestByFiltered', 'TrainByFiltered_TestByFiltered']}  # RQ3
for dataset in _rq2_datasets:  # RQ2
    dataset_prompts[dataset] = list(_prompt_templates)
for dataset in _rq4_datasets:  # RQ4
    dataset_prompts[dataset] = [*_prompt_templates, 'allTrain_lr1e-5', 'allTrain_lr5e-6']

# Check output and extract code from the generated answer

In [19]:
# get a few rand_index for manual checking
df.head(10)['rand_index'].tolist()

[0, 3, 4, 8, 11, 13, 15, 16, 17, 22]

In [20]:
# load answers for check
# Only RQ2 and RQ4 load a file here; with RQ3 randIndex_answer keeps whatever value it had before this cell.
if RQ=='RQ2' or RQ=='RQ4':
    # The inspected model (DeepSeek-Coder-V2-Lite-Instruct) and template (CoT1) are fixed in the path.
    randIndex_answer = load_from_json(f'generated_outputs/{DATASET}/DeepSeek-Coder-V2-Lite-Instruct/answers_{dataset_testpart[DATASET]}_CoT1.json')
    # Convert keys to integers
    randIndex_answer = {int(key): value for key, value in randIndex_answer.items()}
    print(len(randIndex_answer))

1000


In [22]:
# check one answer
indx = 22
# Look the row up once and reuse it for every field printed below.
checked_row = df[df['rand_index']==indx].iloc[0]
print(checked_row['deletion_commit_url'])
print(checked_row['last_appeared_in_file'])
print('line:', checked_row['last_appeared_in_line'])
# The method before and after repayment are printed under a header naming the column.
for column in ('containing_method_before_repayment', 'containing_method_after_repayment'):
    print('\n', '-'*15, column, '-'*15, '\n')
    print(checked_row[column])
print('\n', '-'*15, 'generated answer', '-'*15, '\n')
print(randIndex_answer[indx])

www.github.com/franzinc/agraph-java-client/commit/3580ee15ac8ddb5c935642314525162de55b5592
src/com/franz/agraph/jena/AGInfGraph.java
line: 102

 --------------- containing_method_before_repayment --------------- 

public ValidityReport validate() {
	// TODO Auto-generated method stub
	return null;
}

 --------------- containing_method_after_repayment --------------- 

public ValidityReport validate() {
	throw new UnsupportedOperationException(AGUnsupportedOperation.message);
}

 --------------- generated answer --------------- 

 The Self-Applicable Technical Debt (SATD) comment "TODO Auto-generated method stub" indicates that the code contains a placeholder for a method implementation that should be replaced with actual logic. This is a common issue in codebases where developers might leave placeholders for methods they haven't yet implemented.

To resolve this SATD, you should replace the placeholder with the actual logic that performs the validation. Here’s how you can update the co

In [23]:
# check extracting code from answers
# For the first row of df, print the raw answer and, below the dashed line, what extract_code returns for it.
for i in df.head(1)['rand_index']:
    print(i)
    print(randIndex_answer[i])
    print('-'*30)
    print(extract_code(randIndex_answer[i], LANGUAGE))
    print('='*50)

0
 To resolve the SATD (Sticky Assignee Task Description), you need to replace the `TODO` comment with the actual implementation of the `readConnectionContent(connectionURI)` method. This method is currently a placeholder (`return null;`).

Here's the updated code:

### Updated Code:
```java
public Connection readConnection(URI connectionURI) throws NoSuchConnectionException {
    logger.debug(MessageFormat.format("need-facing: READ_CONNECTION called for connection {0}", connectionURI));

    // Implement readConnectionContent(connectionURI)
    return readConnectionContent(connectionURI);
}

// Assuming readConnectionContent(URI connectionURI) is defined elsewhere in the class
private Connection readConnectionContent(URI connectionURI) throws NoSuchConnectionException {
    // Implementation of reading the connection content
    // This is a placeholder implementation
    // You need to replace this with the actual logic
    throw new NoSuchConnectionException("Not implemented");
}
``

# Check how removing ICD (imports, comments, and docstrings) works

In [24]:
# Example usage for remove_docstrings_and_comments_from_python

# Sample input mixing imports, comments, a docstring and '#' characters inside string literals.
python_code = """
import os
from os import xyz
# This is a sample Python code
#
def example_function():
    '''This is docstring'''
    x = 10  # Initialize x with 10
    y = 20  # Initialize y with 20
    a = "#Hello#"  # Initialize a with "#Hello#"
    print("###Hello###")
    # The following line does the sum
    return x + y  # Return the sum
"""

# remove_imports is applied first; its result is passed to remove_docstrings_and_comments_from_python.
clean_code = remove_docstrings_and_comments_from_python(remove_imports(python_code))
print(clean_code)

def example_function():
    x = 10
    y = 20
    a = '#Hello#'
    print('###Hello###')
    return x + y


In [25]:
# Example usage for remove_comments_and_javadoc_from_java
# Sample input with a Javadoc block, single-line comments and an inline /* ... */ comment.
java_code = """
public class Example {
    /**
     * This is a Javadoc comment
     */
    public void method() {
        // This is a single-line comment
        int x = 1; /* This is a multi-line comment */
        int y = 2; // This is another // single-line comment
    }
}
"""

# Print the cleaned source so it can be compared with the input above.
cleaned_code = remove_comments_and_javadoc_from_java(java_code)
print(cleaned_code)


public class Example {
    public void method() {
        int x = 1; 
        int y = 2; 
    }
}



# Get Exact Match Scores

In [26]:
# Root directory of the answer files that the evaluation cell below reads.
OUTPUT_DIR = 'generated_outputs'

In [27]:
# For every (model, prompt template) pair with an answers file: report exact-match counts under
# several normalisations, the easy/hard split, the average Levenshtein distance and the average
# number of deleted/inserted lines, and collect one summary row per pair.
model_template_em = {}
model_randIndex_answer = {}
data = []
# Built once: rebuilding this set for every answer made the filter below quadratic.
known_rand_indexes = set(df['rand_index'])
for model_name in dataset_models[DATASET]: 
    model_template_em[model_name] = {}
    for prompt_template in dataset_prompts[DATASET]:
        file_path = f'{OUTPUT_DIR}/{DATASET}/{model_name}/answers_{dataset_testpart[DATASET]}_{prompt_template}.json'
        if not os.path.exists(file_path):
            continue
        randIndex_answer = load_from_json(file_path)
        randIndex_answer = {int(key): value for key, value in randIndex_answer.items()}
        randIndex_answer = {k: v for k, v in randIndex_answer.items() if k in known_rand_indexes} # filter out items not exist in df
        if not randIndex_answer:
            # Nothing to score; the averages below would otherwise divide by zero.
            print(model_name, 'with', prompt_template, '- no answers match the rows of df, skipped.')
            continue
        # Only the answers of the last evaluated template are kept per model.
        model_randIndex_answer[model_name] = randIndex_answer
        # Answers of the codeT5 models are used as they are; for the other models extract_code is applied first.
        if 'codet5' in model_name.lower():
            extract_code_from_answer=False
        else:
            extract_code_from_answer=True            
        print(model_name, 'with', prompt_template)
        randIndex_distance, em, black_failed, black_failed_em = get_exact_matches(df, randIndex_answer, LANGUAGE, DATASET, extract_code_from_answer, ignore_whitespace=False, format_code=False)
        print('  Exact Match without any changes:',len(em))
        randIndex_distance, em, black_failed, black_failed_em = get_exact_matches(df, randIndex_answer, LANGUAGE, DATASET, extract_code_from_answer, ignore_whitespace=True)
        print('  Exact Match by ignoring whitespaces:',len(em))
        # Disabled variant kept for reference; change the condition to True to report it as well.
        if False:
            print('  Considering imports, code comments, and docstrings:')
            randIndex_distance, em, black_failed, black_failed_em = get_exact_matches(df, randIndex_answer, LANGUAGE, DATASET, extract_code_from_answer)
            print('    Exact Match (all):',len(em), list(em.keys()))
            print('    Failed by black:',len(black_failed))
            print('    Failed by black but is exact match by ignoring whitespaces:',len(black_failed_em), list(black_failed_em.keys()))
        print('  Ignoring imports, code comments, and docstrings:')
        # The results of this last call feed the summary table and the oracle computation.
        randIndex_distance, em, black_failed, black_failed_em = get_exact_matches(df, randIndex_answer, LANGUAGE, DATASET, extract_code_from_answer, ignore_docstrings_and_comments=True)
        model_template_em[model_name][prompt_template] = set(em.keys())
        print('    Exact Match (all):',len(em)) # , list(em.keys()))
        if LANGUAGE=='Python':
            print('    Failed by black:',len(black_failed))
            print('    Failed by black but is exact match by ignoring whitespaces:',len(black_failed_em), list(black_failed_em.keys()))
        print('    Difficulty group:')
        items_easy = get_itmes_having_specific_number_of_inserted_lines(df,em.keys(),0,2)
        print('      Items with at most two added lines in ground truth:', len(items_easy))
        items_hard = get_itmes_having_specific_number_of_inserted_lines(df,em.keys(),3,1000)
        print('      Items with three or more added lines in ground truth:', len(items_hard))
        avg_leven_dist = sum(randIndex_distance.values()) / len(randIndex_distance)
        print('Average of Levenshtein Distance:', avg_leven_dist)
        # get the average number of deleted and inserted lines
        deletes = []
        inserts = []
        for indx, answer in randIndex_answer.items():
            if extract_code_from_answer:
                answercode = extract_code(answer,LANGUAGE)
            else:
                answercode = answer
            inputcode = df[df['rand_index']==indx]['containing_method_before_repayment'].tolist()[0]
            # NOTE(review): elsewhere in this notebook the helper is called as (before, after); here the
            # generated answer comes first and the input code second, so "deleted" and "inserted" may be
            # swapped in the reported averages. Left unchanged to keep the published numbers - confirm.
            deleted,inserted = get_deleted_and_inserted_lines(answercode,inputcode)
            deletes.append(len(deleted))
            inserts.append(len(inserted))
        avg_deletes = sum(deletes)/len(deletes)
        avg_inserts = sum(inserts)/len(inserts)
        print('-'*50)
        data.append([model_name, prompt_template, avg_deletes, avg_inserts, len(em)/len(randIndex_answer), len(items_easy)/len(randIndex_answer), len(items_hard)/len(randIndex_answer), avg_leven_dist])
    print('='*50)
    
# convert data to df
df_EM = pd.DataFrame(data, columns=['Model', 'Template', 'Avg deletes', 'Avg inserts', 'EM', 'EM on Easy', 'EM on Hard', 'Avg Levenshtein Distance'])
display(df_EM)

Llama3.1-8B with Mastropaolo-T2
  Exact Match without any changes: 20
  Exact Match by ignoring whitespaces: 30
  Ignoring imports, code comments, and docstrings:
    Exact Match (all): 36
    Difficulty group:
      Items with at most two added lines in ground truth: 33
      Items with three or more added lines in ground truth: 3
Average of Levenshtein Distance: 198.639
--------------------------------------------------
Llama3.1-8B with NoExplain
  Exact Match without any changes: 28
  Exact Match by ignoring whitespaces: 39
  Ignoring imports, code comments, and docstrings:
    Exact Match (all): 57
    Difficulty group:
      Items with at most two added lines in ground truth: 54
      Items with three or more added lines in ground truth: 3
Average of Levenshtein Distance: 220.384
--------------------------------------------------
Llama3.1-8B with CoT1
  Exact Match without any changes: 22
  Exact Match by ignoring whitespaces: 30
  Ignoring imports, code comments, and docstrings:


Unnamed: 0,Model,Template,Avg deletes,Avg inserts,EM,EM on Easy,EM on Hard,Avg Levenshtein Distance
0,Llama3.1-8B,Mastropaolo-T2,2.161,2.461,0.036,0.033,0.003,198.639
1,Llama3.1-8B,NoExplain,4.391,3.538,0.057,0.054,0.003,220.384
2,Llama3.1-8B,CoT1,7.317,5.468,0.054,0.051,0.003,283.399
3,Llama3.1-8B,CoT2,8.682,5.091,0.053,0.048,0.005,292.799
4,Gemma-2-9B,Mastropaolo-T2,1.795,2.606,0.058,0.055,0.003,190.973
5,Gemma-2-9B,NoExplain,2.718,4.145,0.088,0.083,0.005,209.015
6,Gemma-2-9B,CoT1,7.638,8.328,0.058,0.053,0.005,335.632
7,Gemma-2-9B,CoT2,7.398,7.895,0.058,0.053,0.005,318.087
8,DeepSeek-Coder-V2-Lite-Instruct,Mastropaolo-T2,1.322,2.455,0.037,0.034,0.003,197.232
9,DeepSeek-Coder-V2-Lite-Instruct,NoExplain,4.009,3.697,0.062,0.056,0.006,205.433


In [28]:
# Optimal Performance with an Oracle Template (Discussion 6.1)
# For each model, take the union over all templates of the items that were exact matches,
# i.e. the items solved by at least one template, and report it as a fraction of df.
model_oracle = {}
if RQ=='RQ2':
    for model_name in dataset_models[DATASET]:
        model_oracle[model_name] = set()
        for prompt_template in dataset_prompts[DATASET]:
            # The evaluation cell skips templates whose answers file is missing, so a
            # (model, template) pair may have no entry; treat it as "no exact matches".
            template_em = model_template_em.get(model_name, {}).get(prompt_template, set())
            model_oracle[model_name] = model_oracle[model_name] | template_em
    print({model:len(oracle)/len(df) for model,oracle in model_oracle.items()})

{'Llama3.1-8B': 0.1, 'Gemma-2-9B': 0.106, 'DeepSeek-Coder-V2-Lite-Instruct': 0.093, 'Llama3.1-70B': 0.111, 'gpt-4o-mini-2024-07-18': 0.106}


# BLEU and CrystalBLEU

In [29]:
from collections import Counter
from nltk.util import ngrams

# Number of most frequent n-grams that are treated as "trivially shared"
k=500

# Whitespace-tokenize every ground-truth method and flatten the result into a
# single token stream. The n-grams below are taken over this concatenated
# stream, so some of them span the boundary between two consecutive methods.
tokenized_corpus = [
    token
    for method_code in df['containing_method_after_repayment'].tolist()
    for token in method_code.split()
]

# Collect all 1-grams, then 2-grams, 3-grams and 4-grams
all_ngrams = []
for n in range(1, 5):
    all_ngrams.extend(ngrams(tokenized_corpus, n))

frequencies = Counter(all_ngrams)
trivially_shared_ngrams = dict(frequencies.most_common(k))
# show the first 10 items
# (the loop variables must not be called `k`: that would overwrite the
# configuration value above with an n-gram tuple once the cell has run)
for ngram, count in list(trivially_shared_ngrams.items())[:10]: print(f"{ngram}: {count}")

('=',): 3654
('{',): 3653
('}',): 3625
('if',): 1452
('new',): 1156
('//',): 1011
('return',): 922
('public',): 775
('}', '}'): 762
('+',): 756


In [30]:
# NOTE: run this cell only for RQ2
# Calculate BLEU, CrystalBLEU, and line-level exact match (in terms of precision, recall, and f-score) on the whole code (not diff)
REMOVE_ICD = True # remove imports, comments, and docstrings (javadoc)

def _average(score_by_index):
    """Return the arithmetic mean of the values of a {rand_index: score} dict.

    Returns NaN instead of raising ZeroDivisionError when the dict is empty
    (e.g. an answer file that contains no item present in df).
    """
    if len(score_by_index) == 0:
        return float('nan')
    return sum(score_by_index.values()) / len(score_by_index)

# Build both lookups in a single pass over df. setdefault keeps the FIRST row
# of each rand_index, which is what filtering df per index and taking element
# [0] returned, without re-scanning the whole frame for every row.
randIndex_reference = {}
randIndex_input = {}
for i, after_code, before_code in zip(df['rand_index'],
                                      df['containing_method_after_repayment'],
                                      df['containing_method_before_repayment']):
    randIndex_reference.setdefault(i, after_code)
    randIndex_input.setdefault(i, before_code)
known_rand_indices = set(randIndex_reference) # built once, not once per answer key

data = []
model_template_metric_randIndex = {}
for model_name in dataset_models[DATASET]:
    model_template_metric_randIndex[model_name] = {}
    for prompt_template in dataset_prompts[DATASET]:
        # created before the existence check, so a template without an answer file keeps an empty entry
        model_template_metric_randIndex[model_name][prompt_template] = {}
        metrics = model_template_metric_randIndex[model_name][prompt_template]
        # load and prepare model answer
        # OUTPUT_DIR is the same setting the diff-metrics cell uses for these files,
        # instead of an absolute path that only exists on one machine
        file_path = f'{OUTPUT_DIR}/{DATASET}/{model_name}/answers_{dataset_testpart[DATASET]}_{prompt_template}.json'
        if not os.path.exists(file_path):
            continue
        randIndex_answer = load_from_json(file_path)
        randIndex_answer = {int(key): value for key, value in randIndex_answer.items()} # JSON keys are strings
        randIndex_answer = {key: value for key, value in randIndex_answer.items() if key in known_rand_indices} # filter out items not exist in df
        randIndex_answercode = {i:extract_code(answer, LANGUAGE) for i,answer in randIndex_answer.items()}
        # calculate BLEU
        BLEU = get_BLEU(randIndex_reference, randIndex_answercode, LANGUAGE, REMOVE_ICD)
        metrics['BLEU'] = BLEU
        avg_BLEU = _average(BLEU)
        # calculate CrystalBLEU
        crystalBLEU = get_crystalBLEU(randIndex_reference, randIndex_answercode, LANGUAGE, REMOVE_ICD, trivially_shared_ngrams)
        metrics['CrystalBLEU'] = crystalBLEU
        avg_crystalBLEU = _average(crystalBLEU)
        # calculate lineP, lineR, lineF
        lineP, lineR, lineF = get_linePRF(randIndex_reference, randIndex_answercode, LANGUAGE, REMOVE_ICD)
        metrics['lineP'] = lineP
        metrics['lineR'] = lineR
        metrics['lineF'] = lineF
        avg_lineP = _average(lineP)
        avg_lineR = _average(lineR)
        avg_lineF = _average(lineF)
        # store data
        data.append([model_name, prompt_template, avg_BLEU, avg_crystalBLEU, avg_lineP, avg_lineR, avg_lineF])
# convert data to df
df_BLEU = pd.DataFrame(data, columns=['Model', 'Template', 'BLEU', 'CrystalBLEU', 'LineP', 'lineR', 'lineF'])
display(df_BLEU)

# Baseline: score the unmodified input method as if it were the model answer
print("\nBaseline - Use containing_method_before_repayment as candidate:")
BLEU = get_BLEU(randIndex_reference, randIndex_input, LANGUAGE, REMOVE_ICD)
crystalBLEU = get_crystalBLEU(randIndex_reference, randIndex_input, LANGUAGE, REMOVE_ICD, trivially_shared_ngrams)
lineP, lineR, lineF = get_linePRF(randIndex_reference, randIndex_input, LANGUAGE, REMOVE_ICD)
model_template_metric_randIndex['input'] = {'BLEU':BLEU, 'CrystalBLEU':crystalBLEU, 'lineP':lineP, 'lineR':lineR, 'lineF':lineF}
print(f'  Average BLEU: {_average(BLEU):.3f}')
print(f'  Average crystal BLEU: {_average(crystalBLEU):.3f}')
print(f'  Average lineP: {_average(lineP):.3f}')
print(f'  Average lineR: {_average(lineR):.3f}')
print(f'  Average lineF: {_average(lineF):.3f}')

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,Model,Template,BLEU,CrystalBLEU,LineP,lineR,lineF
0,Llama3.1-8B,Mastropaolo-T2,0.571558,0.60978,0.685796,0.635833,0.636843
1,Llama3.1-8B,NoExplain,0.580184,0.634841,0.646905,0.637976,0.624444
2,Llama3.1-8B,CoT1,0.521046,0.578498,0.579323,0.600717,0.565996
3,Llama3.1-8B,CoT2,0.514933,0.571604,0.549777,0.605577,0.553629
4,Gemma-2-9B,Mastropaolo-T2,0.584565,0.632057,0.736707,0.655819,0.671136
5,Gemma-2-9B,NoExplain,0.583495,0.636123,0.710583,0.633125,0.645579
6,Gemma-2-9B,CoT1,0.469336,0.527929,0.554741,0.544667,0.519439
7,Gemma-2-9B,CoT2,0.482686,0.541188,0.560446,0.55585,0.529159
8,DeepSeek-Coder-V2-Lite-Instruct,Mastropaolo-T2,0.571782,0.603556,0.715122,0.639117,0.650576
9,DeepSeek-Coder-V2-Lite-Instruct,NoExplain,0.58248,0.62709,0.662053,0.62641,0.621741



Baseline - Use containing_method_before_repayment as candidate:
  Average BLEU: 0.564
  Average crystal BLEU: 0.611
  Average lineP: 0.740
  Average lineR: 0.655
  Average lineF: 0.669


In [31]:
# Calculate BLEU-diff, CrystalBLEU-diff, and LEMOD 
# The metrics are computed on the updated-or-new lines (relative to the method
# before repayment) of the ground truth versus those of each model answer.
import math
REMOVE_ICD = True # remove imports, comments, and docstrings (javadoc)

data_groups = ['all'] # ['easy','hard','all']
hard_group = df.loc[df['number_of_inserted_lines'] >= 3, 'rand_index'].tolist()
hard_indices = set(hard_group) # set for O(1) membership tests in the loops below


def _strip_icd_together(*codes):
    """Strip imports, comments and docstrings/javadoc from all given snippets.

    All snippets are processed with the SAME strategy: for Python the AST-based
    remover is tried on every snippet and, if it fails on any of them, every
    snippet is re-processed with the regex-based remover. This keeps the two
    sides of a later diff on the same cleaning path. Returns the snippets
    unchanged when REMOVE_ICD is False.

    Raises:
        ValueError: if LANGUAGE is neither 'Python' nor 'Java'.
    """
    if not REMOVE_ICD:
        return list(codes)
    if LANGUAGE=='Python':
        try:
            return [remove_imports(remove_docstrings_and_comments_by_ast_from_python(code)) for code in codes]
        except Exception: # was a bare `except:`, which also swallowed KeyboardInterrupt
            return [remove_imports(remove_docstrings_and_comments_by_regex_from_python(code)) for code in codes]
    if LANGUAGE=='Java':
        return [remove_imports(remove_comments_and_javadoc_from_java(code)) for code in codes]
    raise ValueError('Language not supported')


def _mean_score(score_by_index):
    """Return the mean of the values of a {rand_index: score} dict (NaN when it is empty)."""
    if len(score_by_index) == 0:
        return math.nan
    return sum(score_by_index.values()) / len(score_by_index)


# One pass over df instead of filtering the whole frame for every item;
# setdefault keeps the first row of each rand_index, as `.tolist()[0]` did.
before_by_index = {}
after_by_index = {}
for i, before_code, after_code in zip(df['rand_index'],
                                      df['containing_method_before_repayment'],
                                      df['containing_method_after_repayment']):
    before_by_index.setdefault(i, before_code)
    after_by_index.setdefault(i, after_code)
known_rand_indices = set(before_by_index)

# Ground-truth diff lines. They are built for every item of df, so they do not
# depend on whichever answer file an earlier cell happened to load last.
randIndex_reference = {}
for i, after_code in after_by_index.items():
    containing_method_after_repayment, containing_method_before_repayment = _strip_icd_together(after_code, before_by_index[i])
    target_updated_or_new_lines = get_updated_or_new_lines(containing_method_before_repayment, containing_method_after_repayment, DATASET)
    randIndex_reference[i] = "\n".join(target_updated_or_new_lines)

data = []
# NOTE(review): this reset discards any metrics a previous cell stored under the
# same name (whole-code 'BLEU', 'CrystalBLEU', 'lineP', ... and the 'input'
# baseline) and makes the "not in" guards below always true. If the intent was
# to add the *_diff metrics next to the existing ones, drop the reset - confirm.
model_template_metric_randIndex = {}
for model_name in dataset_models[DATASET]: 
    if model_name not in model_template_metric_randIndex:
        model_template_metric_randIndex[model_name] = {}
    for prompt_template in dataset_prompts[DATASET]:
        # load and prepare model answer (once per template, not once per data group)
        file_path = f'{OUTPUT_DIR}/{DATASET}/{model_name}/answers_{dataset_testpart[DATASET]}_{prompt_template}.json'
        if not os.path.exists(file_path):
            continue
        if prompt_template not in model_template_metric_randIndex[model_name]:
            model_template_metric_randIndex[model_name][prompt_template] = {}
        metrics = model_template_metric_randIndex[model_name][prompt_template]
        randIndex_answer = load_from_json(file_path)
        randIndex_answer = {int(key): value for key, value in randIndex_answer.items()} # JSON keys are strings
        randIndex_answer = {key: value for key, value in randIndex_answer.items() if key in known_rand_indices} # filter out items not exist in df
        randIndex_answercode = {i:extract_code(answer, LANGUAGE) for i,answer in randIndex_answer.items()}
        for data_group in data_groups:
            # create randIndex_candidate: diff lines of the model answer for the items of this group
            randIndex_candidate = {}
            for i in randIndex_answer.keys():
                in_group = (
                    data_group=='all'
                    or (data_group=='hard' and i in hard_indices)
                    or (data_group=='easy' and i not in hard_indices)
                )
                if not in_group:
                    continue
                answercode, containing_method_before_repayment = _strip_icd_together(randIndex_answercode[i], before_by_index[i])
                model_updated_or_new_lines = get_updated_or_new_lines(containing_method_before_repayment, answercode, DATASET)
                randIndex_candidate[i] = "\n".join(model_updated_or_new_lines)
            # calculate BLEU
            BLEU = get_BLEU(randIndex_reference, randIndex_candidate, LANGUAGE, REMOVE_ICD)
            metrics['BLEU_diff'] = BLEU
            avg_BLEU = _mean_score(BLEU)
            # calculate crystalBLEU
            crystalBLEU = get_crystalBLEU(randIndex_reference, randIndex_candidate, LANGUAGE, REMOVE_ICD, trivially_shared_ngrams)
            metrics['CrystalBLEU_diff'] = crystalBLEU
            avg_crystalBLEU = _mean_score(crystalBLEU)
            # calculate precision, recall, F1 over lines
            lineP, lineR, lineF = get_linePRF(randIndex_reference, randIndex_candidate, LANGUAGE, REMOVE_ICD)
            metrics['lineP_diff'] = lineP
            metrics['lineR_diff'] = lineR
            metrics['lineF_diff'] = lineF
            avg_lineP = _mean_score(lineP)
            avg_lineR = _mean_score(lineR)
            avg_lineF = _mean_score(lineF)
            # store data
            data.append([model_name, prompt_template, data_group, avg_BLEU, avg_crystalBLEU, avg_lineP, avg_lineR, avg_lineF])
# convert data to df
df_BLEU_diff_ignor_comments = pd.DataFrame(data, columns=['Model', 'Template', 'Group', 'BLEU', 'CrystalBLEU', 'lineP', 'lineR', 'lineF'])
display(df_BLEU_diff_ignor_comments)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,Model,Template,Group,BLEU,CrystalBLEU,lineP,lineR,lineF
0,Llama3.1-8B,Mastropaolo-T2,all,0.180133,0.216478,0.333479,0.236255,0.232653
1,Llama3.1-8B,NoExplain,all,0.291211,0.35499,0.407989,0.347778,0.33561
2,Llama3.1-8B,CoT1,all,0.296959,0.373835,0.364941,0.399506,0.334751
3,Llama3.1-8B,CoT2,all,0.292113,0.371757,0.342464,0.404261,0.322776
4,Gemma-2-9B,Mastropaolo-T2,all,0.172749,0.198981,0.333207,0.199291,0.219007
5,Gemma-2-9B,NoExplain,all,0.278142,0.33525,0.467546,0.327684,0.340716
6,Gemma-2-9B,CoT1,all,0.294469,0.370684,0.348414,0.422225,0.332481
7,Gemma-2-9B,CoT2,all,0.297856,0.371265,0.353498,0.41412,0.335022
8,DeepSeek-Coder-V2-Lite-Instruct,Mastropaolo-T2,all,0.140643,0.162863,0.237744,0.185012,0.174336
9,DeepSeek-Coder-V2-Lite-Instruct,NoExplain,all,0.263212,0.326643,0.390297,0.338133,0.31262
