In [None]:
from time import time
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from IPython.display import display, Markdown

In [None]:
%%capture
!pip install rouge

In [None]:
import json
import re
import time
import traceback
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')

In [None]:
# Local directory of the checkpoint; the tokenizer and the model weights are
# both loaded from this same location.
model_id = "/kaggle/input/llama-3.1/transformers/8b/1"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The weights are requested in float16 with device_map="auto".
# NOTE(review): trust_remote_code=True allows code shipped with a checkpoint
# to run at load time — confirm it is actually needed for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)


In [None]:
# Text-generation pipeline built around the model and tokenizer loaded above.
# Generation settings are supplied here at construction time; calls further
# down in the notebook also pass max_new_tokens / temperature per call.
# NOTE(review): torch_dtype and device_map are passed again although `model`
# is already an instantiated object — confirm they have any effect here.
nlp_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    do_sample=True,          # sample instead of decoding greedily
    temperature=0.3,        # sampling temperature
    top_p=0.85,              # nucleus-sampling threshold
    max_new_tokens=50,       # cap on the number of generated tokens
    pad_token_id=tokenizer.eos_token_id,  # the EOS token id doubles as padding id
    eos_token_id=tokenizer.eos_token_id   # generation stops on the tokenizer's EOS token
)


In [None]:
# input_file: dataset read line by line below (each line is parsed as JSON).
# output_file: results file; every write in this notebook opens it in append mode.
input_file = '/kaggle/input/javafull/java.jsonl'  
output_file = '/kaggle/working/javaOutputLLamav_FULL.jsonl'

In [None]:
# Pull the whole dataset into memory: one list entry per raw line of the file
# (line terminators are kept; parsing happens later, record by record).
with open(input_file, mode='r', encoding='UTF-8') as f:
    json_data = list(f)


In [None]:
def clean_model_output(output):
    """Reduce raw generated text to a single short commit message.

    The steps are applied in this order:
      1. Keep only what follows the last ``Commit message:`` marker, if any.
      2. If the text contains a pair of double quotes, keep what lies between
         the first and the last one.
      3. Cut the text after its first period (the period itself is kept).

    Args:
        output: Text returned by the model; it may still contain the prompt.

    Returns:
        The cleaned message with surrounding whitespace removed.
    """
    marker = "Commit message:"
    if marker in output:
        output = output.split(marker)[-1].strip()

    # Unwrap a quoted message. A *pair* of quotes is required: with a single
    # stray quote find() and rfind() return the same index, and slicing
    # between them would wipe out the whole message.
    start = output.find('"')
    end = output.rfind('"')
    if start != -1 and end > start:
        output = output[start + 1:end].strip()

    # Keep only the first sentence.
    if "." in output:
        output = output.split(".")[0] + "."

    return output.strip()  # Final cleanup

def query_model(diff):
    """Ask the module-level pipeline for a commit message describing ``diff``.

    The prompt is the diff, a ``###`` separator line and the
    ``Commit message:`` cue. The first generation returned by the pipeline is
    passed through ``clean_model_output`` before it is handed back.

    Returns:
        The cleaned message, or ``None`` if anything raised along the way
        (the error and its traceback are printed rather than propagated).
    """
    try:
        prompt = f"{diff}\n###\nCommit message:"

        # NOTE(review): stop_sequence is given a list here; the pipeline
        # documentation describes it as a single string — confirm that both
        # entries are honoured as stop markers.
        generations = nlp_pipeline(
            prompt,
            max_new_tokens=30,
            stop_sequence=["###", "\n"],
            temperature=0.2,
        )
        generated_text = generations[0]['generated_text']

        return clean_model_output(generated_text)
    except Exception as e:
        print("Error:", str(e))
        traceback.print_exc()
        return None


In [None]:
def calculate_meteor(sentence1, sentence2):
    """Length-weighted bag-of-words similarity of two sentences.

    Despite its name this function does not call a METEOR implementation. It
    computes the cosine similarity of the two sentences' word-count vectors
    and multiplies it by ``2 * len(a) * len(b) / (len(a) + len(b))``, where
    ``len`` is the length in characters. The result is therefore not confined
    to the range [0, 1].

    Args:
        sentence1: First sentence (string).
        sentence2: Second sentence (string).

    Returns:
        The score as a float. 0.0 is returned when either sentence is blank
        or when neither sentence yields a token for the vectorizer.
    """
    # A blank sentence has no overlap with anything. Returning early also
    # avoids the vectorizer failing on two blank inputs.
    if not sentence1.strip() or not sentence2.strip():
        return 0.0

    try:
        vectorizer = CountVectorizer().fit([sentence1, sentence2])
    except ValueError:
        # fit() rejects input from which no vocabulary can be built.
        return 0.0

    sentence1_vector = vectorizer.transform([sentence1])
    sentence2_vector = vectorizer.transform([sentence2])
    similarity = cosine_similarity(sentence1_vector, sentence2_vector)[0][0]
    score = 2 * similarity * len(sentence1) * len(sentence2) / (len(sentence1) + len(sentence2))
    return score

def calculate_bleu(reference, translation):
    """Sentence-level BLEU of ``translation`` against a single ``reference``.

    ``sentence_bleu`` operates on token sequences. Handing it raw strings
    makes it iterate over characters, so the score would be computed on
    character n-grams. Strings are therefore split on whitespace first;
    input that is already a token sequence is passed through unchanged.

    NOTE(review): no smoothing function is applied, so short messages without
    higher-order n-gram matches will score close to 0 — consider whether a
    smoothing function should be used.

    Args:
        reference: Reference message (string or sequence of tokens).
        translation: Generated message (string or sequence of tokens).

    Returns:
        The BLEU score returned by ``sentence_bleu``.
    """
    reference_tokens = reference.split() if isinstance(reference, str) else list(reference)
    hypothesis_tokens = translation.split() if isinstance(translation, str) else list(translation)
    return sentence_bleu([reference_tokens], hypothesis_tokens)

def calculate_rouge_l(reference, translation):
    """ROUGE-L scores of ``translation`` (hypothesis) against ``reference``.

    Args:
        reference: Reference message (string).
        translation: Generated message (string).

    Returns:
        The ``'rouge-l'`` entry of the scorer's result; callers in this
        notebook read its ``'f'`` value. When either text is blank, or the
        scorer rejects the input with a ValueError, a dictionary with
        ``'r'``, ``'p'`` and ``'f'`` all set to 0.0 is returned so that an
        empty generation is scored as a miss instead of raising.
    """
    no_overlap = {"r": 0.0, "p": 0.0, "f": 0.0}

    # Nothing to compare: report zero overlap rather than calling the scorer.
    if not reference.strip() or not translation.strip():
        return no_overlap

    rouge = Rouge()
    try:
        return rouge.get_scores(translation, reference, avg=True)['rouge-l']
    except ValueError:
        # NOTE(review): presumably raised for texts without any sentence
        # content (e.g. only punctuation) — confirm against the rouge package.
        return no_overlap

In [None]:
def is_camel_case(s):
    """Tell whether ``s`` is mixed-case and free of underscores.

    A string qualifies when it is neither equal to its all-lowercase form nor
    to its all-uppercase form, and contains no ``_`` character.
    """
    if "_" in s:
        return False
    single_case = (s == s.lower()) or (s == s.upper())
    return not single_case

def to_Underline(x):
    """Split a mixed-case identifier into lower-case, space-separated words.

    A space is inserted before an upper-case letter that follows a lower-case
    letter, and before an upper-case letter (not at the start of the string)
    that is followed by a lower-case letter. The result is lower-cased, e.g.
    ``getHTTPResponse`` becomes ``get http response``. Note that, despite the
    name, the separator is a space and not an underscore.

    Args:
        x: The identifier or word to split.

    Returns:
        The lower-cased string with the inserted spaces.
    """
    # Raw strings: the replacement contains a backslash group reference, which
    # is an invalid escape sequence in a normal string literal.
    return re.sub(r'(?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])', r' \g<0>', x).lower()

def get_tokens(text, max_tokens=1024):
    """Tokenize ``text`` with NLTK and re-join at most ``max_tokens`` tokens.

    Args:
        text: The text to tokenize.
        max_tokens: Upper bound on the number of NLTK word tokens kept
            (default 1024, the limit this notebook has always used). Pass
            ``None`` to keep every token.

    Returns:
        The kept tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(text)
    # Slicing is a no-op for shorter inputs, so no length check is needed.
    return ' '.join(tokens[:max_tokens])

def remove_between_identifiers(text, identifier_start, identifier_end):
    """Delete the text between each ``identifier_start`` and the next ``identifier_end``.

    Both delimiters are kept; only what lies between them is removed. The
    match is non-greedy and does not span line breaks. Afterwards a fixed set
    of spacing patterns is collapsed: ``' . '`` to ``'.'``, two spaces to one,
    ``' = '`` to ``'='`` and ``' ; '`` to ``';'``.

    Args:
        text: The text to process.
        identifier_start: Literal string marking where removal begins.
        identifier_end: Literal string marking where removal ends.

    Returns:
        The processed text.
    """
    # Escape the delimiters so they are matched literally; unescaped, a
    # delimiter such as "(" or "+" would be read as regex syntax.
    pattern = f'(?<={re.escape(identifier_start)}).*?(?={re.escape(identifier_end)})'
    result = re.sub(pattern, '', text)
    result = result.replace(' . ', '.').replace('  ', ' ').replace(' = ', '=').replace(' ; ', ';')
    return result


In [None]:
# Placeholder record with every field zeroed; it is appended to the results
# file as one JSON line before any real result is written.
initial_data = {
    "diff_id": 0,
    "msg": "0",
    "msgGPT": "0",
    "METEOR Score": "0",
    "BLEU Score": "0",
    "ROUGE-L Score": "0",
}
with open(output_file, 'a', encoding='UTF-8') as f:
    f.write(json.dumps(initial_data) + '\n')

In [None]:

# Evaluation pass: for every dataset line, generate a commit message with
# query_model(), score it against the reference message and append one JSON
# line with the scores to output_file. Each record gets up to 3 attempts.
for item in json_data:
    # Reset per record so the error handler below never hits an unbound name
    # (first record) or reports the id of the previous record.
    diff_id = None
    attempts = 0
    while attempts < 3:
        try:
            data = json.loads(item)
            diff_id = data['diff_id']
            diff = data['diff']

            # Remove what follows the "mmm a" / "ppp b" headers up to the
            # next <nl>, then tokenize and truncate the diff.
            result = remove_between_identifiers(diff, 'mmm a', '<nl>')
            diff = get_tokens(remove_between_identifiers(result, 'ppp b', '<nl>'))
            msg = data['msg']

            # Split mixed-case words of the reference message into
            # lower-case words.
            words = msg.split()
            msg_list = [to_Underline(word) if is_camel_case(word) else word for word in words]
            msg = ' '.join(msg_list)

            # query_model() returns None when generation failed; count that
            # as a failed attempt and try again.
            msgGPT = query_model(diff)
            if msgGPT is None:
                attempts += 1
                if attempts == 3:
                    print(f"Failed to process item after 3 attempts: {item}")
                continue

            # query_model() already returns the cleaned message. Cleaning it
            # a second time could strip a legitimately quoted part of the
            # message, so the value is used as is.

            bleu_score = calculate_bleu(msg, msgGPT)
            rouge_l_score = calculate_rouge_l(msg, msgGPT)
            meteor_score = calculate_meteor(msg, msgGPT)

            merged_dict = {
                "diff_id": diff_id,
                "msg": msg,
                "msgGPT": msgGPT,
                "METEOR Score": meteor_score,
                "BLEU Score": bleu_score,
                "ROUGE-L Score": rouge_l_score['f']
            }

            with open(output_file, 'a', encoding='UTF-8') as f:
                json.dump(merged_dict, f)
                f.write('\n')

            # NOTE(review): fixed pause after every record; the model runs
            # in-process, so confirm whether this delay is still needed.
            time.sleep(5)
            break

        except Exception as e:
            print(f"Error processing diff_id {diff_id}: {str(e)}")
            traceback.print_exc()
            attempts += 1
            if attempts == 3:
                print(f"Failed to process item after 3 attempts: {item}")


In [None]:

# NOTE(review): this cell performs the same evaluation pass as the preceding
# cell. Running both appends a second result line for every record to
# output_file — confirm that this repetition is intended.
for item in json_data:
    # Reset per record so the error handler below never hits an unbound name
    # (first record) or reports the id of the previous record.
    diff_id = None
    attempts = 0
    while attempts < 3:
        try:
            data = json.loads(item)
            diff_id = data['diff_id']
            diff = data['diff']

            # Remove what follows the "mmm a" / "ppp b" headers up to the
            # next <nl>, then tokenize and truncate the diff.
            result = remove_between_identifiers(diff, 'mmm a', '<nl>')
            diff = get_tokens(remove_between_identifiers(result, 'ppp b', '<nl>'))
            msg = data['msg']

            # Split mixed-case words of the reference message into
            # lower-case words.
            words = msg.split()
            msg_list = [to_Underline(word) if is_camel_case(word) else word for word in words]
            msg = ' '.join(msg_list)

            # query_model() returns None when generation failed; count that
            # as a failed attempt and try again.
            msgGPT = query_model(diff)
            if msgGPT is None:
                attempts += 1
                if attempts == 3:
                    print(f"Failed to process item after 3 attempts: {item}")
                continue

            # query_model() already returns the cleaned message. Cleaning it
            # a second time could strip a legitimately quoted part of the
            # message, so the value is used as is.

            bleu_score = calculate_bleu(msg, msgGPT)
            rouge_l_score = calculate_rouge_l(msg, msgGPT)
            meteor_score = calculate_meteor(msg, msgGPT)

            merged_dict = {
                "diff_id": diff_id,
                "msg": msg,
                "msgGPT": msgGPT,
                "METEOR Score": meteor_score,
                "BLEU Score": bleu_score,
                "ROUGE-L Score": rouge_l_score['f']
            }

            with open(output_file, 'a', encoding='UTF-8') as f:
                json.dump(merged_dict, f)
                f.write('\n')

            # NOTE(review): fixed pause after every record; the model runs
            # in-process, so confirm whether this delay is still needed.
            time.sleep(5)
            break

        except Exception as e:
            print(f"Error processing diff_id {diff_id}: {str(e)}")
            traceback.print_exc()
            attempts += 1
            if attempts == 3:
                print(f"Failed to process item after 3 attempts: {item}")


In [None]:
# Evaluation pass with an explicit instruction prompt: the pipeline is called
# directly, the last line of its answer is taken as the message, and the
# scores are rounded to two decimals and stored as strings. Each record gets
# up to 3 attempts.
for item in json_data:
    attempts = 0
    while attempts < 3:
        try:
            data = json.loads(item)
            diff_id = data['diff_id']
            diff = data['diff']

            # Remove what follows the "mmm a" / "ppp b" headers up to the
            # next <nl>, then tokenize and truncate the diff.
            result = remove_between_identifiers(diff, 'mmm a', '<nl>')
            diff = get_tokens(remove_between_identifiers(result, 'ppp b', '<nl>'))
            msg = data['msg']

            # Split mixed-case words of the reference message into
            # lower-case words.
            words = msg.split()
            msg_list = [to_Underline(word) if is_camel_case(word) else word for word in words]
            msg = ' '.join(msg_list)

            prompt = (
                f"{diff}\n"
                "###\n"
                "Please write a commit message that summarizes the code change above in a single sentence. "
                "The message should be concise, clear, and avoid general statements."
            )

            # Ask for the continuation only. With the prompt included in the
            # returned text, the "last line" taken below could contain the
            # instruction sentence whenever the model continues on the same
            # line, and that text would then be scored as the message.
            response = nlp_pipeline(
                prompt,
                max_new_tokens=50,
                return_full_text=False,
            )[0]['generated_text']

            # The last line of the generated text is used as the message.
            msgGPT = response.strip().split('\n')[-1].strip()

            bleu_score = round(calculate_bleu(msg, msgGPT), 2)
            rouge_l_score = round(calculate_rouge_l(msg, msgGPT)['f'], 2)
            meteor_score = round(calculate_meteor(msg, msgGPT), 2)

            merged_data = {
                "diff_id": diff_id,
                "msg": msg,
                "msgGPT": msgGPT,
                "METEOR Score": f"{meteor_score}",
                "BLEU Score": f"{bleu_score}",
                "ROUGE-L Score": f"{rouge_l_score}"
            }

            with open(output_file, 'a', encoding='UTF-8') as f:
                json.dump(merged_data, f)
                f.write('\n')

            time.sleep(2)
            break

        except Exception as e:
            traceback.print_exc()
            attempts += 1
            if attempts == 3:
                print(f"Failed to process item after 3 attempts: {item}")


In [None]:

# Single hand-written sample record used for the smoke test below. It has the
# fields the evaluation loops read ("diff_id", "diff", "msg") plus "repo",
# "sha" and "time"; every string value ends with a newline character.
test_data = {
    "diff_id": 44,
    "repo": "oracle/graal\n",
    "sha": "80c597d0510090d7b278ff0890db3ce303776f5f\n",
    "time": "2020-02-14T16:11:58Z\n",
    "diff": "mmm a / src / com . oracle . truffle . espresso / src / com / oracle / truffle / espresso / nodes / EspressoRootNode . java <nl> ppp b / src / com . oracle . truffle . espresso / src / com / oracle / truffle / espresso / nodes / EspressoRootNode . java <nl> public Object execute ( VirtualFrame frame ) { <nl> BytecodeNode bytecodeNode = getBytecodeNode ( ) ; <nl> bytecodeNode . methodMonitorEnter ( frame , monitor ) ; <nl> } else { <nl> + / / TODO ( Gregersen ) - register monitors on frames for non - bytecode methods <nl> InterpreterToVM . monitorEnter ( monitor ) ; <nl> } <nl> Object result ; <nl> public Object execute ( VirtualFrame frame ) { <nl> getBytecodeNode ( ) . monitorExit ( frame , monitor ) ; <nl> } else { <nl> + / / TODO ( Gregersen ) - exit monitors on frames for non - bytecode methods <nl> InterpreterToVM . monitorExit ( monitor ) ; <nl> } <nl> } <nl>\n",
    "msg": "Add a few todos for implementing monitor lookup ownership on native method frames\n"
}


# Smoke test on the sample record: preprocess its diff, generate a message
# and print it next to the reference message together with the three scores.
result = remove_between_identifiers(test_data['diff'], 'mmm a', '<nl>')
processed_diff = get_tokens(remove_between_identifiers(result, 'ppp b', '<nl>'))
msg = test_data['msg']


prompt = (
    f"Code change:\n{processed_diff}\n"
    "Commit message:"
)

# Ask for the continuation only. With the prompt included in the returned
# text, the last line would start with the "Commit message:" cue whenever the
# model continues on the same line, and that prefix would be scored too.
response = nlp_pipeline(
    prompt,
    max_new_tokens=50,
    return_full_text=False,
)[0]['generated_text']

# The last line of the generated text is used as the message.
msgGPT = response.strip().split('\n')[-1].strip()


print(f"Original Message: {msg}")
print(f"Generated Message: {msgGPT}")


bleu_score = round(calculate_bleu(msg, msgGPT), 2)
rouge_l_score = round(calculate_rouge_l(msg, msgGPT)['f'], 2)
meteor_score = round(calculate_meteor(msg, msgGPT), 2)

print(f"METEOR Score: {meteor_score}")
print(f"BLEU Score: {bleu_score}")
print(f"ROUGE-L Score: {rouge_l_score}")

In [None]:
# Load a second dataset file. This rebinds json_data, so any cell executed
# after this one iterates over these lines instead of the earlier input_file.
lan = '/kaggle/input/java-dataset/java.jsonl'
with open(lan, 'r', encoding='UTF-8') as f:
    json_data = f.readlines()

# NOTE(review): out_filename is not used by any code visible in this part of
# the notebook — presumably a later cell writes to it. The name mentions
# "13B_llama_3" while model_id above points to an 8b / llama-3.1 checkpoint;
# confirm which is right.
out_filename = '/kaggle/working/java_result_13B_llama_3.jsonl'
