<a href="https://www.kaggle.com/code/shadman200042144/dp-1-mistral-implementation?scriptVersionId=193693514" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install langchain-community langchain-core
!pip install -U bitsandbytes
!pip install accelerate
!pip install rouge

In [None]:
import torch
from transformers import BitsAndBytesConfig
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
import huggingface_hub
# login() is called without arguments, so no credential is embedded in the code.
huggingface_hub.login()
# SECURITY: an access token used to be pasted in a comment on this line.
# Credentials must never be stored in source or notebooks; treat that token as
# leaked, revoke it, and supply a fresh one at run time (e.g. from a secret store).

In [None]:
import nltk
# Download the 'punkt' tokenizer resource; get_tokens() below relies on
# nltk.word_tokenize.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead --
# confirm against the installed NLTK version.
nltk.download('punkt')


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import json
import re
import time
import traceback
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Quantization settings handed to the model loader: 8-bit weights, the default
# mixed-precision threshold, and FP32 offload to the CPU enabled.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Checkpoint shared by the tokenizer and the causal language model.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" lets the loader decide where each layer is placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", quantization_config=bnb_config
)

# Text-generation pipeline used by the main loop to produce commit messages.
nlp_pipeline = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

# Input dataset (one JSON record per line) and the results file.
input_file = '/kaggle/input/java-dataset/java.jsonl'
output_file = '/kaggle/working/javaOutput.jsonl'

# Read every line of the dataset up front; parsing happens per item later on.
with open(input_file, mode='r', encoding='UTF-8') as f:
    json_data = list(f)

# Functions for evaluation metrics
def calculate_meteor(sentence1, sentence2):
    """Return a length-weighted bag-of-words similarity between two sentences.

    The score is the cosine similarity of the two sentences' word-count
    vectors multiplied by ``2 * len1 * len2 / (len1 + len2)``, where ``len``
    is the *character* length of each sentence. It is therefore not bounded
    to ``[0, 1]``.

    NOTE(review): despite the name, this does not follow the published METEOR
    definition; the formula is kept unchanged so existing results stay
    comparable -- confirm whether a real METEOR implementation is wanted.

    Args:
        sentence1: First sentence (string).
        sentence2: Second sentence (string).

    Returns:
        The score as a float. ``0.0`` when neither sentence contains a token
        the vectorizer can count (previously this raised ``ValueError``).
    """
    try:
        vectorizer = CountVectorizer().fit([sentence1, sentence2])
    except ValueError as err:
        # CountVectorizer refuses to build an empty vocabulary (e.g. both
        # sentences are empty or hold only one-character tokens). With no
        # shared tokens the similarity is zero, so report that instead of
        # failing the whole record.
        if "empty vocabulary" in str(err):
            return 0.0
        raise

    sentence1_vector = vectorizer.transform([sentence1])
    sentence2_vector = vectorizer.transform([sentence2])
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    similarity = cosine_similarity(sentence1_vector, sentence2_vector)[0][0]

    # The vocabulary is non-empty here, so at least one sentence is non-empty
    # and the denominator cannot be zero.
    length_weight = len(sentence1) * len(sentence2) / (len(sentence1) + len(sentence2))
    return 2 * similarity * length_weight

def calculate_bleu(reference, translation, smoothing_function=None):
    """Return the sentence-level BLEU score of *translation* against *reference*.

    ``sentence_bleu`` operates on token sequences. Handing it raw strings makes
    it iterate character by character and score character n-grams, so string
    inputs are split on whitespace into word tokens first. Inputs that are
    already token sequences are used as they are.

    Args:
        reference: Reference sentence, as a string or a sequence of tokens.
        translation: Candidate sentence, as a string or a sequence of tokens.
        smoothing_function: Optional NLTK smoothing function forwarded to
            ``sentence_bleu``; ``None`` keeps NLTK's default behaviour.

    Returns:
        The BLEU score returned by ``sentence_bleu``.
    """
    reference_tokens = reference.split() if isinstance(reference, str) else list(reference)
    hypothesis_tokens = translation.split() if isinstance(translation, str) else list(translation)
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing_function,
    )

def calculate_rouge_l(reference, translation):
    """Return the averaged ROUGE-L entry for *translation* scored against *reference*.

    The candidate is passed first and the reference second, as in the call to
    ``Rouge.get_scores``; the ``'rouge-l'`` item of the averaged result is
    returned unchanged.
    """
    averaged_scores = Rouge().get_scores(translation, reference, avg=True)
    return averaged_scores['rouge-l']

# Preprocessing functions
def is_camel_case(s):
    """Return True when *s* mixes upper- and lower-case letters and has no underscore.

    A string that equals its own lower-cased or upper-cased form has only one
    letter case (or no letters at all) and is rejected, as is anything that
    contains ``_``.
    """
    single_case_or_snake = s == s.lower() or s == s.upper() or "_" in s
    return not single_case_or_snake

# Matches an upper-case letter that starts a new word inside an identifier:
# either one preceded by a lower-case letter, or one that is not at the start
# of the string and is followed by a lower-case letter.
_CAMEL_BOUNDARY_RE = re.compile(r'(?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])')


def to_Underline(x):
    """Split a camel-case identifier into lower-case, space-separated words.

    A space is inserted before every word-starting capital and the result is
    lower-cased, e.g. ``"getHTTPResponse"`` becomes ``"get http response"``.

    Args:
        x: The identifier to split.

    Returns:
        The lower-cased string with spaces at the camel-case boundaries.
    """
    # Raw strings are required here: '\g' is not a valid escape sequence in a
    # normal string literal and triggers a warning on current Python versions.
    return _CAMEL_BOUNDARY_RE.sub(r' \g<0>', x).lower()

def get_tokens(text):
    """Tokenize *text* with NLTK and return the tokens joined by single spaces.

    Only the first 1024 tokens are kept; shorter inputs are returned in full.
    """
    # Slicing is a no-op when there are 1024 tokens or fewer.
    kept_tokens = nltk.word_tokenize(text)[:1024]
    return ' '.join(kept_tokens)

def remove_between_identifiers(text, identifier_start, identifier_end):
    """Delete the text between each *identifier_start* and the next *identifier_end*.

    Both delimiters are kept; only what lies between them (on the same line,
    since ``.`` does not match newlines) is removed. Afterwards a few spacing
    patterns are normalized: ``' . '`` -> ``'.'``, double spaces -> one space,
    ``' = '`` -> ``'='`` and ``' ; '`` -> ``';'``.

    Args:
        text: The text to clean.
        identifier_start: Literal string marking where removal begins.
        identifier_end: Literal string marking where removal stops.

    Returns:
        The cleaned text.
    """
    # Escape the delimiters so they are matched literally. Unescaped, a
    # delimiter containing regex metacharacters (e.g. '+', '(', '.') would
    # either match the wrong text or make the look-behind invalid.
    start = re.escape(identifier_start)
    end = re.escape(identifier_end)
    pattern = f'(?<={start}).*?(?={end})'
    result = re.sub(pattern, '', text)
    result = result.replace(' . ', '.').replace('  ', ' ').replace(' = ', '=').replace(' ; ', ';')
    return result

# Seed the results file with an all-zero placeholder record. The file is opened
# in append mode, so existing content is kept and the record is added at the end.
initial_data = {
    "diff_id": 0,
    "msg": "0",
    "msgGPT": "0",
    "METEOR Score": "0",
    "BLEU Score": "0",
    "ROUGE-L Score": "0",
}
with open(output_file, mode='a', encoding='UTF-8') as f:
    f.write(json.dumps(initial_data) + '\n')

# Retry budget per record and generation length, named so they are easy to tune.
MAX_ATTEMPTS = 3
MAX_NEW_TOKENS = 50

# Iterate over dataset and process each item: clean the diff, generate a commit
# message, score it against the reference message, and append one JSON record
# per item to the results file.
for item in json_data:
    # A blank line is not a record; skipping it avoids MAX_ATTEMPTS guaranteed
    # JSON parse failures and their tracebacks.
    if not item.strip():
        continue

    attempts = 0
    while attempts < MAX_ATTEMPTS:
        try:
            data = json.loads(item)
            diff_id = data['diff_id']
            diff = data['diff']
            # Strip whatever follows the 'mmm a' and 'ppp b' markers up to the
            # next '<nl>' (presumably the file-path header lines of the diff --
            # verify against the dataset), then tokenize and truncate the diff.
            result = remove_between_identifiers(diff, 'mmm a', '<nl>')
            diff = get_tokens(remove_between_identifiers(result, 'ppp b', '<nl>'))
            msg = data['msg']

            # Split camel-case words in the reference message into plain words.
            msg = ' '.join(
                to_Underline(word) if is_camel_case(word) else word
                for word in msg.split()
            )

            # Generate commit message using Mistral 7B.
            # return_full_text=False makes the pipeline return only the newly
            # generated text. With the default (prompt + completion) an empty
            # completion would cause the instruction sentence itself to be
            # picked up as the "generated" message below.
            response = nlp_pipeline(
                f"{diff}\nPlease write a commit message that contains only one simple sentence for the above code change.\n",
                max_new_tokens=MAX_NEW_TOKENS,
                return_full_text=False,
            )[0]['generated_text']

            # Keep the last line of the generated text as the message.
            msgGPT = response.strip().split('\n')[-1].strip()
            if not msgGPT:
                # Fail loudly rather than scoring an empty hypothesis.
                raise ValueError("model returned an empty commit message")

            # Calculate metrics
            bleu_score = round(calculate_bleu(msg, msgGPT), 2)
            rouge_l_score = round(calculate_rouge_l(msg, msgGPT)['f'], 2)
            meteor_score = round(calculate_meteor(msg, msgGPT), 2)

            # Merge and save results; scores are stored as strings.
            merged_data = {
                "diff_id": diff_id,
                "msg": msg,
                "msgGPT": msgGPT,
                "METEOR Score": f"{meteor_score}",
                "BLEU Score": f"{bleu_score}",
                "ROUGE-L Score": f"{rouge_l_score}"
            }

            # Append immediately so finished records survive a later crash.
            with open(output_file, 'a', encoding='UTF-8') as f:
                json.dump(merged_data, f)
                f.write('\n')
            # NOTE(review): this pause looks like a leftover from a rate-limited
            # remote API; confirm whether it is still needed for a local model.
            time.sleep(2)
            break

        except Exception:
            # Best-effort batch job: log the failure, retry, and move on to the
            # next record once the retry budget is exhausted.
            traceback.print_exc()
            attempts += 1
            if attempts == MAX_ATTEMPTS:
                print(f"Failed to process item after {MAX_ATTEMPTS} attempts: {item}")