In [None]:
!pip install huggingface_hub
!pip install torch transformers accelerate bitsandbytes peft



In [7]:
# 4 bit quantized inference 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_id = "meta-llama/CodeLlama-7b-Instruct-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                  # Load weights in 4-bit precision (very low memory use)
    bnb_4bit_use_double_quant=True,     # Additional optimization to further save memory
    bnb_4bit_quant_type="nf4",          # NormalFloat4: good precision/memory trade-off
    bnb_4bit_compute_dtype="float16",   # Computation done in float16
)

model = AutoModelForCausalLM.from_pretrained(
    "./quantized_model",
    #model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    "./quantized_model",
    #model_id,
)

# Save quantized model and tokenizer to disk
#model.save_pretrained("./quantized_model")
#tokenizer.save_pretrained("./quantized_model")




In [8]:
print(model.config)

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rm

In [9]:
DEFAULT_SYSTEM_PROMPT = """You are a commit risk analysis assistant.
Your job is to analyze a commit diff and determine if it introduces a software bug.

Respond only with 0 (clean) or 1 (risky). Do not include explanations.
"""

SYSTEM_PROMPT = DEFAULT_SYSTEM_PROMPT

B_INST, E_INST = "[INST]", "[/INST]"  # for instruction models
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

print(SYSTEM_PROMPT)

You are a commit risk analysis assistant.
Your job is to analyze a commit diff and determine if it introduces a software bug.

Respond only with 0 (clean) or 1 (risky). Do not include explanations.



In [10]:
pre_prompt = """ # commit info:

Title: Fix API version in pom.xml  Change-Id: Id96d71ccb150c18a15291c01296a8152c6ec3eb0
Diff:
diff --git a/pom.xml b/pom.xml
index d33e954..ddd1e98 100644
--- a/pom.xml
+++ b/pom.xml
@@ -22,7 +22,7 @@ limitations under the License.
   <groupId>com.googlesource.gerrit.plugins</groupId>
   <artifactId>delete-project</artifactId>
   <packaging>jar</packaging>
-  <version>2.12-SNAPSHOT</version>
+  <version>2.12</version>
   <properties>
     <Gerrit-ApiType>plugin</Gerrit-ApiType>
     <Gerrit-ApiVersion>${project.version}</Gerrit-ApiVersion>
# risk value:
"""


In [11]:
prompt = f"{B_INST} {B_SYS}{SYSTEM_PROMPT}{E_SYS}{pre_prompt} {E_INST}"

In [12]:
import torch

runtimeFlag = "cuda"

if not torch.cuda.is_available():
    raise ValueError("CUDA GPU is not available. Ensure GPU and CUDA are properly installed.")

tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer(
    prompt,
    return_tensors="pt",
    add_special_tokens=True,
    padding=True  # ensures attention mask is properly generated
).to(runtimeFlag)

inputs

{'input_ids': tensor([[    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13,  3492,
           526,   263,  9063, 12045,  7418, 20255, 29889,    13, 10858,  4982,
           338,   304, 27599,   263,  9063,  2923,   322,  8161,   565,   372,
          4547,   778,   263,  7047,  6494, 29889,    13,    13,  1666,  2818,
           871,   411, 29871, 29900,   313, 14941, 29897,   470, 29871, 29896,
           313,  3780,  3459,   467,  1938,   451,  3160,  7309,   800, 29889,
            13,    13, 29966,   829, 14816, 29903,  6778,    13,    13,   396,
          9063,  5235, 29901,    13,    13,  7030, 29901, 24778,  3450,  1873,
           297,  8280, 29889,  3134, 29871, 10726, 29899,  1204, 29901,  5163,
         29929, 29953, 29881, 29955, 29896,   617, 29890, 29896, 29945, 29900,
         29883, 29896, 29947, 29874, 29896, 29945, 29906, 29929, 29896, 29883,
         29900, 29896, 29906, 29929, 29953, 29874, 29947, 29896, 29945, 29906,
         29883, 29953,   687, 29941,  

In [13]:
output = model.generate(
    **inputs,
    max_new_tokens=1,            # expecting a single token ("0" or "1")
    do_sample=False,             # deterministic output (no randomness)
    pad_token_id=tokenizer.eos_token_id  # clearly specify pad token
)

# Decode and print the result
result = tokenizer.decode(output[0], skip_special_tokens=True)
response = result.strip()[-1] if result.strip()[-1] in {"0", "1"} else "?"
print("Predicted label:", response)

Predicted label: ?


In [14]:
print("Full model response:", result.strip())


Full model response: [INST] <<SYS>>
You are a commit risk analysis assistant.
Your job is to analyze a commit diff and determine if it introduces a software bug.

Respond only with 0 (clean) or 1 (risky). Do not include explanations.

<</SYS>>

 # commit info:

Title: Fix API version in pom.xml  Change-Id: Id96d71ccb150c18a15291c01296a8152c6ec3eb0
Diff:
diff --git a/pom.xml b/pom.xml
index d33e954..ddd1e98 100644
--- a/pom.xml
+++ b/pom.xml
@@ -22,7 +22,7 @@ limitations under the License.
   <groupId>com.googlesource.gerrit.plugins</groupId>
   <artifactId>delete-project</artifactId>
   <packaging>jar</packaging>
-  <version>2.12-SNAPSHOT</version>
+  <version>2.12</version>
   <properties>
     <Gerrit-ApiType>plugin</Gerrit-ApiType>
     <Gerrit-ApiVersion>${project.version}</Gerrit-ApiVersion>
# risk value:
 [/INST]


In [None]:
#Prepare Dataset 

In [14]:
import re
import pandas as pd

# Load the raw text data from file
with open('G:/defect-prediction-project/datasets/finetune_data/train_openllama_go.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()


In [28]:
import re
import json
from sklearn.model_selection import train_test_split

# Extract data from your raw text
pattern = r'\[DEFECT\]\nTitle:(.*?)\nDiff:\n(.*?)\[/DEFECT\]\n(\d)'
matches = re.findall(pattern, raw_text, re.DOTALL)

data = []
for match in matches:
    commit_info = match[0].strip()
    diff = match[1].strip()
    label = int(match[2].strip())
    data.append({'commit_info': f'Title: {commit_info}', 'diff': diff, 'label': label})

# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Write directly to JSON files (no JSONL)
with open('train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, indent=2)

with open('validation.json', 'w', encoding='utf-8') as f:
    json.dump(val_data, f, indent=2)


In [36]:
%pip install -U datasets

Note: you may need to restart the kernel to use updated packages.


In [15]:
from datasets import load_dataset

dataset = load_dataset('json', data_files={
    'train': 'train.json',
    'validation': 'validation.json'
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['commit_info', 'diff', 'label'],
        num_rows: 14703
    })
    validation: Dataset({
        features: ['commit_info', 'diff', 'label'],
        num_rows: 3676
    })
})


In [31]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

DEFAULT_SYSTEM_PROMPT = """You are a commit risk analysis assistant.
Your job is to analyze a commit diff and determine if it introduces a software bug.

Respond only with 0 (clean) or 1 (risky). Do not include explanations.
"""

SYSTEM_PROMPT = DEFAULT_SYSTEM_PROMPT

def format_prompt(data_point):
    return f"{B_INST} {B_SYS}{SYSTEM_PROMPT}{E_SYS}# commit info:\n{data_point['commit_info']}\n\nDiff:\n{data_point['diff']}\n\n# risk value:\n{data_point['label']} {E_INST}"


In [32]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

In [33]:
def generate_and_tokenize_prompt(data_batch):
    prompts = [
        format_prompt({"commit_info": ci, "diff": d, "label": l})
        for ci, d, l in zip(data_batch["commit_info"], data_batch["diff"], data_batch["label"])
    ]

    tokenized = tokenizer(
        prompts,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
        add_special_tokens=True
    )

    tokenized["labels"] = [ids.copy() for ids in tokenized["input_ids"]]

    return tokenized


In [34]:

tokenized_train_dataset = dataset['train'].map(
    generate_and_tokenize_prompt,
    batched=True,
    remove_columns=dataset["train"].column_names
)

tokenized_val_dataset = dataset['validation'].map(
    generate_and_tokenize_prompt,
    batched=True,
    remove_columns=dataset["validation"].column_names
)


Map:   0%|          | 0/14703 [00:00<?, ? examples/s]

Map:   0%|          | 0/3676 [00:00<?, ? examples/s]

In [None]:
print(tokenized_train_dataset[0])
print(tokenized_val_dataset[0])


{'input_ids': [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 9063, 12045, 7418, 20255, 29889, 13, 10858, 4982, 338, 304, 27599, 263, 9063, 2923, 322, 8161, 565, 372, 4547, 778, 263, 7047, 6494, 29889, 13, 13, 1666, 2818, 871, 411, 29871, 29900, 313, 14941, 29897, 470, 29871, 29896, 313, 3780, 3459, 467, 1938, 451, 3160, 7309, 800, 29889, 13, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 29937, 9063, 5235, 29901, 13, 7030, 29901, 599, 29901, 2329, 4274, 7911, 1066, 29871, 263, 1599, 385, 29871, 10726, 29899, 1204, 29901, 306, 29929, 29945, 29874, 29929, 29946, 29900, 2176, 29953, 29946, 10702, 29947, 29906, 29945, 29947, 29947, 29955, 29890, 29955, 29945, 29874, 29947, 29900, 1479, 29883, 29947, 29906, 29906, 29900, 29929, 29945, 29890, 29946, 29929, 29955, 29947, 29896, 13957, 287, 29899, 265, 29901, 2045, 597, 1484, 29899, 27828, 29889, 1484, 468, 793, 1167, 29889, 510, 29914, 29953, 29941, 29929, 29929, 29896, 7525, 29899, 15870, 29933, 327, 29901, 4827, 478, 351,

In [38]:
#Setup QLoRA
output_dir = "G:/pie-perf"

In [None]:
from peft import LoraConfig, get_peft_model, TaskType


from peft import (
    LoraConfig,
    get_peft_model,
    # prepare_model_for_int8_training,
    prepare_model_for_kbit_training
)

model.train() # put model back into training mode
# model = prepare_model_for_int8_training(model)  # get the model ready for int8 quantization
model = prepare_model_for_kbit_training(model)  # get the model ready for int8 quantization

lora_config = LoraConfig(
    r=16,                    # Rank (controls low-rank decomposition size)
    lora_alpha=32,           # Scaling factor (commonly alpha=2*r)
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # targeted modules for LoRA
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Apply QLoRA to your model explicitly
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 16,777,216 || all params: 6,755,323,904 || trainable%: 0.2484


In [None]:
import os
from peft import set_peft_model_state_dict

# set this to the adapter_model.bin file you want to resume from (if any)
resume_from_checkpoint = "./qlora_commit_model/checkpoint-220/adapter_model.bin"

if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        adapters_weights = torch.load(resume_from_checkpoint)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")

Checkpoint G:/pie-perf/checkpoint-220/adapter_model.bin not found


In [46]:
from datetime import datetime

from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

batch_size = 128
per_device_train_batch_size = 8
gradient_accumulation_steps = batch_size // per_device_train_batch_size

training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        max_steps=400,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        eval_strategy="steps", # if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=20,
        save_steps=20,
        output_dir="./qlora_commit_model",
        # save_total_limit=3,
        load_best_model_at_end=False,
        # ddp_find_unused_parameters=False if ddp else None,
        group_by_length=True, # group sequences of roughly the same length together to speed up training
        report_to="none",
        run_name=None
    )

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ))

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [48]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
20,1.7496,1.685997


: 

In [None]:
# Save the model
model.save_pretrained(output_dir)