In [1]:
%%capture
%pip install -U transformers accelerate

In [2]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch


base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [5]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [6]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [7]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)

In [8]:
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msiddharthyadav555[0m ([33msiddharthyadav555-personal[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [10]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-BugLocalizer-New"
dataset_name = "/kaggle/input/sample20/Train Data.xlsx"

In [11]:
import torch
print(torch.cuda.is_available())

True


In [12]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [13]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the dataset from the Excel file
df = pd.read_excel('/kaggle/input/sample20/Train Data.xlsx')

# Clean the dataframe by removing any rows with NaN values and converting to strings
df = df.dropna()
df['Buggy Code'] = df['Buggy Code'].astype(str)
df['Buggy Line'] = df['Buggy Line'].astype(str)

# Convert directly to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Define the system instruction
instruction = "You are an Analyst who always extracts one buggy line in the provided Python code."

# Define the function to format each snippet into the chat template
def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f"Current Buggy Code:\n{row['Buggy Code']}"},
        {"role": "assistant", "content": f"The buggy line:\n{row['Buggy Line']}"}
    ]
    # Apply the chat template using the tokenizer
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Apply the formatting function to the dataset
dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/260 [00:00<?, ? examples/s]

In [15]:
print(f"Total number of snippets loaded: {len(dataset)}")
# Print the first 5 entries
for idx in range(5):
    row = dataset[idx]
    print(f"Snippet {idx + 1}:")
    print("Buggy Code:")
    print(row['Buggy Code'])
    print("Buggy Line:")
    print(row['Buggy Line'])
    print('-' * 80)

Total number of snippets loaded: 260
Snippet 1:
Buggy Code:
def lis(arr):
    ends = {}
    longest = 0
    for i, val enumerate(arr):
        prefix_lengths = [j for j in range(1, longest + 1) if arr[ends[j]] < val]
        length = max(prefix_lengths) if prefix_lengths else 0
        if length == longest or val < arr[ends[length + 1]]:
            ends[length + 1] = i
            longest = max(longest, length + 1)
    return longest
Buggy Line:



    for i, val enumerate(arr):






--------------------------------------------------------------------------------
Snippet 2:
Buggy Code:
def longest_common_subsequence(a, b):
    if not a or not b:
        return ''
    elif a[0] == b[0]:
        return b[0] + longest_common_subsequence(a[1:], b[1:])
    else:
        return max(
            longest_common_subsequence(a, b[1:]),
            longest_common_subsequence(a[1:], b),
            key=len
        )
Buggy Line:




        return b[0] + longest_common_subsequence(a[1:], b[1:])



In [16]:
dataset['text'][3]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an Analyst who always extracts one buggy line in the provided Python code.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCurrent Buggy Code:\nfrom collections import deque as Queue\ndef breadth_first_search(startnode, goalnode):\n    queue = Queue()\n    queue.append(startnode)\n    nodesseen = set()\n    nodesseen.add(startnode)\n    while queue:\n        node = queue.popleft()\n        if node is goalnode:\n            return True\n        else:\n            queue.extend(node for node in node.successors if node not in nodesseen)\n            nodesseen = node.successors\n    return False<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe buggy line:\n\n\n\n\n\n\n\n\n\n\n\n\n            nodesseen = node.successors<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [17]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [18]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

ValueError: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None

In [21]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [22]:
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

In [23]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    max_seq_length= 1024,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

In [24]:
print(dataset)

Dataset({
    features: ['Buggy Code', 'Buggy Line', 'Correct Code', 'text'],
    num_rows: 260
})


In [27]:
trainer.train()

Step,Training Loss,Validation Loss
24,0.6087,0.552115
48,0.4495,0.371029
72,0.1506,0.225271
96,0.2497,0.200727


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=117, training_loss=0.47542451092830074, metrics={'train_runtime': 302.0638, 'train_samples_per_second': 0.775, 'train_steps_per_second': 0.387, 'total_flos': 593136495925248.0, 'train_loss': 0.47542451092830074, 'epoch': 1.0})

In [28]:
wandb.finish()

0,1
eval/loss,█▄▁▁
eval/runtime,▁▅▇█
eval/samples_per_second,█▄▂▁
eval/steps_per_second,█▄▂▁
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▅▆▄▄▆▄▄▃▅▇█▇▇▃▃▃▂▂▃▅▂▃▇▃▃▄▂▂▇▃▂▄▁▂▂▃▂▄▂▁
train/learning_rate,▄▅▅▆▇███▇▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁
train/loss,███▆▅▃▃▂▃▃▃▃▃▃▂▂▂▂▂▃▃▃▁▂▃▂▁▁▂▁▂▂▁▁▁▁▁▁▁▁

0,1
eval/loss,0.20073
eval/runtime,15.2105
eval/samples_per_second,1.709
eval/steps_per_second,1.709
total_flos,593136495925248.0
train/epoch,1.0
train/global_step,117.0
train/grad_norm,0.63781
train/learning_rate,0.0
train/loss,0.1511


In [29]:
print("Model Configuration:")
print(model.config)

# If using PEFT, check the PEFT configuration
if hasattr(model, 'peft_config'):
    print("PEFT Configuration:")
    print(model.peft_config)

Model Configuration:
LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/kaggle/input/llama-3.2/transformers/3b-instruct/1",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pad_token_id": [
    128001,
    128008,
    128009
  ],
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "l

In [30]:
test_buggy_code = """
"def bitcount(n):
count = 0
while n:
n &= n - 1
count += 1
return count * 2"
""" 
print(instruction)
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": f"Current Buggy Code:\n{test_buggy_code}"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=1024, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are an Analyst who always extracts one buggy line in the provided Python code.


The buggy line:


    count += 1


In [31]:
import pandas as pd
import torch

# Load the Excel file
file_path = '/kaggle/input/sample21/CSV to XLSX Test Data.xlsx'
data = pd.read_excel(file_path)

# Extract the 'Buggy Code' column and drop any NaN values
buggy_code_snippets = data["Buggy Code"].dropna().tolist()

def query_model_for_snippet(snippet):
    messages = [
        {"role": "system", "content": "You are an Analyst who always extracts one buggy line in the provided Python code."},
        {"role": "user", "content": f"Please analyze the following code and locate the buggy line. Only output the buggy line and nothing else under the text 'Buggy Line:' on a new line:\n\n```python\n{snippet}\n```"}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
    
    # Generate output using the fine-tuned model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=120,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode the generated output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Original results processing section with minimal modifications
results = []
for i, snippet in enumerate(buggy_code_snippets, start=1):
    print(f"Processing Snippet {i}...")
    # Query the model for the response
    response = query_model_for_snippet(snippet)
    
    # Debugging: Print the raw model response
    print(f"Raw Model Response for Snippet {i}:\n{response}\n{'-' * 50}")
    
    # Extract the buggy line from the response
    buggy_lines = []
    lines = response.split("\n")  # Split response into individual lines
    capture = False
    for line in lines:
        stripped_line = line.strip()
        # Check if we have found the "Buggy Line:" marker
        if stripped_line.startswith("Buggy Line:"):
            # Check if there's more content after "Buggy Line:"
            parts = stripped_line.split("Buggy Line:")
            # parts[0] should be empty, parts[1] might contain the buggy line
            if len(parts) > 1 and parts[1].strip():
                # Buggy line is on the same line
                buggy_lines.append(parts[1].strip())
                break
            else:
                # Buggy line should be in subsequent lines
                capture = True
                continue

        if capture:  # Capture the line(s) after "Buggy Line:"
            if line.startswith("```"):  # Skip opening/closing backticks
                continue
            if line.endswith("```"):  # Stop capturing at closing backticks
                buggy_lines.append(line.replace("```", "").strip())
                break
            buggy_lines.append(stripped_line)

    # Combine buggy lines if there are multiple
    filtered_response = "\n".join(buggy_lines)
    
    # If we found a buggy line, append it to the results
    if filtered_response:
        results.append({"Snippet": snippet, "Buggy Line": filtered_response})
        print(f"Filtered Buggy Line(s) for Snippet {i}:\n{filtered_response}\n{'-' * 50}")
    else:
        print(f"No buggy line found for Snippet {i}\n{'-' * 50}")

# Save the results to an Excel file
results_df = pd.DataFrame(results)
output_path = "/kaggle/working/Fine_Tuned_Responses_new.xlsx"
results_df.to_excel(output_path, index=False, sheet_name="Buggy Lines")
print(f"Filtered buggy lines saved to: {output_path}")

Processing Snippet 1...
Raw Model Response for Snippet 1:
system

You are an Analyst who always extracts one buggy line in the provided Python code.user

Please analyze the following code and locate the buggy line. Only output the buggy line and nothing else under the text 'Buggy Line:' on a new line:

```python
def pascal(n):
    rows = [[1]]
    for r in range(1, n):
        row = []
        for c in range(0, r + 1):
            upleft = rows[r - 1][c - 1] if c > 0 else 0
            upright = rows[r - 1][c - 1] if c < r else 0
            row.append(upleft + upright)
        rows.append(row)
    return rows
```assistant

Buggy Line:
            upleft = rows[r - 1][c - 1] if c > 0 else 0
--------------------------------------------------
Filtered Buggy Line(s) for Snippet 1:
upleft = rows[r - 1][c - 1] if c > 0 else 0
--------------------------------------------------
Processing Snippet 2...
Raw Model Response for Snippet 2:
system

You are an Analyst who always extracts one buggy l

In [None]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)