In [3]:
%%capture
%pip install -U transformers accelerate

In [4]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch


base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [7]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [8]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [10]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)

In [11]:
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msiddharthyadav555[0m ([33msiddharthyadav555-personal[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-BugFixer-New"
dataset_name = "/kaggle/input/sample22/Train Data.xlsx"

In [13]:
import torch
print(torch.cuda.is_available())

True


In [14]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [15]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the tokenizer

# Load the dataset from the Excel file
df = pd.read_excel('/kaggle/input/sample22/Train Data.xlsx')

# Clean and process the dataframe
# Remove rows with NaN values in critical columns
df = df.dropna(subset=['Buggy Code', 'Buggy Line', 'Correct Code'])

# Convert columns to strings
df['Code Snippet'] = df['Buggy Code'].astype(str)
df['Buggy Lines'] = df['Buggy Line'].astype(str)
df['Correct Code'] = df['Correct Code'].astype(str)

# Convert the dataframe to a Hugging Face Dataset
dataset = Dataset.from_pandas(df[['Buggy Code', 'Buggy Line', 'Correct Code']])

# Define the system instruction
instruction = """You are an Analyst who identifies the buggy line in Python code and provides the corrected version.
Example:
Buggy code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i === 0:
            return False
    return True

Bug location: if num % i === 0:
Fixed code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i == 0:
            return False
    return True
"""

# Define the function to format each snippet into the chat template
def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f"Buggy Code:\n{row['Buggy Code']}\n\nBuggy Line:\n{row['Buggy Line']}"},
        {"role": "assistant", "content": f"Corrected Code:\n{row['Correct Code']}"}
    ]
    # Apply the chat template using the tokenizer
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Apply the formatting function to the dataset
dataset = dataset.map(
    format_chat_template,
    num_proc=4  # Process in parallel
)

# Print the first 5 prompts to check
print("Sample Prompts:")
for i in range(min(5, len(dataset))):
    print(f"\nPrompt {i+1}:")
    print(dataset[i]['text'])

# Optional: Print total number of snippets
print(f"\nTotal number of snippets: {len(dataset)}")

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/260 [00:00<?, ? examples/s]

  self.pid = os.fork()


Sample Prompts:

Prompt 1:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an Analyst who identifies the buggy line in Python code and provides the corrected version.
Example:
Buggy code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i === 0:
            return False
    return True

Bug location: if num % i === 0:
Fixed code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i == 0:
            return False
    return True<|eot_id|><|start_header_id|>user<|end_header_id|>

Buggy Code:
def lis(arr):
    ends = {}
    longest = 0
    for i, val enumerate(arr):
        prefix_lengths = [j for j in range(1, longest + 1) if arr[ends[j]] < val]
        length = max(prefix_lengths) if prefix_lengths else 0
        if length == longest or val < arr[ends[length + 1]]:
            ends[length + 1] = i
            longest = max(longest, 

In [17]:
print(f"Total number of snippets loaded: {len(dataset)}")
# Print the first 5 entries
for idx in range(5):
    row = dataset[idx]
    print(f"Snippet {idx + 1}:")
    print("Buggy Code:")
    print(row['Buggy Code'])
    print("Buggy Line:")
    print(row['Buggy Line'])
    print("Correct Code:")
    print(row['Correct Code'])
    print('-' * 80)

Total number of snippets loaded: 260
Snippet 1:
Buggy Code:
def lis(arr):
    ends = {}
    longest = 0
    for i, val enumerate(arr):
        prefix_lengths = [j for j in range(1, longest + 1) if arr[ends[j]] < val]
        length = max(prefix_lengths) if prefix_lengths else 0
        if length == longest or val < arr[ends[length + 1]]:
            ends[length + 1] = i
            longest = max(longest, length + 1)
    return longest
Buggy Line:



    for i, val enumerate(arr):






Correct Code:
def lis(arr):
    ends = {}
    longest = 0
    for i, val in enumerate(arr):
        prefix_lengths = [j for j in range(1, longest + 1) if arr[ends[j]] < val]
        length = max(prefix_lengths) if prefix_lengths else 0
        if length == longest or val < arr[ends[length + 1]]:
            ends[length + 1] = i
            longest = max(longest, length + 1)
    return longest
--------------------------------------------------------------------------------
Snippet 2:
Buggy Code:
def longe

In [None]:
dataset['text'][3]

In [18]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [19]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

ValueError: Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None

In [20]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [22]:
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

In [23]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    max_seq_length= 1024,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

In [24]:
print(dataset)

Dataset({
    features: ['Buggy Code', 'Buggy Line', 'Correct Code', 'text'],
    num_rows: 260
})


In [27]:
trainer.train()

Step,Training Loss,Validation Loss
24,0.2461,0.242616
48,0.1716,0.165653
72,0.1124,0.115155
96,0.1098,0.094216


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=117, training_loss=0.223919834718745, metrics={'train_runtime': 660.8904, 'train_samples_per_second': 0.354, 'train_steps_per_second': 0.177, 'total_flos': 1494602984196096.0, 'train_loss': 0.223919834718745, 'epoch': 1.0})

In [28]:
wandb.finish()

0,1
eval/loss,█▄▂▁
eval/runtime,█▃▂▁
eval/samples_per_second,▁▆▇█
eval/steps_per_second,▁▆▇█
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇████
train/grad_norm,▆▇▆█▄▅▆▅▂▂▂▂▂▂▂▂▂▅▃▃▁▂▃▁▃▅▆▃▂▂▂▂▃▁▂▃▂▁▂▂
train/learning_rate,▂▃▄██████▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▁▁▁▁
train/loss,█▇▇▇▆▄▅▃▂▃▂▂▂▂▄▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.09422
eval/runtime,30.8629
eval/samples_per_second,0.842
eval/steps_per_second,0.842
total_flos,1494602984196096.0
train/epoch,1.0
train/global_step,117.0
train/grad_norm,0.28758
train/learning_rate,0.0
train/loss,0.0786


In [None]:
print("Model Configuration:")
print(model.config)

# If using PEFT, check the PEFT configuration
if hasattr(model, 'peft_config'):
    print("PEFT Configuration:")
    print(model.peft_config)

In [29]:
test_buggy_code = """
"def bitcount(n):
count = 0
while n:
n &= n - 1
count += 1
return count * 2"
""" 
print(instruction)
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": f"Current Buggy Code:\n{test_buggy_code}"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=1024, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are an Analyst who identifies the buggy line in Python code and provides the corrected version.
Example:
Buggy code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i === 0:
            return False
    return True

Bug location: if num % i === 0:
Fixed code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i == 0:
            return False
    return True



Corrected Code:
def bitcount(n):
    count = 0
    while n:
        n &= n - 1
        count += 1
    return count

Buggy Line:







    return count * 2


In [30]:
import pandas as pd
import torch
import os

# Load the Excel file
file_path = '/kaggle/input/sample23/CSV to XLSX Test Data.xlsx'
data = pd.read_excel(file_path)

def extract_corrected_code(response):
    lines = response.split('\n')
    start_idx = -1
    
    # Find the last occurrence of correction markers
    for i, line in enumerate(lines):
        if any(marker.lower() in line.lower() for marker in ["corrected code:", "fixed code:", "correction:"]):
            start_idx = i
    
    if start_idx != -1:
        corrected_lines = []
        for line in lines[start_idx + 1:]:
            # Stop at system/user/assistant markers or empty lines at the end
            if any(marker.lower() in line.lower() for marker in ["system", "user", "assistant"]) or (not line.strip() and not corrected_lines):
                continue
            if not line.strip() and corrected_lines:
                break
            if line.strip():
                corrected_lines.append(line)  # Keep original indentation
        
        # Remove any trailing empty lines
        while corrected_lines and not corrected_lines[-1].strip():
            corrected_lines.pop()
            
        return '\n'.join(corrected_lines)
    return ""

# Rest of the code remains the same
def query_model_for_correction(buggy_code, buggy_line):
   messages = [
       {"role": "system", "content": """You are a code fixing assistant. Your task is to identify and correct bugs in Python code.

Example:
Buggy code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i === 0:
            return False
    return True

Bug location: if num % i === 0:
Fixed code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i === 0:
            return False
    return True

Your task is to carefully analyze the given buggy code, identify the specific problematic line, and generate the correct, optimized version of the code."""},
       {"role": "user", "content": f"Buggy Code:\n{buggy_code}\n\nBuggy Line:\n{buggy_line}\n\nPlease provide the corrected version of this code."}
   ]
   
   prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
   inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
   
   with torch.no_grad():
       outputs = model.generate(
           **inputs,
           max_new_tokens=1024,
           do_sample=True,
           temperature=0.7,
           eos_token_id=tokenizer.eos_token_id,
           pad_token_id=tokenizer.pad_token_id
       )
   
   return tokenizer.decode(outputs[0], skip_special_tokens=True)

results = []

for idx, row in data.iterrows():
   print(f"Processing Snippet {idx + 1}...")
   
   buggy_code = row['Buggy Code']
   buggy_line = row['Buggy Line']
   
   if pd.isna(buggy_code) or pd.isna(buggy_line):
       continue
   
   response = query_model_for_correction(buggy_code, buggy_line)
   print(f"Raw Model Response for Snippet {idx + 1}:\n{response}\n{'-' * 50}")
   
   try:
       corrected_code = extract_corrected_code(response)
       if corrected_code:
           results.append({
               'Buggy Code': buggy_code,
               'Buggy Line': buggy_line,
               'Model Generated Correction': corrected_code,
               'Original Correct Code': row.get('Correct Code', '')
           })
           print(f"Corrected Code for Snippet {idx + 1}:\n{corrected_code}\n{'-' * 50}")
       else:
           print(f"No correction found for snippet {idx + 1}")
   except Exception as e:
       print(f"Error processing snippet {idx + 1}: {str(e)}")
       results.append({
           'Buggy Code': buggy_code,
           'Buggy Line': buggy_line,
           'Model Generated Correction': 'Error in generation',
           'Original Correct Code': row.get('Correct Code', '')
       })

results_df = pd.DataFrame(results)
output_path = "/kaggle/working/Model_Generated_Corrections_New2.xlsx"
results_df.to_excel(output_path, index=False, sheet_name="Corrections")
print(f"Model generated corrections saved to: {output_path}")

Processing Snippet 1...
Raw Model Response for Snippet 1:
system

You are a code fixing assistant. Your task is to identify and correct bugs in Python code.

Example:
Buggy code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i === 0:
            return False
    return True

Bug location: if num % i === 0:
Fixed code:
def is_prime(num):
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i === 0:
            return False
    return True

Your task is to carefully analyze the given buggy code, identify the specific problematic line, and generate the correct, optimized version of the code.user

Buggy Code:
def pascal(n):
    rows = [[1]]
    for r in range(1, n):
        row = []
        for c in range(0, r + 1):
            upleft = rows[r - 1][c - 1] if c > 0 else 0
            upright = rows[r - 1][c - 1] if c < r else 0
            row.append(upleft + upright)
      

In [31]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3.2-3b-BugFixer-New"
dataset_name = "/kaggle/input/sample22/Train Data.xlsx"

In [32]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Siddharth-Yadav/llama-3.2-3b-BugFixer-New/commit/e2242def3b77b66f44b2f338325f5a0dd5cb08f1', commit_message='Upload model', commit_description='', oid='e2242def3b77b66f44b2f338325f5a0dd5cb08f1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Siddharth-Yadav/llama-3.2-3b-BugFixer-New', endpoint='https://huggingface.co', repo_type='model', repo_id='Siddharth-Yadav/llama-3.2-3b-BugFixer-New'), pr_revision=None, pr_num=None)