# StarCoder2-7B LoRA Finetuning with Alpaca-Style Python Codebase

In [None]:
# Install or upgrade the libraries this notebook imports (-q: quiet output, -U: upgrade).
# NOTE(review): versions are unpinned, so the APIs used below depend on whatever
# release is current at install time — consider pinning.
!pip install -qU transformers peft bitsandbytes datasets accelerate trl torch

In [None]:
# Authenticate this environment with the Hugging Face Hub.
# NOTE(review): presumably required for gated datasets/models used below — confirm.
!huggingface-cli login

In [None]:
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
import json
from typing import List, Dict

# Report whether a CUDA device is visible and, if so, the name of device 0.
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = 'N/A'
print(f"GPU Name: {gpu_name}")

## Step 1: Load Open-Source Python Datasets

In [None]:
from datasets import load_dataset

print("Loading datasets...")

# Presumably (dataset id, config name) pairs.
# NOTE(review): nothing in the visible cells iterates this list — each loader
# below hard-codes its own dataset id — so it looks informational only; confirm
# before relying on it.
datasets_to_load = [
    ("codeparrot/github-code-clean", "Python"),
    ("heegyu/MATH23K", None),
]

# Accumulates the records ("instruction" / "input" / "output" dicts) appended
# by every loader below.
all_data = []

# Source 1: raw Python files from GitHub, framed as code-completion records.
try:
    print("\n1. Loading GitHub Code (Python)...")
    # streaming=True iterates the split lazily instead of materialising it.
    github_ds = load_dataset("codeparrot/github-code-clean", "Python", split="train", streaming=True)
    # Records contributed by this source. They are tracked separately so the
    # summary below reports what was actually added (the list used to stay
    # empty, so the summary always printed 0).
    github_data = []
    for i, sample in enumerate(github_ds):
        # Inspect at most the first 1000 streamed rows.
        if i >= 1000:
            break
        if sample.get('code'):
            record = {
                "instruction": "Complete the following Python code:",
                "input": "",
                "output": sample['code'][:2000]
            }
            github_data.append(record)
            # Append immediately so rows gathered before a mid-stream failure
            # are still kept.
            all_data.append(record)
    print(f"   Loaded {len(github_data)} samples")
except Exception as e:
    # Best effort: a failing source is reported and the notebook continues
    # with the remaining sources.
    print(f"   Error: {e}")

# Source 2: Q&A rows tagged "python", mapped to instruction / input / output.
try:
    print("\n2. Loading Stack Overflow Q&A Dataset...")
    stackoverflow_ds = load_dataset("HuggingFaceH4/stack-exchange-qa", split="train[:5000]")
    # Count only the rows that pass the tag filter; len(stackoverflow_ds) is
    # the size of the slice, not the number of records added.
    stackoverflow_count = 0
    for sample in stackoverflow_ds:
        # Tolerate a missing/None tags field, and a list of tags as well as a
        # single string, so one odd row does not abort the whole source.
        tags = sample.get('tags') or ''
        if not isinstance(tags, str):
            tags = ' '.join(str(tag) for tag in tags)
        if "python" in tags.lower():
            all_data.append({
                "instruction": sample['title'][:200],
                "input": sample['question'][:1000],
                "output": sample['answer'][:2000]
            })
            stackoverflow_count += 1
    print(f"   Loaded {stackoverflow_count} Q&A samples")
except Exception as e:
    # Best effort: report the failure and continue with the other sources.
    print(f"   Error: {e}")

# Source 3: documented functions, framed as "implement <name>" given the docstring.
try:
    print("\n3. Loading CodeSearchNet (Python)...")
    codesearchnet_ds = load_dataset("code_search_net", "python", split="train[:3000]", trust_remote_code=True)
    codesearchnet_count = 0
    for sample in codesearchnet_ds:
        # NOTE(review): the Hub's code_search_net rows appear to expose
        # `func_code_string` / `whole_func_string` / `func_documentation_string`
        # rather than `code` / `docstring` — confirm against the dataset card.
        # The original keys are still tried first, so behaviour is unchanged
        # wherever they exist.
        code = (
            sample.get('code')
            or sample.get('func_code_string')
            or sample.get('whole_func_string')
        )
        if not code:
            continue
        # `or ''` also covers a key that is present but None.
        docstring = sample.get('docstring') or sample.get('func_documentation_string') or ''
        all_data.append({
            "instruction": f"Implement: {sample.get('func_name', 'function')}",
            "input": docstring[:500],
            "output": code[:2000]
        })
        codesearchnet_count += 1
    # Report the number of records added, consistent with the other sources.
    print(f"   Loaded {codesearchnet_count} CodeSearchNet samples")
except Exception as e:
    # Best effort: report the failure and continue with the other sources.
    print(f"   Error: {e}")

# Source 4: Python files from The Stack, framed as code-completion records.
from itertools import islice

try:
    print("\n4. Loading The Stack Dataset (Python)...")
    # Slice syntax such as "train[:2000]" is not supported together with
    # streaming=True, so request the plain split and cap the scan with islice.
    stack_ds = load_dataset("bigcode/the-stack", "data", split="train", streaming=True, trust_remote_code=True)
    count = 0
    # Scan at most the first 2000 streamed rows (the intent of the old slice).
    for sample in islice(stack_ds, 2000):
        # Accept the extension with or without a leading dot; `or ''` covers
        # a missing or None value.
        ext = (sample.get('ext') or '').lstrip('.')
        if ext == 'py' and sample.get('content'):
            all_data.append({
                "instruction": "Complete the following Python code:",
                "input": "",
                "output": sample['content'][:2000]
            })
            count += 1
            # Keep at most 1000 records from this source.
            if count >= 1000:
                break
    print(f"   Loaded {count} samples")
except Exception as e:
    # Best effort: report the failure and continue with the other sources.
    print(f"   Error: {e}")

# Source 5: HumanEval-Instruct; every row becomes one record, with the prompt
# as the instruction and the canonical solution as the output.
try:
    print("\n5. Loading HumanEval-Instruct...")
    humaneval_ds = load_dataset("TIGER-Lab/HumanEval-Instruct", split="train")
    for sample in humaneval_ds:
        instruction_text = sample.get('prompt', '')[:200]
        solution_text = sample.get('canonical_solution', '')[:2000]
        record = dict(instruction=instruction_text, input="", output=solution_text)
        all_data.append(record)
    # Every row is appended, so the dataset length is the number added.
    print(f"   Loaded {len(humaneval_ds)} samples")
except Exception as e:
    # Best effort: report the failure and continue.
    print(f"   Error: {e}")

# Collect every gathered record into one frame. Naming the columns explicitly
# fixes their order and guarantees they exist even when no loader succeeded.
df = pd.DataFrame(all_data, columns=["instruction", "input", "output"])
if df.empty:
    # Every loader above swallows its own errors, so an empty frame means all
    # of them failed. Stop here with an actionable message instead of the
    # opaque KeyError that drop_duplicates would raise.
    raise RuntimeError("No samples were loaded; check the dataset errors printed above.")
# Rows with an identical output are treated as duplicates; the first is kept.
df = df.drop_duplicates(subset=['output'])
print(f"\n\nTotal unique samples: {len(df)}")
df.head()

## Step 2: Format Data in Alpaca Style

In [None]:
def create_alpaca_prompt(instruction: str, input_text: str, output: str) -> str:
    """Render one training example in the Alpaca prompt layout.

    Args:
        instruction: Task description placed under ``### Instruction:``.
        input_text: Optional context. When truthy, an ``### Input:`` section
            is emitted and the preamble mentions the paired input; when
            falsy, that section is left out entirely.
        output: Target text placed directly under ``### Response:``.

    Returns:
        The preamble and sections joined by blank lines, with no trailing
        newline after ``output``.
    """
    has_context = bool(input_text)

    if has_context:
        preamble = (
            "Below is an instruction that describes a task, paired with an "
            "input that provides further context. Write a response that "
            "appropriately completes the request."
        )
    else:
        preamble = (
            "Below is an instruction that describes a task. Write a response "
            "that appropriately completes the request."
        )

    sections = [preamble, f"### Instruction:\n{instruction}"]
    if has_context:
        sections.append(f"### Input:\n{input_text}")
    sections.append(f"### Response:\n{output}")

    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)

def _row_to_prompt(row):
    """Render one DataFrame row as a full Alpaca-style training text."""
    return create_alpaca_prompt(row['instruction'], row['input'], row['output'])


# One rendered training text per row.
df['text'] = df.apply(_row_to_prompt, axis=1)

# Show the first rendered example, followed by an 80-character rule.
first_prompt = df['text'].iloc[0]
print("Sample formatted prompt:")
print(first_prompt)
print(f"\n{'=' * 80}\n")

## Step 3: Create Dataset

In [None]:
# Only the rendered text is needed for training. preserve_index=False stops
# the pandas index from being carried into the dataset as an extra
# `__index_level_0__` column (the index has gaps after drop_duplicates).
train_dataset = Dataset.from_pandas(df[['text']], preserve_index=False)

# Hold out 10% for evaluation; the fixed seed keeps the split reproducible.
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train samples: {len(split_dataset['train'])}")
print(f"Test samples: {len(split_dataset['test'])}")

## Step 4: Load Model with 4-bit Quantization

In [None]:
model_id = "bigcode/starcoder2-7b"

# Tokenizer: the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and float16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)

footprint_gb = model.get_memory_footprint() / 1e9
print("Model loaded successfully!")
print(f"Model size: {footprint_gb:.2f} GB")

## Step 5: Configure LoRA

In [None]:
from peft import prepare_model_for_kbit_training

# Rank-16 adapters (alpha 32, dropout 0.05) on the attention and MLP
# projection layers; bias terms are left untouched.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# The base model is loaded in 4-bit, so prepare it for k-bit training before
# attaching the adapters, as the PEFT quantization guide recommends. Doing it
# here matters because the model handed to the trainer is already a PeftModel.
model = prepare_model_for_kbit_training(model)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("LoRA configured successfully!")

## Step 6: Setup Training Arguments

In [None]:
# Effective batch size per device is 4 * 2 = 8 sequences per optimizer step.
training_args = TrainingArguments(
    output_dir="./starcoder2-7b-lora-python",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    logging_steps=5,
    # load_best_model_at_end requires the evaluation and save strategies to
    # match. The evaluation strategy defaults to "no", which makes this
    # constructor raise and leaves eval_steps without effect, so both are set
    # explicitly. save_steps (50) is a multiple of eval_steps (25), as required.
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=50,
    eval_steps=25,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Lower eval_loss is better.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    optim="paged_adamw_8bit",
    report_to="none",
    seed=42,
    dataloader_pin_memory=True,
)

print("Training arguments configured!")

## Step 7: Initialize SFT Trainer

In [None]:
# `model` was already wrapped by get_peft_model(...) in the LoRA step, so the
# adapter config is not passed again here: supplying peft_config together with
# an existing PeftModel is redundant at best, and some TRL releases reject it.
# NOTE(review): newer TRL releases appear to expect the tokenizer and the
# sequence-length / text-field / packing options through `processing_class`
# and `SFTConfig` instead — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    args=training_args,
    tokenizer=tokenizer,
    max_seq_length=2048,
    dataset_text_field="text",
    packing=True,
)

print("Trainer initialized!")

## Step 8: Start Training

In [None]:
# Run the fine-tuning loop and report the loss from the returned result.
print("Starting training...")
train_result = trainer.train()
final_loss = train_result.training_loss
print("\nTraining completed!")
print(f"Final loss: {final_loss}")

## Step 9: Save Model

In [None]:
# Write the trained model to its final directory.
final_model_dir = "./starcoder2-7b-lora-python-final"
trainer.save_model(final_model_dir)
print(f"Model saved to {final_model_dir}")

In [None]:

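# Step 10 (optional): uncomment to publish the fine-tuned adapter to the Hugging Face Hub.
# NOTE: the first positional argument of Trainer.push_to_hub is the commit message, not
# the repository id; to choose the target repository, set `hub_model_id` in the training arguments.
# trainer.push_to_hub("hemanthnov2001/starcoder2-7b-lora-python")
# print("Model pushed to HuggingFace Hub")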

## Step 11: Test Fine-tuned Model

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the saved adapter from disk in float16. This rebinds `model`,
# replacing the training-time object.
adapter_dir = "./starcoder2-7b-lora-python-final"
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_dir,
    torch_dtype=torch.float16,
    device_map="auto",
)

def test_model(instruction: str, input_text: str = "") -> str:
    """Generate a response from the fine-tuned model for one Alpaca-style prompt.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        instruction: Task description placed under ``### Instruction:``.
        input_text: Optional context; when non-empty an ``### Input:`` section
            is added to the prompt.

    Returns:
        The newly generated text only (the prompt is not included), with
        surrounding whitespace stripped.
    """
    if input_text:
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
"""
    else:
        prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        # temperature / top_p only take effect when sampling is enabled;
        # without do_sample=True they are ignored and decoding is greedy.
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        # Set the padding id explicitly rather than relying on generate()
        # falling back to EOS on its own.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated after them, instead of splitting the full text on
    # "### Response:", which breaks if the model emits that marker itself.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Smoke test: print one instruction, then the model's answer to it.
test_prompt = "Write a function to find the factorial of a number"
print(f"Instruction: {test_prompt}")
generated_answer = test_model(test_prompt)
print(f"\nResponse:\n{generated_answer}")