# 🤖 Code Review Fine-Tuning Notebook
**Train StarCoder or similar models using CodeXGLUE, Devign, and your own data**

---
- ✅ Dataset merging
- ✅ JSONL formatting
- ✅ Optional Hugging Face upload
- ✅ Fine-tuning setup

_Generated on: 2025-06-26 08:07_

In [None]:
# 📦 Install dependencies
!pip install datasets transformers accelerate tqdm

In [None]:
# 📥 Load and Combine Datasets
from datasets import load_dataset
import json
from tqdm import tqdm

# Accumulates every {"prompt", "completion"} record from all sources below.
output_data = []

# Load CodeXGLUE code-to-text (Python split): each function's source becomes
# the prompt and its stripped docstring becomes the completion.
# The code-to-text task lives under the "ct" (code-text) prefix on the Hub,
# not the "cc" (code-code) prefix.
codeglue = load_dataset("google/code_x_glue_ct_code_to_text", "python")
# Only the first 1000 training examples are used to keep the dataset small.
for item in tqdm(codeglue['train'].select(range(1000))):
    prompt = f"### Code:\n{item['code']}\n### Review:"
    # Leading space separates the completion from the "### Review:" marker.
    completion = " " + item['docstring'].strip()
    output_data.append({"prompt": prompt, "completion": completion})

# Load Devign (vulnerability detection). On the Hub it is published as the
# CodeXGLUE defect-detection task, which provides the 'func' and 'target'
# fields read below.
devign = load_dataset("google/code_x_glue_cc_defect_detection")
# Only the first 1000 training examples are used to keep the dataset small.
for item in tqdm(devign['train'].select(range(1000))):
    # A truthy target marks the function as vulnerable.
    label = "Vulnerable" if item['target'] else "Safe"
    prompt = f"### Code:\n{item['func']}\n### Is this vulnerable?"
    completion = f" {label}"
    output_data.append({"prompt": prompt, "completion": completion})

# Add personal data: hand-written code/review pairs, formatted with the same
# "### Code: / ### Review:" template as the CodeXGLUE records.
my_data = [
    {"code": "def hello(): print('hi')", "review": "Add a docstring."}
]
output_data.extend(
    {
        "prompt": f"### Code:\n{sample['code']}\n### Review:",
        "completion": " " + sample['review'],
    }
    for sample in my_data
)

# Save JSONL: one JSON-encoded record per line.
with open("fine_tune_data.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in output_data)

print("✅ Dataset ready")

In [None]:
# 🚀 Fine-tune a small StarCoder model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset

# NOTE(review): this checkpoint appears to be gated on the Hub — accept its
# license and authenticate first if loading fails; confirm.
model_name = "bigcode/starcoderbase-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The language-modeling collator pads each batch to a common length, which
# requires a pad token. If the tokenizer does not define one, reuse EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Load the prompt/completion records written earlier as a single "train" split.
dataset = load_dataset("json", data_files="fine_tune_data.jsonl", split="train")

def tokenize(example):
    """Tokenize one record's prompt and completion as a single sequence.

    The two strings are concatenated and passed to the module-level
    ``tokenizer`` with truncation enabled at 512 tokens.
    """
    full_text = example["prompt"] + example["completion"]
    return tokenizer(full_text, truncation=True, max_length=512)

import torch

# Tokenize every record in the dataset.
tokenized = dataset.map(tokenize)

args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=50,
    # Keep only the newest checkpoints so saving every 50 steps cannot fill
    # the disk.
    save_total_limit=2,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # when none is available instead of failing at startup.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    # mlm=False selects causal (next-token) language modeling.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()