In [1]:
# Install dependencies
print("Installing dependencies...")
!pip install -q transformers datasets peft accelerate bitsandbytes trl

Installing dependencies...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Imports
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


In [3]:
# Verify GPU is available
print("\n" + "="*60)
print("GPU Check:")
print("="*60)
if torch.cuda.is_available():
    print(f"✓ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"✓ GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("✗ No GPU found! Go to Runtime → Change runtime type → T4 GPU")
    raise SystemExit


GPU Check:
✓ GPU available: Tesla T4
✓ GPU memory: 15.83 GB


In [6]:
# ============================================================
# CONFIGURATION
# ============================================================
MODEL_NAME = "/content/drive/MyDrive/starcoder2-3b"
OUTPUT_DIR = "./starcoder-finetuned"
DATASET_PATH = "/content/drive/MyDrive/code_dataset.json"  # Upload this file to Colab

# Optional: Mount Google Drive to load dataset from there
# from google.colab import drive
# drive.mount('/content/drive')
# DATASET_PATH = "/content/drive/MyDrive/code_dataset.json"

In [7]:
# ============================================================
# LOAD MODEL & TOKENIZER
# ============================================================
print("\n" + "="*60)
print("Loading tokenizer...")
print("="*60)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

print("\nLoading model with 4-bit quantization...")
# 4-bit quantization config for T4 GPU (16GB VRAM)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare model for LoRA training
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False  # Required for gradient checkpointing



Loading tokenizer...

Loading model with 4-bit quantization...


In [8]:
# ============================================================
# CONFIGURE LORA
# ============================================================
print("\nApplying LoRA configuration...")
lora_config = LoraConfig(
    r=16,                          # LoRA rank
    lora_alpha=32,                 # LoRA alpha (scaling)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

print("\n" + "="*60)
print("Model Configuration:")
print("="*60)
model.print_trainable_parameters()


Applying LoRA configuration...

Model Configuration:
trainable params: 9,093,120 || all params: 3,039,464,448 || trainable%: 0.2992


In [9]:
# ============================================================
# LOAD & PREPROCESS DATASET
# ============================================================
print("\n" + "="*60)
print("Loading dataset...")
print("="*60)

# Option 1: Upload file manually
# from google.colab import files
# print("Please upload code_dataset.json:")
# uploaded = files.upload()
# DATASET_PATH = list(uploaded.keys())[0]

try:
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
    print(f"✓ Dataset loaded: {len(dataset)} examples")
except FileNotFoundError:
    print(f"✗ Error: {DATASET_PATH} not found!")
    print("Please upload your code_dataset.json file.")
    from google.colab import files
    print("\nUploading file...")
    uploaded = files.upload()
    DATASET_PATH = list(uploaded.keys())[0]
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
    print(f"✓ Dataset loaded: {len(dataset)} examples")

# Preview dataset
print("\nDataset structure:")
print(dataset)
print("\nFirst example:")
print(dataset[0])

def preprocess_function(examples):
    """Tokenize the dataset"""
    # Handle different dataset formats
    if "text" in examples:
        texts = examples["text"]
    elif "instruction" in examples and "output" in examples:
        texts = [f"### Instruction:\n{inst}\n\n### Response:\n{out}"
                for inst, out in zip(examples["instruction"], examples["output"])]
    else:
        raise ValueError("Dataset must have 'text' or 'instruction'/'output' fields")

    # Tokenize with padding and truncation
    result = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors=None
    )

    # Add labels for causal language modeling
    result["labels"] = result["input_ids"].copy()

    return result

print("\nTokenizing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing"
)

print(f"✓ Tokenization complete")



Loading dataset...


Generating train split: 0 examples [00:00, ? examples/s]

✓ Dataset loaded: 42 examples

Dataset structure:
Dataset({
    features: ['text', 'file_path', 'file_name', 'language'],
    num_rows: 42
})

First example:
{'text': '// ARKSurvivalEvolved (332.8) SDK\n\n#ifdef _MSC_VER\n\t#pragma pack(push, 0x8)\n#endif\n\n#include "ARKSurvivalEvolved_Buff_PreventDismount_parameters.hpp"\n\nnamespace sdk\n{\n//---\n//Functions\n//---\n\n// Function Buff_PreventDismount.Buff_PreventDismount_C.UserConstructionScript\n// ()\n\nvoid ABuff_PreventDismount_C::UserConstructionScript()\n{\n\tstatic auto fn = UObject::FindObject<UFunction>("Function Buff_PreventDismount.Buff_PreventDismount_C.UserConstructionScript");\n\n\tABuff_PreventDismount_C_UserConstructionScript_Params params;\n\n\tauto flags = fn->FunctionFlags;\n\n\tUObject::ProcessEvent(fn, &params);\n\n\tfn->FunctionFlags = flags;\n}\n\n\n// Function Buff_PreventDismount.Buff_PreventDismount_C.ExecuteUbergraph_Buff_PreventDismount\n// ()\n// Parameters:\n// int                            EntryPoint

Tokenizing:   0%|          | 0/42 [00:00<?, ? examples/s]

✓ Tokenization complete


In [10]:
# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM, not masked LM
)

In [11]:
# ============================================================
# TRAINING CONFIGURATION
# ============================================================
print("\n" + "="*60)
print("Setting up training arguments...")
print("="*60)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,      # Batch size per GPU
    gradient_accumulation_steps=4,       # Effective batch size = 4 * 4 = 16
    num_train_epochs=3,                  # Number of epochs
    learning_rate=2e-4,                  # Learning rate
    fp16=True,                           # Mixed precision training
    logging_steps=5,                     # Log every 5 steps
    save_strategy="epoch",               # Save after each epoch
    save_total_limit=2,                  # Keep only 2 checkpoints
    optim="paged_adamw_8bit",           # 8-bit optimizer
    warmup_steps=50,                     # Warmup steps
    lr_scheduler_type="cosine",          # Learning rate scheduler
    gradient_checkpointing=True,         # Save memory
    report_to="none",                    # Disable wandb/tensorboard
    push_to_hub=False,
)




Setting up training arguments...


In [12]:
# ============================================================
# INITIALIZE TRAINER
# ============================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [13]:
# ============================================================
# TRAIN MODEL
# ============================================================
print("\n" + "="*60)
print("Starting training...")
print("="*60)
print(f"Total steps: {len(tokenized_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")
print(f"This should take ~10-15 minutes on T4 GPU\n")

# Train
trainer.train()

print("\n" + "="*60)
print("Training complete!")
print("="*60)


Starting training...
Total steps: 6
This should take ~10-15 minutes on T4 GPU



Step,Training Loss
5,2.4623



Training complete!


In [14]:
# ============================================================
# SAVE MODEL
# ============================================================
print("\nSaving model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✓ Model saved to {OUTPUT_DIR}")


Saving model...
✓ Model saved to ./starcoder-finetuned


In [15]:
# ============================================================
# TEST THE MODEL
# ============================================================
print("\n" + "="*60)
print("Testing the fine-tuned model...")
print("="*60)

# Load the model for inference
from peft import PeftModel

print("\nLoading fine-tuned model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
finetuned_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)

# Test generation
test_prompts = [
    "def fibonacci(n):",
    "function calculateSum(arr) {",
    "class DataProcessor:",
]

print("\nGenerating code samples:")
print("="*60)

for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = finetuned_model.generate(
        **inputs,
        max_length=150,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(generated_code)
    print("-"*60)



Testing the fine-tuned model...

Loading fine-tuned model...

Generating code samples:

Prompt: def fibonacci(n):
def fibonacci(n):
    if n == 1 or n == 0:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)

n = int(input("enter a number"))
print(fibonacci(n))/README.md
# python-programs
Python programming

------------------------------------------------------------

Prompt: function calculateSum(arr) {
function calculateSum(arr) {
    var sum = arr.reduce(function (a, b) {
        return a + b;
    }, 0);
    return sum;
}

function calculateAverage(arr) {
    var sum = calculateSum(arr);
    var avg = sum / arr.length;
    return avg;
}

function calculateMedian(arr) {
    var midIndex = Math.floor(arr.length / 2);
    var midValue = arr[midIndex];
    return midValue;
}

function getMode(arr) {
    var modeObj = {};
    var maxCount = 0;
    var maxKey = "";

    arr.forEach(function
------------------------------------------------------------

Prompt: clas

In [16]:
# ============================================================
# DOWNLOAD MODEL
# ============================================================
print("\n" + "="*60)
print("Downloading model...")
print("="*60)

# Zip the model directory
!zip -r starcoder-finetuned.zip {OUTPUT_DIR}

# Download
from google.colab import files
files.download('starcoder-finetuned.zip')

print("\n✓ Complete! Model downloaded as starcoder-finetuned.zip")



Downloading model...
  adding: starcoder-finetuned/ (stored 0%)
  adding: starcoder-finetuned/adapter_config.json (deflated 57%)
  adding: starcoder-finetuned/vocab.json (deflated 57%)
  adding: starcoder-finetuned/merges.txt (deflated 51%)
  adding: starcoder-finetuned/checkpoint-9/ (stored 0%)
  adding: starcoder-finetuned/checkpoint-9/adapter_config.json (deflated 57%)
  adding: starcoder-finetuned/checkpoint-9/vocab.json (deflated 57%)
  adding: starcoder-finetuned/checkpoint-9/merges.txt (deflated 51%)
  adding: starcoder-finetuned/checkpoint-9/rng_state.pth (deflated 26%)
  adding: starcoder-finetuned/checkpoint-9/adapter_model.safetensors (deflated 8%)
  adding: starcoder-finetuned/checkpoint-9/tokenizer_config.json (deflated 90%)
  adding: starcoder-finetuned/checkpoint-9/scaler.pt (deflated 64%)
  adding: starcoder-finetuned/checkpoint-9/special_tokens_map.json (deflated 72%)
  adding: starcoder-finetuned/checkpoint-9/trainer_state.json (deflated 55%)
  adding: starcoder-fine

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✓ Complete! Model downloaded as starcoder-finetuned.zip


In [19]:
# ============================================================
# USAGE INSTRUCTIONS
# ============================================================

#1. Extract starcoder-finetuned.zip

#2. Load and use:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/starcoder2-3b")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/starcoder2-3b")

# Load LoRA weights
model = PeftModel.from_pretrained(base_model, "./starcoder-finetuned")

# Generate
prompt = "def calculate_fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_length=150, temperature=0.7)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


def calculate_fibonacci(n):
    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return calculate_fibonacci(n-1) + calculate_fibonacci(n-2)

def calculate_fibonacci_memoized(n):
    memo = [None] * (n+1)
    return calculate_fibonacci_memoized_helper(n, memo)

def calculate_fibonacci_memoized_helper(n, memo):
    if n == 0:
        return 0
    elif n == 1:
        return 1
    elif memo[n] is not None:
        return memo[n]
