# Fine-tuning DeepSeek 6.7B

# Install Dependencies

In [3]:
!pip uninstall -y transformers accelerate peft bitsandbytes datasets trl scipy triton
!pip install --upgrade transformers==4.41.2 -q
!pip install --upgrade peft==0.11.1 -q
!pip install --upgrade accelerate==0.30.1 -q
!pip install bitsandbytes -q
!pip install --upgrade datasets==2.19.1 -q
!pip install --upgrade trl==0.8.6 -q
!pip install --upgrade scipy -q
!pip install --upgrade triton -q

# Attempting to install bitsandbytes from a potentially more compatible source
!pip install bitsandbytes --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui

Found existing installation: transformers 4.53.0
Uninstalling transformers-4.53.0:
  Successfully uninstalled transformers-4.53.0
Found existing installation: accelerate 1.8.1
Uninstalling accelerate-1.8.1:
  Successfully uninstalled accelerate-1.8.1
Found existing installation: peft 0.15.2
Uninstalling peft-0.15.2:
  Successfully uninstalled peft-0.15.2
[0mFound existing installation: datasets 2.14.4
Uninstalling datasets-2.14.4:
  Successfully uninstalled datasets-2.14.4
[0mFound existing installation: scipy 1.15.3
Uninstalling scipy-1.15.3:
  Successfully uninstalled scipy-1.15.3
Found existing installation: triton 3.2.0
Uninstalling triton-3.2.0:
  Successfully uninstalled triton-3.2.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m129.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
import os
os.kill(os.getpid(), 9)

# Configuration and Model Selection

In [None]:
# Model - CodeGen 2B mono 
MODEL_NAME = "Salesforce/codegen-2B-mono"  # Mono = Python-focused

# Dataset 
DATASET_NAME = "sahil2801/CodeAlpaca-20k"
DATASET_SPLIT = "train[:1500]"   

# Training parameters optimized for CodeGen
LORA_R = 16  # Good for 2B model
BATCH_SIZE = 1
SEQ_LENGTH = 768  # CodeGen handles up to 2048, but 768 is efficient
EPOCHS = 1

print(f"CodeGen-2B Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Dataset: {DATASET_NAME}")
print(f"  Samples: {DATASET_SPLIT}")
print(f"  LoRA Rank: {LORA_R}")
print(f"  Sequence Length: {SEQ_LENGTH}")

CodeGen-2B Configuration:
  Model: Salesforce/codegen-2B-mono
  Dataset: sahil2801/CodeAlpaca-20k
  Samples: train[:1500]
  LoRA Rank: 16
  Sequence Length: 768


# Imports and GPU Check

In [2]:
import torch
import gc
import time
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer

# Clear Memory & Check GPU
torch.cuda.empty_cache()
gc.collect()

if torch.cuda.is_available():
    print(f"--- GPU Information ---")
    print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
    print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    print(f"Current Usage: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"-----------------------")
else:
    print("No GPU detected. This will be very slow.")
    raise RuntimeError("GPU required for 7B model fine-tuning")

# Memory tracking function
def print_gpu_memory(stage=""):
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"\n--- GPU Memory {stage} ---")
    print(f"Allocated: {allocated:.2f} GB")
    print(f"Reserved: {reserved:.2f} GB")
    print(f"--------------------------\n")

--- GPU Information ---
GPU Detected: Tesla T4
Total Memory: 14.74 GB
Current Usage: 0.00 GB
-----------------------


# Configure Quantization

In [None]:
# QLoRA config optimized for CodeGen-2B
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16  # float16 for CodeGen
)

print("Quantization configured for CodeGen-2B")

✅ Quantization configured for CodeGen-2B


# Load Model and Tokenizer

In [None]:
print(f"\n Loading {MODEL_NAME}...")
start_load_time = time.time()

# Load tokenizer - CodeGen uses GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# CodeGen specific: ensure padding token is set
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # CodeGen prefers left padding

# Load model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_cache=False,
    torch_dtype=torch.float16,  # Specify dtype
)

print(f"CodeGen-2B loaded successfully in {time.time() - start_load_time:.1f} seconds!")
print_gpu_memory("After Model Loading")


🔄 Loading Salesforce/codegen-2B-mono...
✅ CodeGen-2B loaded successfully in 22.9 seconds!

--- GPU Memory After Model Loading ---
Allocated: 1.84 GB
Reserved: 1.85 GB
--------------------------



# Prepare Model for Training

In [5]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

print("Model prepared for training.")

Model prepared for training.


# Configure LoRA

In [None]:
if "codegen" in MODEL_NAME.lower():
    # CodeGen uses these module names
    target_modules = [
        "transformer.h.*.attn.q_proj",
        "transformer.h.*.attn.v_proj",
        # Alternative names that might work:
        # "attn.q_proj",
        # "attn.v_proj",
        # "transformer.h.*.attn.c_attn",  # Combined attention
    ]

    # Simpler approach - just use the attention projection
    target_modules = ["attn.c_attn", "attn.c_proj"]

    # OR even simpler - let PEFT find them
    target_modules = "all-linear"  # This targets all linear layers

print(f"Target modules for LoRA: {target_modules}")

# LoRA configuration
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,  # Use the correct modules
)

Target modules for LoRA: all-linear


In [None]:
# Apply LoRA
model = get_peft_model(model, peft_config)

# Enable gradient checkpointing after LoRA
model.enable_input_require_grads()
model.gradient_checkpointing_enable()

# Print trainable parameters
trainable_params, total_params = model.get_nb_trainable_parameters()
print(f"\n LoRA Applied to CodeGen!")
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")


✅ LoRA Applied to CodeGen!
Trainable parameters: 20,971,520 (0.75%)


# Load and Prepare Dataset

In [None]:
# dataset formatting
def format_code_dataset_codegen(example):
    """
    Format for CodeGen models - they work best with simple prompts
    """
    # CodeGen prefers this format for Python code
    if example.get('input', '').strip():
        # With input context
        text = f"""# Question: {example['instruction']}
# Input: {example['input']}
# Solution:
{example['output']}"""
    else:
        # Without input - most common case
        text = f"""# {example['instruction']}
{example['output']}"""

    return {"text": text}

# Load and format dataset
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

# Apply CodeGen-specific formatting
dataset = dataset.map(format_code_dataset_codegen)

# Filter for appropriate length
def filter_length(example):
    # More conservative for CodeGen
    return len(example['text']) < SEQ_LENGTH * 3

dataset = dataset.filter(filter_length)

# Split dataset
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print(f" Dataset prepared for CodeGen!")
print(f"   Training samples: {len(train_dataset)}")
print(f"   Evaluation samples: {len(eval_dataset)}")
print(f"\n Sample formatted text:")
print(train_dataset[0]["text"][:500] + "...")

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

✅ Dataset prepared for CodeGen!
   Training samples: 1350
   Evaluation samples: 150

📝 Sample formatted text:
# Question: Create a C# program which takes two strings as parameters, and returns true if the two strings are anagrams of each other.
# Input: 'listen', 'silent'
# Solution:
bool AreAnagrams(string s1, string s2)
{
    if (s1.Length != s2.Length)
        return false;
 
    int[] a = new int[256];
    int n = s1.Length;
    for (int i = 0; i < n; i++)
    {
        int c1 = (int)s1[i];
        int c2 = (int)s2[i];
 
        a[c1]++;
        a[c2]--;
    }
 
    for (int i = 0; i < 256; i++)
   ...


# Configure Training Arguments

In [None]:
# Optimized training arguments for CodeGen-2B
training_args = TrainingArguments(
    output_dir="./codegen_2b_results",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_32bit",  # 32bit works well for 2B model
    logging_strategy="steps",
    logging_steps=25,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    learning_rate=5e-5,  # Lower LR for CodeGen
    warmup_steps=100,    # More warmup for stability
    fp16=True,           # fp16 for CodeGen
    bf16=False,
    max_grad_norm=1.0,   # Higher for stability
    lr_scheduler_type="cosine",
    report_to=[],
    group_by_length=True,
    ddp_find_unused_parameters=False,
    dataloader_pin_memory=False,
)

print("Training arguments configured for CodeGen-2B")

✅ Training arguments configured for CodeGen-2B




# Create Trainer

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Preprocessing function
def preprocess_function(examples):
    # Tokenize
    result = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=SEQ_LENGTH
    )
    # Set labels
    result["labels"] = result["input_ids"].copy()
    return result

# Tokenize datasets
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"]
)
tokenized_eval = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"]
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

# Apply PEFT configuration to the model
model = get_peft_model(model, peft_config)
print("PEFT model created:")
model.print_trainable_parameters()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

print("Trainer created successfully")

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

PEFT model created:
trainable params: 20,971,520 || all params: 2,800,327,680 || trainable%: 0.7489
✅ Trainer created successfully


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


# Training with Monitoring

In [13]:
print("\nStarting training...")
print("=" * 60)

# Clear cache before training
torch.cuda.empty_cache()
gc.collect()

# Record start time
start_time = time.time()

try:
    # Train the model
    train_result = trainer.train()

    # Calculate training statistics
    training_time = (time.time() - start_time) / 60
    final_loss = train_result.training_loss
    samples_per_second = len(train_dataset) / (time.time() - start_time)

    print("\nTraining completed successfully!")
    print(f"\n--- Training Statistics ---")
    print(f"Total time: {training_time:.1f} minutes")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Samples/second: {samples_per_second:.2f}")

    # Check if we should scale up
    max_memory = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Max GPU memory: {max_memory:.2f} GB")
    print(f"---------------------------")

    if max_memory < 13 and final_loss < 2.0:
        print(f"\nSuccess: Model trained well with headroom.")
        print(f"Next step: Increase dataset to 500 samples")
    elif max_memory > 14:
        print(f"\nWarning: Memory usage high!")
        print(f"Next step: Keep current settings or reduce LoRA rank")

except Exception as e:
    print(f"\nError: Training failed with error: {e}")
    print(f"Suggestions:")
    print(f"- Reduce LoRA rank to 4")
    print(f"- Reduce Sequence length to 256")
    print(f"- Reduce Dataset size to 200")

print_gpu_memory("After Training")


Starting training...


Step,Training Loss,Validation Loss
100,0.9404,0.82845
200,0.9358,0.786682
300,0.8968,0.772017



Training completed successfully!

--- Training Statistics ---
Total time: 49.4 minutes
Final loss: 1.0475
Samples/second: 0.46
Max GPU memory: 6.10 GB
---------------------------

Success: Model trained well with headroom.
Next step: Increase dataset to 500 samples

--- GPU Memory After Training ---
Allocated: 2.49 GB
Reserved: 3.60 GB
--------------------------



# Save the Model

In [14]:
# Save the fine-tuned model
if 'train_result' in locals():
    print("\nSaving model...")

    save_path = f"./{MODEL_NAME.split('/')[-1]}-finetuned"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model saved to: {save_path}")

    # Calculate adapter size
    import os
    adapter_size = sum(os.path.getsize(os.path.join(save_path, f))
                      for f in os.listdir(save_path)
                      if f.endswith('.bin') or f.endswith('.safetensors')) / 1024**2

    print(f"Adapter size: {adapter_size:.1f} MB")


Saving model...
Model saved to: ./codegen-2B-mono-finetuned
Adapter size: 80.0 MB


In [None]:
# Add this at the end of your training notebook
model.save_pretrained("./codegen_finetuned_lora")
tokenizer.save_pretrained("./codegen_finetuned_lora")
print("Model saved for optimization experiments!")

✅ Model saved for optimization experiments!


# Test the Fine-tuned Model

In [None]:
print("\n Testing CodeGen-2B for Python code...")
model.config.use_cache = True

# CodeGen-specific test prompts
test_prompts = [
    "# Write a function to calculate fibonacci numbers\n",
    "# Create a function that reverses a string\n",
    "# Implement bubble sort in Python\n",
    "# Write a class for a stack data structure\n",
]

prompt = test_prompts[0]
print(f"Prompt: {prompt}")
print("-" * 60)

# Tokenize with CodeGen settings
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    return_attention_mask=True
).to("cuda")

# Generate with CodeGen-optimized parameters
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.2,      # Low for code
        do_sample=True,
        top_p=0.95,
        repetition_penalty=1.0,  # No penalty for code
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:")
print(response)


🧪 Testing CodeGen-2B for Python code...
Prompt: # Write a function to calculate fibonacci numbers

------------------------------------------------------------
Generated Code:
# Write a function to calculate fibonacci numbers
def fibonacci(n):
    if n <= 1:
        return n
    else:
        return fibonacci(n-1) + fibonacci(n-2)

print(fibonacci(10))

# Write a function to calculate the sum of the first n natural numbers
def sum_natural_numbers(n):
    if n <= 1:
        return n
    else:
        return sum_natural_numbers(n-1) + n

print(sum_natural_numbers(10))

# Write a function to calculate the sum of the first n natural numbers
def sum_natural_numbers(n


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
print("Google Drive mounted successfully!")

Mounted at /content/drive
✅ Google Drive mounted successfully!


In [None]:
import shutil
import os

# Define the source directory  
# Make sure this path matches the path used in the saving step  
source_dir = "./codegen_finetuned_lora" # Or "./codegen-2B-mono-finetuned"

# Define the destination directory in your Google Drive
# Create a new folder name or use an existing one
destination_dir = "./drive/My Drive/my_finetuned_codegen_model"

# Create the destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

print(f"Copying model from '{source_dir}' to '{destination_dir}'...")

# Copy the contents of the source directory to the destination directory
try:
    # Use copytree for directories
    if os.path.isdir(source_dir):
        # Remove destination if it exists to avoid errors with copytree
        if os.path.exists(destination_dir):
            shutil.rmtree(destination_dir)
        shutil.copytree(source_dir, destination_dir)
    else:
        # Use copy2 for single files if needed (though models are usually directories)
        shutil.copy2(source_dir, destination_dir)
    print(" Model successfully copied to Google Drive!")

except Exception as e:
    print(f"Error copying model: {e}")

Copying model from './codegen_finetuned_lora' to './drive/My Drive/my_finetuned_codegen_model'...
✅ Model successfully copied to Google Drive!
