# Fine-tuning Mistral 7B

# Install Dependencies

In [None]:
# Remove any copies of these packages that are already installed; presumably
# this avoids conflicts with the pinned versions installed below.
!pip uninstall -y transformers accelerate peft bitsandbytes datasets trl scipy triton
# Install the training stack. transformers, peft, accelerate, datasets and trl
# are pinned to exact versions; bitsandbytes, scipy and triton are left unpinned.
!pip install --upgrade transformers==4.41.2 -q
!pip install --upgrade peft==0.11.1 -q
!pip install --upgrade accelerate==0.30.1 -q
!pip install bitsandbytes -q # Removing explicit version to try a different installation method below
!pip install --upgrade datasets==2.19.1 -q
!pip install --upgrade trl==0.8.6 -q
!pip install --upgrade scipy -q
!pip install --upgrade triton -q # Let pip handle the triton version based on torch

# Attempting to install bitsandbytes from a potentially more compatible source.
# NOTE(review): bitsandbytes was already installed a few lines above, and the
# extra index URL points at a "windows-webui" wheel repository -- confirm this
# second install is still needed on the target runtime.
!pip install bitsandbytes --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui

In [None]:
# Send signal 9 to the current process so it terminates immediately.
# Presumably this forces the notebook runtime to restart so that the packages
# installed in the previous cell are the ones imported afterwards -- confirm
# for your environment. Nothing placed after this line in the cell will run.
import os
os.kill(os.getpid(), 9)

# Configuration and Model Selection

In [None]:
# Base model to fine-tune. Keep exactly one assignment active; the commented
# lines are ready-made alternatives.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
# MODEL_NAME = "NousResearch/Llama-2-7b-hf"
# MODEL_NAME = "microsoft/phi-2"

# Dataset identifier and the split expression passed to the dataset loader.
DATASET_NAME = "timdettmers/openassistant-guanaco"
DATASET_SPLIT = "train[:300]"

# Training hyperparameters (conservative values for a 7B model).
LORA_R = 8          # LoRA rank
BATCH_SIZE = 1      # per-device batch size
SEQ_LENGTH = 512    # maximum sequence length handed to the trainer
EPOCHS = 1          # number of training epochs

# Echo the active settings so they are recorded in the cell output.
_config_report = (
    "--- Configuration ---",
    f"Model: {MODEL_NAME}",
    f"Dataset: {DATASET_NAME}",
    f"Samples: {DATASET_SPLIT}",
    f"LoRA Rank: {LORA_R}",
    f"Batch Size: {BATCH_SIZE}",
    "-----------------------",
)
print("\n".join(_config_report))

# Imports and GPU Check

In [None]:
import torch
import gc
import time
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer

# Release cached CUDA memory and run the garbage collector before reading
# the usage figures below.
torch.cuda.empty_cache()
gc.collect()

# Fail fast: the rest of the notebook requires a CUDA device.
if not torch.cuda.is_available():
    print("No GPU detected. This will be very slow.")
    raise RuntimeError("GPU required for 7B model fine-tuning")

# Report device 0: its name, total memory and what is currently allocated.
# Byte counts are divided by 1024**3 and labelled "GB".
_bytes_per_gib = 1024**3
print("--- GPU Information ---")
print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
_total_bytes = torch.cuda.get_device_properties(0).total_memory
print(f"Total Memory: {_total_bytes / _bytes_per_gib:.2f} GB")
_allocated_bytes = torch.cuda.memory_allocated()
print(f"Current Usage: {_allocated_bytes / _bytes_per_gib:.2f} GB")
print("-----------------------")

# Memory tracking function
def print_gpu_memory(stage=""):
    """Print the CUDA memory currently allocated and reserved.

    Byte counts reported by ``torch.cuda`` are divided by 1024**3 and
    printed with two decimals under the label "GB".

    Args:
        stage: Label inserted into the report header (e.g. "Before Training").
    """
    bytes_per_gib = 1024**3
    report_lines = (
        f"\n--- GPU Memory {stage} ---",
        f"Allocated: {torch.cuda.memory_allocated() / bytes_per_gib:.2f} GB",
        f"Reserved: {torch.cuda.memory_reserved() / bytes_per_gib:.2f} GB",
        "--------------------------\n",
    )
    # One write for the whole report; the text matches four separate prints.
    print("\n".join(report_lines))

# Configure Quantization

In [None]:
# QLoRA-style quantization settings for memory efficiency, collected in one
# mapping and handed to BitsAndBytesConfig as keyword arguments.
# NOTE(review): the bfloat16 compute dtype assumes the target GPU handles
# bf16 -- confirm for the runtime this notebook is executed on.
_quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**_quantization_settings)

print("Quantization configured for 4-bit loading.")

# Load Model and Tokenizer

In [None]:
print(f"\nLoading {MODEL_NAME}...")
start_load_time = time.time()


def _load_tokenizer(model_name):
    """Load the tokenizer for *model_name* with right-side padding configured."""
    loaded = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if loaded.pad_token is None:
        loaded.pad_token = loaded.eos_token
    loaded.padding_side = "right"
    return loaded


def _load_quantized_model(model_name):
    """Load *model_name* as a causal LM using the quantization in ``bnb_config``."""
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False,
    )


# NOTE(review): trust_remote_code=True lets the checkpoint repository run its
# own Python code at load time -- confirm it is required for the models used.
tokenizer = _load_tokenizer(MODEL_NAME)

# Load model with 4-bit quantization; fall back to Phi-2 if that fails.
try:
    model = _load_quantized_model(MODEL_NAME)
except Exception as e:
    print(f"Error loading model: {e}")
    print("Falling back to Phi-2...")
    MODEL_NAME = "microsoft/phi-2"
    # Best effort: release whatever the failed attempt left on the GPU.
    gc.collect()
    torch.cuda.empty_cache()
    # The tokenizer must belong to the model that is actually loaded; keeping
    # the tokenizer of the failed model would pair Phi-2 with a foreign vocabulary.
    tokenizer = _load_tokenizer(MODEL_NAME)
    model = _load_quantized_model(MODEL_NAME)

# Printed for both the primary and the fallback path.
print(f"Model loaded successfully in {time.time() - start_load_time:.1f} seconds.")

print_gpu_memory("After Model Loading")

# Prepare Model for Training

In [None]:
# Step 1: let PEFT prepare the quantized model for k-bit training.
_kbit_ready_model = prepare_model_for_kbit_training(model)

# Step 2: turn on gradient checkpointing for memory efficiency, then publish
# the prepared model under the name the following cells use.
_kbit_ready_model.gradient_checkpointing_enable()
model = _kbit_ready_model

print("Model prepared for training.")

# Configure LoRA

In [None]:
# Module names that receive LoRA adapters, keyed by a substring of the model
# name. Entries are checked in order and the first match wins.
_TARGETS_BY_FAMILY = (
    ("mistral", ["q_proj", "k_proj", "v_proj", "o_proj"]),
    ("llama", ["q_proj", "k_proj", "v_proj", "o_proj"]),
    ("phi", ["q_proj", "k_proj", "v_proj", "dense"]),
)
_model_key = MODEL_NAME.lower()
target_modules = next(
    (modules for family, modules in _TARGETS_BY_FAMILY if family in _model_key),
    ["q_proj", "k_proj", "v_proj"],  # default when no family matches
)

print(f"Target modules for LoRA: {target_modules}")

# Adapter hyperparameters: alpha is tied to twice the configured rank.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
    r=LORA_R,
    lora_alpha=LORA_R * 2,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model so that only the adapter weights are trainable.
model = get_peft_model(model, peft_config)

# Report how many parameters will actually be updated.
trainable_params, total_params = model.get_nb_trainable_parameters()
print("\nLoRA Applied Successfully!")
print(
    f"Trainable parameters: {trainable_params:,} "
    f"({100 * trainable_params / total_params:.2f}%)"
)
print(f"Total parameters: {total_params:,}")

print_gpu_memory("After LoRA Application")

# Load and Prepare Dataset

In [None]:
# Announce which dataset and which slice of it are about to be loaded.
print(f"\nLoading dataset: {DATASET_NAME}\nUsing samples: {DATASET_SPLIT}")

# Load only the configured split expression of the dataset.
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

# Format function for the dataset
def format_dataset(example):
    """Convert one dataset row into ``{"text": ...}``.

    The layouts are checked in this order and the first match wins:

    1. ``instruction`` and ``output`` keys: an "### Instruction / ### Response" prompt.
    2. ``human`` and ``assistant`` keys: an "### Human / ### Assistant" prompt.
    3. ``text`` key: the value is passed through unchanged.
    4. Anything else: ``str(example)``.

    Args:
        example: Mapping holding the fields of a single row.

    Returns:
        A dict with the single key ``"text"``.
    """
    # Instruction-following layout.
    if "instruction" in example and "output" in example:
        instruction, output = example["instruction"], example["output"]
        return {"text": f"### Instruction:\n{instruction}\n\n### Response:\n{output}"}

    # Conversational layout.
    if "human" in example and "assistant" in example:
        human, assistant = example["human"], example["assistant"]
        return {"text": f"### Human: {human}\n### Assistant: {assistant}"}

    # Already plain text: pass through.
    if "text" in example:
        return {"text": example["text"]}

    # Unknown layout: fall back to the row's string representation.
    return {"text": str(example)}

# Format every row, then hold out 10% of the rows for evaluation. The seed is
# fixed so the same split is produced on every run.
dataset = dataset.map(format_dataset).train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

print("Dataset prepared.")
print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

# Preview at most the first 500 characters of the first training row.
print("\nSample text preview:")
_preview = train_dataset[0]["text"][:500]
print(_preview + "...")

# Configure Training Arguments

In [None]:
# Number of samples we would like to contribute to each optimizer step.
_target_effective_batch_size = 8

# Accumulate gradients over several small batches to approach the target.
# max(1, ...) keeps the value valid when BATCH_SIZE is larger than the target
# (integer division would otherwise yield 0 accumulation steps).
gradient_accumulation_steps = max(1, _target_effective_batch_size // BATCH_SIZE)

# Report the batch size that is really used; it differs from the target when
# BATCH_SIZE does not divide it evenly.
effective_batch_size = BATCH_SIZE * gradient_accumulation_steps

print("--- Training Configuration ---")
print(f"Per device batch size: {BATCH_SIZE}")
print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
print(f"Effective batch size: {effective_batch_size}")
print("------------------------------")

# NOTE(review): with the default configuration (300 rows, 10% held out,
# effective batch 8, 1 epoch) training runs for roughly 33 optimizer steps, so
# eval_steps=50 is never reached -- confirm whether evaluation is meant to run.
# NOTE(review): lr_scheduler_type="constant" appears not to apply warmup, so
# warmup_ratio may have no effect -- confirm whether "constant_with_warmup"
# was intended.
training_args = TrainingArguments(
    output_dir="./results_7b",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Trade compute for memory.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_8bit",
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    # No intermediate checkpoints; the adapter is saved explicitly later.
    save_strategy="no",
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    # Disable external experiment trackers.
    report_to=[],
    group_by_length=True,
    ddp_find_unused_parameters=False,
    dataloader_pin_memory=False,
)

print("\nTraining arguments configured.")

# Create Trainer

In [None]:
# Build the supervised fine-tuning trainer. Arguments are grouped by role:
# what to train, what to train on, and how to train.
trainer = SFTTrainer(
    # Model, adapter configuration and tokenizer.
    model=model,
    peft_config=peft_config,
    tokenizer=tokenizer,
    # Data: both splits expose their training text in the "text" column.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    # Sequence handling and training arguments.
    max_seq_length=SEQ_LENGTH,
    packing=False,
    args=training_args,
)

print("Trainer created successfully.")
print_gpu_memory("Before Training")

# Training with Monitoring

In [None]:
print("\nStarting training...")
print("=" * 60)

# Forget results of an earlier run of this cell. Later cells decide what to do
# by checking whether names such as `train_result` exist, so values left over
# from a previous successful run would make a failed re-run look successful.
for _stale_name in (
    "train_result",
    "training_time",
    "final_loss",
    "samples_per_second",
    "max_memory",
):
    globals().pop(_stale_name, None)

# Clear cache before training
torch.cuda.empty_cache()
gc.collect()

# Record start time
start_time = time.time()

try:
    # Train the model
    train_result = trainer.train()

    # Measure the elapsed time once so every statistic uses the same value.
    elapsed_seconds = time.time() - start_time
    training_time = elapsed_seconds / 60
    final_loss = train_result.training_loss
    # Every training sample is processed once per epoch.
    samples_per_second = len(train_dataset) * EPOCHS / elapsed_seconds

    print("\nTraining completed successfully!")
    print("\n--- Training Statistics ---")
    print(f"Total time: {training_time:.1f} minutes")
    print(f"Final loss: {final_loss:.4f}")
    print(f"Samples/second: {samples_per_second:.2f}")

    # Peak allocation decides whether there is room to scale the experiment up.
    max_memory = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Max GPU memory: {max_memory:.2f} GB")
    print("---------------------------")

    if max_memory < 13 and final_loss < 2.0:
        print("\nSuccess: Model trained well with headroom.")
        print("Next step: Increase dataset to 500 samples")
    elif max_memory > 14:
        print("\nWarning: Memory usage high!")
        print("Next step: Keep current settings or reduce LoRA rank")
    else:
        # Neither clearly comfortable nor clearly over budget.
        print("\nNote: Training finished, but loss or memory headroom is borderline.")
        print("Next step: Keep current settings and review the loss before scaling up")

except Exception as e:
    # Deliberately best-effort: report the failure and keep the notebook
    # running so the memory report and summary cells still execute.
    print(f"\nError: Training failed with error: {e}")
    print("Suggestions:")
    print("- Reduce LoRA rank to 4")
    print("- Reduce Sequence length to 256")
    print("- Reduce Dataset size to 200")

print_gpu_memory("After Training")

# Save the Model

In [None]:
# Persist the fine-tuned model and tokenizer, but only when the training
# cell left a `train_result` behind.
if 'train_result' in locals():
    print("\nSaving model...")

    # The output folder is named after the last component of the model id.
    model_basename = MODEL_NAME.split('/')[-1]
    save_path = f"./{model_basename}-finetuned"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model saved to: {save_path}")

    # Calculate adapter size: add up the weight files found directly inside
    # the save folder and convert bytes to units of 1024**2 (labelled "MB").
    import os
    weight_suffixes = ('.bin', '.safetensors')
    weight_bytes = 0
    for filename in os.listdir(save_path):
        if filename.endswith(weight_suffixes):
            weight_bytes += os.path.getsize(os.path.join(save_path, filename))
    adapter_size = weight_bytes / 1024**2

    print(f"Adapter size: {adapter_size:.1f} MB")

# Test the Fine-tuned Model

In [None]:
print("\nTesting the fine-tuned model...")
model.config.use_cache = True
# Switch to inference mode: training leaves dropout (including the LoRA
# dropout configured for the adapter) active, which would perturb generation.
model.eval()

# Test prompts
test_prompts = [
    "### Instruction:\nExplain quantum computing in simple terms.\n\n### Response:\n",
    "### Instruction:\nWrite a Python function to calculate fibonacci numbers.\n\n### Response:\n",
    "### Instruction:\nWhat are the benefits of exercise?\n\n### Response:\n"
]

# Use the harder coding prompt only when training reached a low loss.
if 'final_loss' in locals() and final_loss < 1.5:
    prompt = test_prompts[1]
else:
    prompt = test_prompts[0]

print(f"Prompt: {prompt}")
print("-" * 60)

# Tokenize. The attention mask is kept because the pad token is the EOS
# token, so generate() cannot infer the mask on its own. Tensors are placed on
# the device the model lives on instead of a hard-coded "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_token_count = inputs["input_ids"].shape[1]

# Generate
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decode. `response` is the full text (prompt plus completion). The completion
# is cut at the token level because decoding does not always reproduce the
# prompt character for character, which makes slicing by len(prompt) unreliable.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_text = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
)

print("Generated Response:")
print(generated_text)

# Experiment Summary and Next Steps

In [None]:
print("\n" + "=" * 60)
print("EXPERIMENT SUMMARY")
print("=" * 60)

# Training results exist only if the training cell succeeded. Look each one up
# individually so a partially completed run cannot raise NameError here.
_results = locals()
_trained = "train_result" in _results
_training_time = _results.get("training_time")
_final_loss = _results.get("final_loss")
_max_memory = _results.get("max_memory")

# Parameter-count label derived from the model name.
_model_key = MODEL_NAME.lower()
if "7b" in _model_key:
    _param_label = "7B"
elif "phi-2" in _model_key:
    _param_label = "2.7B"
else:
    _param_label = "Unknown"

# Create summary
summary = {
    "Model": MODEL_NAME,
    "Parameters": _param_label,
    "Dataset Size": DATASET_SPLIT,
    "LoRA Rank": LORA_R,
    "Training Time": f"{_training_time:.1f} min" if _training_time is not None else "N/A",
    "Final Loss": f"{_final_loss:.4f}" if _final_loss is not None else "N/A",
    "Max GPU Memory": f"{_max_memory:.2f} GB" if _max_memory is not None else "N/A",
    "Status": "Success" if _trained else "Failed"
}

# Left-align each label and pad it with dots to 20 characters.
for key, value in summary.items():
    print(f"{key:.<20} {value}")

print("\nNEXT STEPS:")
if _trained and _max_memory is not None and _max_memory < 13:
    # Comfortable headroom: scale the experiment up.
    print("1. Increase dataset size to 500 samples")
    print("2. Try LoRA rank 16 for better quality")
    print("3. Experiment with different datasets")
    print("\nUpdate these values in Cell 2:")
    print('   DATASET_SPLIT = "train[:500]"')
    print('   LORA_R = 16')
elif _max_memory is not None and _max_memory >= 13:
    # >= so that a peak of exactly 13 GB is treated as tight memory rather
    # than falling through to the failure advice below.
    print("1. Memory is tight, optimize further:")
    print("2. Set LORA_R = 4")
    print("3. Set SEQ_LENGTH = 256")
    print("4. Keep dataset at 300 samples")
else:
    # No usable results: training did not complete.
    print("1. Debug the error")
    print("2. Try with Phi-2 model first")
    print("3. Reduce all parameters")

print("\nPro tip: Save this notebook with results before next experiment!")

In [None]:
# Quick configuration switcher for multiple experiments.
# Each preset is written as one row; the columns follow _CONFIG_FIELDS.
_CONFIG_FIELDS = ("MODEL_NAME", "DATASET_SPLIT", "LORA_R", "BATCH_SIZE", "SEQ_LENGTH")
_CONFIG_ROWS = {
    "conservative_7b": ("mistralai/Mistral-7B-v0.1", "train[:300]", 8, 1, 512),
    "scaled_7b": ("mistralai/Mistral-7B-v0.1", "train[:500]", 8, 1, 512),
    "memory_optimized": ("mistralai/Mistral-7B-v0.1", "train[:200]", 4, 1, 256),
    "fallback_phi2": ("microsoft/phi-2", "train[:2000]", 16, 2, 512),
}

# Expand the rows into {preset name: {setting name: value}}.
CONFIGS = {
    preset_name: dict(zip(_CONFIG_FIELDS, row))
    for preset_name, row in _CONFIG_ROWS.items()
}

# To use: Copy the config you want to Cell 2
print("Available configurations:")
for preset_name, preset in CONFIGS.items():
    print(f"\n{preset_name}:")
    print("\n".join(f"  {field} = {setting}" for field, setting in preset.items()))