Package Installation

In [None]:
%%capture
import torch

# 1. Install Unsloth (Optimized for Colab)
# We allow unsloth to handle the dependency resolution for Colab
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# 2. Install/Upgrade Xformers and TRL
# We force these upgrades to ensure compatibility with the T4 GPU on Colab
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# 3. Verify Pytorch & GPU
print(f"Pytorch Version: {torch.__version__}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: No GPU detected. Please check Runtime settings.")

Loading the Model (Mistral 7B)

In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings, kept as notebook-level names so they can be tuned in one place.
load_in_4bit = True      # request the 4-bit variant of the weights
max_seq_length = 2048    # sequence length handed to the loader
dtype = None             # passed through unchanged (presumably lets the loader pick -- TODO confirm)

# from_pretrained returns a (model, tokenizer) pair; later cells use both names.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-v0.3-bnb-4bit",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

print("SUCCESS: Model loaded without errors!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.4: Fast Mistral patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
SUCCESS: Model loaded without errors!


Loading and Formatting the Data

In [None]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template
import os

# Location of the training file, relative to the notebook's working directory.
dataset_path = "unsloth_training_data.json"

# Fail fast with an actionable message when the file has not been uploaded yet.
if os.path.exists(dataset_path):
    print(f"Found dataset at: {dataset_path}")
else:
    raise FileNotFoundError(f"File not found at: {dataset_path}\nPlease upload 'unsloth_training_data.json' to the Colab Files tab on the left.")

# Read the JSON file as the "train" split.
dataset = load_dataset("json", split="train", data_files=dataset_path)

# Switch the tokenizer to the "mistral" chat template.
# `tokenizer` must already exist from the model-loading cell.
tokenizer = get_chat_template(tokenizer, chat_template="mistral")

def formatting_prompts_func(examples):
    """Render every conversation in a batch to a single training string.

    Args:
        examples: A batch mapping with a "conversations" entry that can be
            iterated to yield one conversation per row (each is handed as-is
            to the tokenizer's chat template).

    Returns:
        A dict with one key, "text", holding the rendered strings in the same
        order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False returns text rather than token ids; no generation
        # prompt is appended.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Apply formatting to the entire dataset. With batched=True the function
# receives a batch of rows per call and returns the new "text" column for them.
dataset = dataset.map(formatting_prompts_func, batched = True)

print("SUCCESS: Data loaded and formatted!")
# Index the row first and the column second: this reads a single example,
# whereas dataset['text'][0] fetches the whole column just to show one row.
print(f"Sample Input:\n{dataset[0]['text'][:200]}...")

Found dataset at: /content/unsloth_training_data.json
SUCCESS: Data loaded and formatted!
Sample Input:
<s>[INST] Schema: {
  "db_id": "department_management",
  "collection_names": [
    "head",
    "management",
    "department"
  ],
  "column_names": [
    [
      0,
      "_id"
    ],
    [
      0,...


Setting the Hyperparameters for LoRA

In [None]:
# Attach LoRA adapters to the attention and MLP projection layers.
# This cell rebinds `model`; re-run the model-loading cell before running it
# a second time so the adapters are not stacked on an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    # Dropout must be 0 for Unsloth's fast LoRA patching. With 0.05 the library
    # logged a performance hit and reported 0 QKV / 0 O / 0 MLP layers patched.
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

print("LoRA Adapters attached successfully.")

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


LoRA Adapters attached successfully.


Model Fine-Tuning

In [None]:
import os

import torch
from google.colab import drive
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Fine-tune the LoRA-wrapped model, writing checkpoints to Google Drive.
# Requires `model`, `tokenizer`, `dataset` and `max_seq_length` from the
# earlier cells.

# 1. Mount Google Drive (So we can save safely)
drive.mount('/content/drive')

# 2. Define a Safe Path on Drive
# Checkpoints will be saved to your Drive, so they persist even if Colab crashes.
safe_output_dir = "/content/drive/MyDrive/DocSpider_Checkpoints"

# Create the folder if it doesn't exist
os.makedirs(safe_output_dir, exist_ok=True)

# Clear GPU cache
torch.cuda.empty_cache()

print("Starting FULL TRAINING...")
print(f"Checkpoints will be saved to: {safe_output_dir}")
print("If the session crashes, you can resume from there!")

# Query bf16 support once so the fp16/bf16 flags below can never disagree.
use_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model = model,
    # NOTE(review): the tokenizer is passed under both `tokenizer` and
    # `processing_class`, as in the original cell -- confirm which one the
    # installed TRL/Unsloth combination expects before removing either.
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    # Reuse the value the model was loaded with instead of a second literal.
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    processing_class = tokenizer,

    args = TrainingArguments(
        # --- Memory Settings ---
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,  # effective batch size 1 x 8 = 8
        gradient_checkpointing = True,

        # --- Training Settings ---
        num_train_epochs = 1,

        # --- SAFE CHECKPOINTING ---
        save_strategy = "steps",
        save_steps = 100,            # Save every 100 steps (more frequent = safer)
        output_dir = safe_output_dir, # <--- SAVES TO GOOGLE DRIVE
        save_total_limit = 2,        # Only keep the last 2 checkpoints to save Drive space

        # --- Optimizer ---
        warmup_steps = 50,
        learning_rate = 1e-4,
        lr_scheduler_type = "linear",
        optim = "adamw_8bit",
        weight_decay = 0.01,
        # Exactly one of fp16 / bf16 is enabled, chosen by hardware support.
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 10,
        seed = 3407,
    ),
)

# --- RESUME LOGIC ---
# If you crash, change this to True. For the first run, keep it False.
trainer_stats = trainer.train(resume_from_checkpoint = False)

print("FULL TRAINING COMPLETE!")

Mounted at /content/drive
Starting FULL TRAINING...
Checkpoints will be saved to: /content/drive/MyDrive/DocSpider_Checkpoints
If the session crashes, you can resume from there!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,043 | Num Epochs = 1 | Total steps = 506
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080 of 7,331,909,632 (1.14% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,0.266
20,0.2181
30,0.2208
40,0.23
50,0.217
60,0.2291
70,0.2369
80,0.2014
90,0.2073
100,0.1911




Saving the Fine-Tuned Model

In [None]:
import os
import shutil
from google.colab import drive

# Copy the most recent training checkpoint on Drive to a stable "final" folder.

def _checkpoint_step(name):
    """Return the step number encoded in a 'checkpoint-<step>' name.

    Returns None for any name that does not have exactly that form, so stray
    entries such as 'checkpoint-backup' are skipped instead of crashing int().
    """
    prefix = "checkpoint-"
    if not name.startswith(prefix):
        return None
    suffix = name[len(prefix):]
    return int(suffix) if suffix.isdecimal() else None


# 1. Mount Drive (to access your saved checkpoints)
drive.mount('/content/drive')

# 2. Define Paths
checkpoint_folder = "/content/drive/MyDrive/DocSpider_Checkpoints"
final_destination = "/content/drive/MyDrive/DocSpider_Mistral_Final"

# 3. Find the latest checkpoint (e.g., checkpoint-500)
if not os.path.exists(checkpoint_folder):
    raise FileNotFoundError("Could not find checkpoints folder! Did you unmount Drive?")

# Keep only real checkpoint directories: the name must be 'checkpoint-<number>'
# and the entry must be a directory, because it is copied with copytree below.
checkpoints = [
    d for d in os.listdir(checkpoint_folder)
    if _checkpoint_step(d) is not None
    and os.path.isdir(os.path.join(checkpoint_folder, d))
]

if not checkpoints:
    raise FileNotFoundError("No checkpoints found! Training might have failed before step 100.")

# Compare by numeric step so 'checkpoint-1000' ranks above 'checkpoint-500'.
latest_checkpoint = max(checkpoints, key=_checkpoint_step)
latest_path = os.path.join(checkpoint_folder, latest_checkpoint)

print(f"✅ Found latest saved model: {latest_checkpoint}")

# 4. Copy it to the Final Folder
print(f"Copying {latest_checkpoint} to {final_destination}...")

if os.path.exists(final_destination):
    shutil.rmtree(final_destination) # Remove old version if exists

shutil.copytree(latest_path, final_destination)

print("SUCCESS: Your training is complete and saved!")
print(f"You can now load your model from: {final_destination}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Found latest saved model: checkpoint-400
Copying checkpoint-400 to /content/drive/MyDrive/DocSpider_Mistral_Final...
SUCCESS: Your training is complete and saved!
You can now load your model from: /content/drive/MyDrive/DocSpider_Mistral_Final
