In [None]:

# --- 1. Install Libraries ---
!pip install "unsloth[colab-new]" transformers peft bitsandbytes datasets trl

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import time
import os
from huggingface_hub import login, snapshot_download

# --- 2. HUGGING FACE LOGIN ---
# Do not paste a real token into the notebook source (it ends up in saved or
# shared copies). The token is read from the HF_TOKEN environment variable;
# when that is unset or empty, it is requested through a hidden prompt.
# Training below pushes checkpoints to the Hub, so the token needs write access.
import getpass  # local import: only this step needs it

hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token (write access): ")
login(token=hf_token)
print("--- ✅ Hugging Face Login Successful ---")

# --- 3. Load the *BASE* Model ---
# These settings stay at module level because later steps reuse them
# (max_seq_length is also handed to the LoRA wrapper and to the trainer).
max_seq_length = 1024
dtype = None
load_in_4bit = True

base_model_options = dict(
    model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# from_pretrained hands back the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)
print("--- ✅ Base model loaded ---")

# --- 4. Add PEFT/LoRA Adapters ---
# Names of the modules that receive adapters; concatenated below in the
# same order the single list used before.
qkvo_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
gate_up_down_targets = ["gate_proj", "up_proj", "down_proj"]

# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = qkvo_targets + gate_up_down_targets,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 42,
    max_seq_length = max_seq_length,
)

# --- 5. Load and Format Your Dataset ---
# Fetch the "train" split of the dataset; format_example below reads its
# 'English' and 'Igbo' fields.
full_dataset = load_dataset("ccibeekeoc42/english_to_igbo", split="train")

def format_example(example):
    """Turn one dataset row into a single training string.

    Args:
        example: mapping for one row; expected to carry string values under
            the 'English' and 'Igbo' keys.

    Returns:
        {"text": <prompt>} where the prompt wraps the English sentence in an
        [INST] ... [/INST] instruction followed by the Igbo translation, or
        {"text": None} when either key is missing or its value is not a str
        (such rows are filtered out after mapping).
    """
    english = example['English'] if 'English' in example else None
    igbo = example['Igbo'] if 'Igbo' in example else None

    # Guard clause: anything that is not a pair of strings is marked unusable.
    if not (isinstance(english, str) and isinstance(igbo, str)):
        return {"text": None}

    prompt = f"<s>[INST] Translate this English sentence to Igbo: '{english}' [/INST] {igbo}</s>"
    return {"text": prompt}

# Render every row into a single "text" column; the original columns are
# dropped so only the formatted prompt remains.
source_columns = list(full_dataset.features)
formatted_dataset = full_dataset.map(format_example, remove_columns=source_columns, num_proc=4)

# Discard rows that format_example marked as unusable (text is None).
formatted_dataset = formatted_dataset.filter(lambda row: row.get("text") is not None)
print(f"--- Dataset loaded. Number of examples: {len(formatted_dataset)} ---")


# --- 6. The Training Arguments ---
# Local import: helper shipped with unsloth, which this cell already depends on.
from unsloth import is_bfloat16_supported

# The base model is loaded with dtype=None, so the precision is chosen at load
# time rather than fixed here. Hard-coding fp16=True can therefore disagree with
# the loaded model on bf16-capable GPUs; select fp16/bf16 from the same hardware
# check instead (exactly one of the two flags is enabled).
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    max_steps = 32646,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 2,
    optim = "adamw_torch",
    learning_rate = 2e-5,
    lr_scheduler_type = "linear",
    # Checkpoint every 1000 steps and keep only the newest one locally.
    save_strategy = "steps",
    save_steps = 1000,
    save_total_limit = 1,
    # Checkpoints go to the Hub; the resume cell downloads the
    # "last-checkpoint" subfolder of this repo.
    push_to_hub = True,
    hub_model_id = "nwokikeonyeka/igbo-phi3-checkpoint",
    hub_strategy = "checkpoint",
    logging_steps = 500,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    group_by_length = True,
    report_to = "none",
)

# --- 7. Initialize the Trainer ---
# Everything the trainer needs, gathered in one place before construction.
trainer_options = {
    "model": model,
    "tokenizer": tokenizer,
    "train_dataset": formatted_dataset,
    "dataset_text_field": "text",   # column produced by format_example
    "max_seq_length": max_seq_length,
    "args": training_args,
    "packing": True,
}
trainer = SFTTrainer(**trainer_options)

print("--- ✅ All setup is complete! ---")

In [None]:
# Start the "Colab Relay Race" (Worker 1 ONLY)
# Requires the setup cell to have been run in this kernel: it defines
# `trainer` and imports `time`.

print(f"--- 🚀 STARTING the 'Colab Relay Race' (Worker 1) ---")
print(f"--- This will run from step 0 and create the first checkpoint. ---")

start_time_train = time.time()

try:
    # Worker 1 just calls .train() to start from scratch
    trainer.train()
    print("\n--- 🎉 TRAINING COMPLETED NORMALLY! ---")

except Exception as e:
    print(f"\n--- 💥 Training interrupted by unexpected error: {e} ---")
    # Re-raise so the cell is marked as failed and the full traceback is shown
    # instead of only the one-line message above.
    raise

finally:
    # Runs on success, on error and on a manual interrupt.
    end_time_train = time.time()
    print(f"--- Training run duration: {(end_time_train - start_time_train) / 60:.2f} minutes ---")
    # Report the step actually reached rather than claiming a checkpoint
    # exists: nothing is saved before the first save interval completes.
    print(
        f"--- 🛑 Session ended at step {trainer.state.global_step}. "
        f"Checkpoints are saved and pushed to the Hub every {trainer.args.save_steps} steps. ---"
    )

In [None]:
# Resume Cell (WORKER 2, 3, ...) after colab session timeout
# Run the setup cell first in the new session: this cell relies on `os`,
# `snapshot_download` and (further down) `trainer` defined there.

# --- Configuration ---
HUB_MODEL_ID = "nwokikeonyeka/igbo-phi3-checkpoint"
HUB_CHECKPOINT_SUBFOLDER = "last-checkpoint"
LOCAL_CHECKPOINT_PATH = os.path.join(os.path.expanduser("~"), "local_hub_resume")

print(f"--- 👟 RESUMING Training (Worker 2/3/...) ---")
print(f"--- ⬇️ Downloading checkpoint from Hub: {HUB_MODEL_ID}/{HUB_CHECKPOINT_SUBFOLDER} ---")

# --- 1. Download Checkpoint Files Locally ---
# Only the checkpoint subfolder is fetched. `local_dir_use_symlinks` is no
# longer passed: it is deprecated in huggingface_hub, and downloads into
# `local_dir` are written as real files.
snapshot_download(
    repo_id=HUB_MODEL_ID,
    allow_patterns=[f"{HUB_CHECKPOINT_SUBFOLDER}/*"],
    local_dir=LOCAL_CHECKPOINT_PATH,
)

# --- 2. Define the Local Path to Resume From ---
RESUME_PATH = os.path.join(LOCAL_CHECKPOINT_PATH, HUB_CHECKPOINT_SUBFOLDER)

# Fail early with a clear message when the repo has no checkpoint yet, rather
# than letting the trainer fail later on a missing directory.
if not os.path.isdir(RESUME_PATH):
    raise FileNotFoundError(
        f"No '{HUB_CHECKPOINT_SUBFOLDER}' folder was downloaded from {HUB_MODEL_ID}. "
        "Run the Worker 1 cell until it has pushed a first checkpoint."
    )
print(f"--- 🎯 Resuming from LOCAL PATH: {RESUME_PATH} ---")

# --- 3. Run the Training ---
# Continues from the checkpoint directory in RESUME_PATH using the `trainer`
# built by the setup cell.
start_time_train = time.time()
try:
    trainer.train(resume_from_checkpoint = RESUME_PATH)
    print("\n--- 🎉 TRAINING COMPLETED NORMALLY! ---")

except Exception as e:
    print(f"\n--- 💥 Training interrupted by unexpected error: {e} ---")
    # Re-raise so the cell is marked as failed and the full traceback is shown
    # instead of only the one-line message above.
    raise

finally:
    # Runs on success, on error and on a manual interrupt.
    end_time_train = time.time()
    print(f"--- Training run duration: {(end_time_train - start_time_train) / 60:.2f} minutes ---")
    # Report the step actually reached instead of asserting that a newer
    # checkpoint was uploaded during this session.
    print(
        f"--- 🛑 Session ended at step {trainer.state.global_step}. "
        f"Checkpoints are saved and pushed to the Hub every {trainer.args.save_steps} steps. ---"
    )

In [None]:
# Save Final GGUF Model

# --- 1. Install Libraries ---
!pip install "unsloth[colab-new]" transformers peft bitsandbytes datasets trl accelerate

import torch
from unsloth import FastLanguageModel
from huggingface_hub import login

# --- 2. HUGGING FACE LOGIN ---
# Do not paste a real token into the notebook source (it ends up in saved or
# shared copies). The token is read from the HF_TOKEN environment variable;
# when that is unset or empty, it is requested through a hidden prompt.
# This cell pushes the final model to the Hub, so the token needs write access.
import getpass  # local imports: this cell's import block does not provide them
import os

hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token (write access): ")
login(token=hf_token)
print("--- ✅ Hugging Face Login Successful ---")

# --- 3. Load The *TRAINED* Model ---
# Load from the repo the training run pushed to; the object returned here is
# the one exported below.
# NOTE(review): training wrapped the model with LoRA adapters, so this repo
# presumably holds adapter weights that point at the base model rather than a
# full copy of it — confirm.
max_seq_length = 1024
dtype = None
load_in_4bit = True

trained_repo = "nwokikeonyeka/igbo-phi3-checkpoint"  # Load the checkpoint
print(f"--- Loading your trained model from '{trained_repo}'... ---")

trained_model_options = dict(
    model_name = trained_repo,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**trained_model_options)
print("--- ✅ Trained model and adapter loaded successfully! ---")

# --- 4. Push the FINAL Merged Model to the Hub ---
# We do NOT call model.merge_and_unload()
# The push_to_hub_gguf function handles the merging internally.

FINAL_MODEL_NAME = "igbo-phi3-translator"

print(f"--- Pushing final GGUF model to 'nwokikeonyeka/{FINAL_MODEL_NAME}'... ---")
print("This creates a small, fast file perfect for the Oracle server.")

# Set the tokenizer's template to the Llama-2 [INST] format we trained on.
# The assistant turn is rendered only when the conversation contains one.
# At inference time the conversation holds just the user message, and
# indexing messages[1] unconditionally makes the template fail to render.
# For a two-message conversation the output is unchanged.
# NOTE(review): the template emits bos_token plus a literal <s>, and a literal
# </s> followed by eos_token. This is left as-is because it may deliberately
# mirror what the trainer saw — confirm against the tokenized training data.
tokenizer.chat_template = (
    "{{ bos_token }}<s>[INST] {{ messages[0]['content'] }} [/INST] "
    "{% if messages|length > 1 %}"
    "{{ messages[1]['content'] }}</s>{{ eos_token }}"
    "{% endif %}"
)
print("--- ✅ Llama-2 chat template applied to tokenizer. ---")
# -------------------------

# Converts to GGUF with q4_k_m quantization and uploads it to the target repo.
model.push_to_hub_gguf(
    repo_id = f"nwokikeonyeka/{FINAL_MODEL_NAME}",
    tokenizer = tokenizer,
    quantization_method = "q4_k_m"
)

print(f"--- ✅✅✅ FINAL MODEL SAVED TO {FINAL_MODEL_NAME}! ---")