In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [1]:
# canine_lora_sft_train_kaggle.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import CanineTokenizer, CanineModel, get_scheduler
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import random
import os
import shutil # Import for zipping the output

# --- Configuration ---
class SFTConfig:
    # --- CHANGE: Update paths for Kaggle ---
    MODEL_ID = "google/canine-s"
    WORD_LIST_PATH = "/kaggle/input/hangman/words_250000_train.txt" # Assumes your dataset is named 'hangman'
    SAVE_DIR = "/kaggle/working/lora_experts" # Save output to the writable directory
    
    MAX_LENGTH = 32
    MASK_CHAR = '_'
    CHAR_VOCAB = "abcdefghijklmnopqrstuvwxyz"
    char2idx = {c: i for i, c in enumerate(CHAR_VOCAB)}
    
    BATCH_SIZE = 32
    LEARNING_RATE = 5e-4
    MAX_GRAD_NORM = 1.0
    EPOCHS = 8
    DROPOUT_RATE = 0.1
    PATIENCE = 2

config = SFTConfig()
os.makedirs(config.SAVE_DIR, exist_ok=True)

# --- Dataset (Optimized) ---
class HangmanMaskedWordDataset(Dataset):
    def __init__(self, words, tokenizer, mask_range):
        self.words = [w for w in words if 0 < len(w) <= (config.MAX_LENGTH - 2)]
        self.tokenizer = tokenizer
        self.mask_range = mask_range

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word = self.words[idx]
        word_len = len(word)
        
        mask_prob = random.uniform(*self.mask_range)
        mask_indices = [i for i in range(word_len) if random.random() < mask_prob]
        if not mask_indices and word_len > 0:
            mask_indices = [random.randint(0, word_len - 1)]

        masked_word = ''.join(
            config.MASK_CHAR if i in mask_indices else ch for i, ch in enumerate(word)
        )

        inputs = self.tokenizer(
            masked_word,
            padding='max_length',
            truncation=True,
            max_length=config.MAX_LENGTH,
            return_tensors="pt"
        )

        labels = torch.full((config.MAX_LENGTH,), -100, dtype=torch.long)
        for i in mask_indices:
            if i < (config.MAX_LENGTH - 1) and word[i] in config.char2idx:
                labels[i + 1] = config.char2idx[word[i]]

        return {
            "input_ids": inputs['input_ids'].squeeze(0),
            "attention_mask": inputs['attention_mask'].squeeze(0),
            "labels": labels
        }

# --- Model ---
class CanineForHangmanSFT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.canine = CanineModel.from_pretrained(config.MODEL_ID)
        self.config = self.canine.config
        
        self.dropout = nn.Dropout(config.DROPOUT_RATE)
        self.cls_head = nn.Linear(self.canine.config.hidden_size, len(config.CHAR_VOCAB))

    def forward(self, input_ids, attention_mask, **kwargs):
        out = self.canine(input_ids=input_ids, attention_mask=attention_mask)
        x = self.dropout(out.last_hidden_state)
        return self.cls_head(x)

# --- Training Function ---
def train_canine_lora(words, expert_name, mask_range, num_epochs):
    print(f"\n=== Training CANINE LoRA Expert: {expert_name} ===")
    
    tokenizer = CanineTokenizer.from_pretrained(config.MODEL_ID)

    train_words, val_words = train_test_split(words, test_size=0.05, random_state=42)
    train_dataset = HangmanMaskedWordDataset(train_words, tokenizer, mask_range)
    val_dataset = HangmanMaskedWordDataset(val_words, tokenizer, mask_range)

    train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    base_model = CanineForHangmanSFT(config)

    peft_config = LoraConfig(
        task_type=TaskType.TOKEN_CLS,
        r=32, 
        lora_alpha=64, 
        lora_dropout=0.1,
        inference_mode=False,
        target_modules=["query", "key", "value", "dense"]
    )

    model = get_peft_model(base_model, peft_config).to(device)
    model.print_trainable_parameters()

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = get_scheduler("linear", optimizer, 0, num_epochs * len(train_loader))
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(batch['input_ids'], batch['attention_mask'])
            loss = loss_fn(logits.view(-1, len(config.CHAR_VOCAB)), batch['labels'].view(-1))
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.MAX_GRAD_NORM)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            progress_bar.set_postfix(loss=loss.item())

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                logits = model(batch['input_ids'], batch['attention_mask'])
                loss = loss_fn(logits.view(-1, len(config.CHAR_VOCAB)), batch['labels'].view(-1))
                total_val_loss += loss.item()
        
        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Epoch {epoch+1} | Validation Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            save_path = os.path.join(config.SAVE_DIR, expert_name)
            model.save_pretrained(save_path)
            print(f"✅ Validation loss improved. Saved expert to {save_path}")
        else:
            patience_counter += 1
            if patience_counter >= config.PATIENCE:
                print("⛔ Early stopping triggered.")
                break

# --- Main Execution ---
if __name__ == '__main__':
    with open(config.WORD_LIST_PATH, 'r') as f:
        words = [line.strip().lower() for line in f if line.strip().isalpha()]

    # Train your CANINE experts
    train_canine_lora(words, expert_name="canine_early", mask_range=(0.45, 0.8), num_epochs=config.EPOCHS)
    train_canine_lora(words, expert_name="canine_late", mask_range=(0.1, 0.4), num_epochs=config.EPOCHS)
    
    # --- CHANGE: Zip the final output directory for easy download ---
    print("\nZipping trained experts for download...")
    shutil.make_archive(
        base_name="/kaggle/working/lora_experts", # Name of the zip file
        format='zip',                             # Format
        root_dir="/kaggle/working/",              # Root directory to zip
        base_dir="lora_experts"                   # The specific folder to zip
    )
    print("✅ Finished. You can now download lora_experts.zip from the output directory.")

2025-07-21 16:07:30.177738: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753114050.199972     108 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753114050.207256     108 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



=== Training CANINE LoRA Expert: canine_early ===
Using device: cuda
trainable params: 6,242,304 || all params: 138,345,242 || trainable%: 4.5121


Epoch 1/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 1 | Validation Loss: 2.3886
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early


Epoch 2/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 2 | Validation Loss: 2.3458
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early


Epoch 3/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 3 | Validation Loss: 2.3295
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early


Epoch 4/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 4 | Validation Loss: 2.2963
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early


Epoch 5/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 5 | Validation Loss: 2.2826
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early


Epoch 6/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 6 | Validation Loss: 2.2668
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early


Epoch 7/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 7 | Validation Loss: 2.2646
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early


Epoch 8/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 8 | Validation Loss: 2.2474
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_early

=== Training CANINE LoRA Expert: canine_late ===
Using device: cuda
trainable params: 6,242,304 || all params: 138,345,242 || trainable%: 4.5121


Epoch 1/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 1 | Validation Loss: 1.9641
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_late


Epoch 2/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 2 | Validation Loss: 1.9183
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_late


Epoch 3/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 3 | Validation Loss: 1.8298
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_late


Epoch 4/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 5 | Validation Loss: 1.7483
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_late


Epoch 6/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 6 | Validation Loss: 1.7172
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_late


Epoch 7/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 7 | Validation Loss: 1.6787
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_late


Epoch 8/8:   0%|          | 0/6748 [00:00<?, ?it/s]

Epoch 8 | Validation Loss: 1.6741
✅ Validation loss improved. Saved expert to /kaggle/working/lora_experts/canine_late

Zipping trained experts for download...
✅ Finished. You can now download lora_experts.zip from the output directory.
