# N8N Workflow Generator - Colab Training Notebook

This notebook is optimized for Google Colab. It will:
- Check GPU availability
- Install required packages
- Upload your dataset from your local machine or Google Drive
- Train the model with a VS Code-friendly progress bar
- Save checkpoints and final model to `/content/`

**How to use:**
1. Upload your dataset: a JSON Lines file in which every non-empty line is an object with `prompt` and `workflow` fields (use the upload cell or mount Google Drive)
2. Run all cells in order
3. Monitor progress in the output and `/content/training_progress.log`
4. Download the final model from the Files sidebar

In [None]:
# Check GPU availability: nvidia-smi prints the attached GPU(s). If the command
# fails (for example because no GPU runtime is attached), the fallback message
# 'No GPU found' is printed instead.
!nvidia-smi || echo 'No GPU found'

In [None]:
# Install required packages. The -q flag keeps pip's output brief.
# Later cells import transformers, datasets, peft and torch directly.
!pip install -q transformers datasets peft accelerate bitsandbytes scipy trl torch tqdm

In [None]:
# Upload dataset from local
from google.colab import files
# files.upload() opens a file picker and returns a dict keyed by the uploaded
# file names; the chosen name is opened as a path by the dataset-loading cell.
uploaded = files.upload()
if not uploaded:
    # The picker was cancelled or no file was selected: fail with a clear
    # message instead of an IndexError on an empty list.
    raise RuntimeError('No file was uploaded. Re-run this cell and choose your dataset file.')
dataset_path = next(iter(uploaded))  # Use the first uploaded file

In [None]:
# Load your dataset
import json
from datasets import Dataset
# Build one training text per non-empty JSON Lines record. Each record must be
# a JSON object with a 'prompt' and a 'workflow' field.
formatted_data = []
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # Skip blank lines.
        try:
            item = json.loads(line)
            prompt = item['prompt']
            workflow = item['workflow']
        except (json.JSONDecodeError, KeyError, TypeError) as err:
            # Point at the offending line so a bad record is easy to locate.
            raise ValueError(
                f'{dataset_path}:{line_number}: expected a JSON object with "prompt" and "workflow" fields'
            ) from err
        # A workflow that is already a string is used verbatim. Any other JSON
        # value (object, list, ...) is serialized with json.dumps so the
        # training text always contains JSON rather than a Python repr.
        workflow_str = workflow if isinstance(workflow, str) else json.dumps(workflow)
        formatted_data.append({
            'text': f'''<|system|>
You are an n8n workflow generator. Convert natural language descriptions into valid n8n workflow JSON.
<|user|>
{prompt}
<|assistant|>
{workflow_str}'''
        })
if not formatted_data:
    # Stop here rather than failing later in training with an empty dataset.
    raise ValueError(f'No examples found in {dataset_path}')
train_dataset = Dataset.from_list(formatted_data)
print(f'Loaded {len(train_dataset)} examples')

In [None]:
# Load model and tokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
# 4-bit NF4 quantization with double quantization. The compute dtype is
# float16 so that it matches the fp16=True mixed-precision setting in the
# training arguments; bfloat16 is not natively supported on the T4 GPUs that
# Colab commonly assigns.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally; confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
# The generation cache is not needed while training and does not combine with
# gradient checkpointing, which the training arguments enable.
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
print('Model loaded!')

In [None]:
# Configure LoRA adapters
from peft import LoraConfig, get_peft_model
# LoRA adapters are attached to the four attention projection layers only.
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
)
model = get_peft_model(model, lora_config)
# Report how many parameters remain trainable after wrapping the model.
model.print_trainable_parameters()

In [None]:
# Training arguments
from transformers import TrainingArguments
# Hyperparameters for the fine-tuning run, collected in one mapping and
# expanded into TrainingArguments below.
training_config = {
    # Checkpoints are written here every 25 steps; only the 3 newest are kept.
    'output_dir': '/content/n8n-workflow-generator',
    'save_strategy': 'steps',
    'save_steps': 25,
    'save_total_limit': 3,
    # One sequence per device, accumulated over 8 steps per optimizer update.
    'num_train_epochs': 3,
    'per_device_train_batch_size': 1,
    'gradient_accumulation_steps': 8,
    'gradient_checkpointing': True,
    'fp16': True,
    # Optimizer and learning-rate schedule.
    'optim': 'paged_adamw_8bit',
    'learning_rate': 2e-4,
    'lr_scheduler_type': 'cosine',
    'warmup_steps': 100,
    'max_grad_norm': 0.3,
    # Logging: every 5 steps (plus the first step), no external reporters.
    'logging_steps': 5,
    'logging_first_step': True,
    'report_to': 'none',
    'disable_tqdm': False,
}
training_args = TrainingArguments(**training_config)
print('Training arguments configured!')

In [None]:
# Training cell with VS Code-friendly progress bar and log file
from transformers import Trainer, DataCollatorForLanguageModeling, TrainerCallback
import time
import os
class TextProgressCallback(TrainerCallback):
    """Print plain-text training progress and mirror it to a log file.

    Each Trainer log event produces one status line (step, percentage, loss,
    elapsed time, ETA) and a text progress bar. The status line is also
    appended to ``log_path``.

    Args:
        total_steps: Estimated number of optimizer steps for the whole run.
            Replaced by ``state.max_steps`` at train start when the Trainer
            provides a positive value.
        epochs: Number of training epochs (stored, not otherwise used).
        log_path: File that receives one line per progress message. An
            existing file at this path is deleted on construction.
    """

    def __init__(self, total_steps, epochs, log_path):
        self.total_steps = total_steps
        self.epochs = epochs
        self.start_time = None
        self.start_step = 0
        self.last_logged_step = -1
        self.log_path = log_path
        # Start every run with a fresh log file.
        if os.path.exists(log_path):
            os.remove(log_path)

    def _progress_bar(self, current, total, width=40):
        """Return a ``width``-character bar filled in proportion to current/total."""
        filled = int(width * current / max(1, total))
        # Clamp so an underestimated total cannot make the bar overflow.
        filled = max(0, min(width, filled))
        return '█' * filled + '░' * (width - filled)

    def _log(self, msg):
        """Append ``msg`` as one line to the log file."""
        with open(self.log_path, 'a', encoding='utf-8') as f:
            f.write(msg + '\n')

    def on_train_begin(self, args, state, control, **kwargs):
        """Record the start time and the step training starts from."""
        self.start_time = time.time()
        # When resuming from a checkpoint the run does not start at step 0;
        # remember the starting step so the ETA uses this session's speed.
        self.start_step = getattr(state, 'global_step', 0) or 0
        # Prefer the Trainer's own step count over the caller's estimate.
        max_steps = getattr(state, 'max_steps', 0) or 0
        if max_steps > 0:
            self.total_steps = max_steps
        self._log('TRAINING STARTED')

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print and log a progress report, at most once per global step."""
        if not logs:
            return
        current_step = state.global_step
        if current_step == self.last_logged_step:
            return
        self.last_logged_step = current_step
        elapsed = time.time() - self.start_time if self.start_time else 0
        total = self.total_steps if self.total_steps else max(1, current_step)
        progress_pct = (current_step / total) * 100
        # ETA = average seconds per step in this session * remaining steps.
        steps_done = current_step - self.start_step
        if steps_done > 0:
            eta_minutes = (elapsed / steps_done) * max(0, total - current_step) / 60
        else:
            eta_minutes = 0.0
        bar = self._progress_bar(current_step, total)
        # Not every log entry carries a loss value, so format it defensively.
        loss = logs.get('loss')
        loss_text = f'{loss:.4f}' if isinstance(loss, (int, float)) else 'n/a'
        msg = f'Step {current_step}/{total} ({progress_pct:.1f}%) | Loss: {loss_text} | Elapsed: {elapsed/60:.1f} min | ETA: {eta_minutes:.1f} min'
        separator = '=' * 80
        print(f'\n{separator}')
        print(f'📊 {msg}')
        print(f'[{bar}]')
        print(f'{separator}\n')
        self._log(msg)

    def on_save(self, args, state, control, **kwargs):
        """Report that a checkpoint was written at the current step."""
        elapsed = time.time() - self.start_time if self.start_time else 0
        ckpt = f'checkpoint-{state.global_step}'
        msg = f'Checkpoint saved: {ckpt} | Elapsed: {elapsed/60:.1f} min'
        print(msg)
        self._log(msg)

    def on_train_end(self, args, state, control, **kwargs):
        """Report the total wall-clock training time."""
        total_time = time.time() - self.start_time if self.start_time else 0
        msg = f'TRAINING COMPLETE! Total Time: {total_time/60:.1f} min'
        print(msg)
        self._log(msg)
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to 2048 tokens.

    No padding is applied here: the data collator pads each batch to its
    longest sequence, which avoids storing and processing 2048 tokens for
    every example regardless of its real length.

    Args:
        examples: Batch mapping with a 'text' entry holding the raw strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=2048)
print('Tokenizing dataset...')
tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
# mlm=False selects causal language modelling (labels are copied from the inputs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Resume from the newest checkpoint in the output directory, if there is one.
checkpoint_dir = training_args.output_dir
checkpoint_prefix = 'checkpoint-'
resume_from_checkpoint = None
if os.path.exists(checkpoint_dir):
    checkpoints = [
        d for d in os.listdir(checkpoint_dir)
        if d.startswith(checkpoint_prefix) and d[len(checkpoint_prefix):].isdigit()
    ]
    if checkpoints:
        # Compare step numbers numerically: 'checkpoint-100' is newer than
        # 'checkpoint-25' even though it sorts first as a string.
        latest_checkpoint = max(checkpoints, key=lambda d: int(d[len(checkpoint_prefix):]))
        resume_from_checkpoint = os.path.join(checkpoint_dir, latest_checkpoint)
        print(f'Resuming from checkpoint: {resume_from_checkpoint}')

# Estimated number of optimizer steps, used by the progress display.
total_steps = int(
    len(train_dataset) * training_args.num_train_epochs
    // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[TextProgressCallback(total_steps, training_args.num_train_epochs, '/content/training_progress.log')],
)
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

In [None]:
# Save final model
# Write the trained model and the tokenizer side by side in one directory.
output_dir = '/content/n8n-workflow-generator-final'
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f'Model saved to {output_dir}')