<a href="https://colab.research.google.com/github/Reubencfernandes/FineTuning-Flux-Dev/blob/main/FLux.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/ostris/ai-toolkit

Cloning into 'ai-toolkit'...
remote: Enumerating objects: 4015, done.[K
remote: Counting objects: 100% (2004/2004), done.[K
remote: Compressing objects: 100% (259/259), done.[K
remote: Total 4015 (delta 1876), reused 1776 (delta 1741), pack-reused 2011 (from 1)[K
Receiving objects: 100% (4015/4015), 29.70 MiB | 43.83 MiB/s, done.
Resolving deltas: 100% (3055/3055), done.


In [None]:
# Create the folder that the training config points at (datasets.folder_path).
# Place the training images and their matching .txt caption files here before
# starting the training job.
!mkdir -p /content/dataset

In [None]:
# Pull the toolkit's git submodules, then install its Python requirements.
# The relative `cd ai-toolkit` relies on the notebook's working directory being the
# directory the repository was cloned into.
!cd ai-toolkit && git submodule update --init --recursive && pip install -r requirements.txt

Submodule 'repositories/batch_annotator' (https://github.com/ostris/batch-annotator) registered for path 'repositories/batch_annotator'
Submodule 'repositories/ipadapter' (https://github.com/tencent-ailab/IP-Adapter.git) registered for path 'repositories/ipadapter'
Submodule 'repositories/leco' (https://github.com/p1atdev/LECO) registered for path 'repositories/leco'
Submodule 'repositories/sd-scripts' (https://github.com/kohya-ss/sd-scripts.git) registered for path 'repositories/sd-scripts'
Cloning into '/content/ai-toolkit/repositories/batch_annotator'...
Cloning into '/content/ai-toolkit/repositories/ipadapter'...
Cloning into '/content/ai-toolkit/repositories/leco'...
Cloning into '/content/ai-toolkit/repositories/sd-scripts'...
Submodule path 'repositories/batch_annotator': checked out '420e142f6ad3cc14b3ea0500affc2c6c7e7544bf'
Submodule 'repositories/controlnet' (https://github.com/lllyasviel/ControlNet-v1-1-nightly.git) registered for path 'repositories/batch_annotator/repositor

In [None]:
import getpass
import os

# Prompt for the Hugging Face access token without echoing it, and drop the stray
# whitespace/newlines that often come along when a token is pasted.
hf_token = getpass.getpass('Enter your HF access token and press enter: ').strip()

# Stop here instead of exporting an empty token: an empty HF_TOKEN would only show up
# later as a harder-to-diagnose authentication failure when the model is downloaded.
if not hf_token:
    raise ValueError('No Hugging Face token was entered; re-run this cell and paste a valid token.')

# Export the token through the process environment for the cells that run later
# in this kernel.
os.environ['HF_TOKEN'] = hf_token

print("HF_TOKEN environment variable has been set.")

Enter your HF access token and press enter: ··········
HF_TOKEN environment variable has been set.


In [None]:
import os
import sys

# Ask huggingface_hub to use the hf_transfer download backend. The flag is set
# before the toolkit import below because huggingface_hub reads its environment
# flags when it is first imported; setting it afterwards may be too late.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Make the cloned ai-toolkit checkout importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if '/content/ai-toolkit' not in sys.path:
    sys.path.append('/content/ai-toolkit')

from collections import OrderedDict

from PIL import Image
from toolkit.job import run_job

In [None]:
from collections import OrderedDict

# Training job description handed to run_job(): a single 'sd_trainer' process that
# trains a LoRA on top of FLUX.1-dev using the images in /content/dataset.
job_to_run = OrderedDict([
    ('job', 'extension'),
    ('config', OrderedDict([
        # this name will be the folder and filename name
        ('name', 'my_first_flux_lora_v1'),
        ('process', [
            OrderedDict([
                ('type', 'sd_trainer'),
                # root folder to save training sessions/samples/weights
                ('training_folder', '/content/output'),
                # log performance stats in the terminal every N steps
                ('performance_log_every', 1000),
                ('device', 'cuda:0'),
                # if a trigger word is specified, it will be added to captions of training data if it does not already exist
                # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
                # NOTE(review): the sample prompts below use [trigger] while trigger_word is
                # commented out; presumably the placeholder then stays in the prompts as
                # literal text -- confirm, and set a trigger word if that is not intended.
                # ('trigger_word', 'image'),
                ('network', OrderedDict([
                    ('type', 'lora'),
                    ('linear', 16),
                    ('linear_alpha', 16)
                ])),
                ('save', OrderedDict([
                    ('dtype', 'float16'),  # precision to save
                    ('save_every', 250),  # save every this many steps
                    ('max_step_saves_to_keep', 4)  # how many intermittent saves to keep
                ])),
                ('datasets', [
                    OrderedDict([
                        ('folder_path', '/content/dataset'),
                        ('caption_ext', 'txt'),
                        ('caption_dropout_rate', 0.05),  # will drop out the caption 5% of time
                        ('shuffle_tokens', False),  # shuffle caption order, split by commas
                        ('cache_latents_to_disk', True),  # leave this true unless you know what you're doing
                        ('resolution', [512, 768, 1024])  # flux enjoys multiple resolutions
                    ])
                ]),
                ('train', OrderedDict([
                    ('batch_size', 1),
                    ('steps', 2000),  # total number of steps to train 500 - 4000 is a good range
                    ('gradient_accumulation_steps', 1),
                    ('train_unet', True),
                    ('train_text_encoder', False),  # probably won't work with flux
                    ('content_or_style', 'balanced'),  # content, style, balanced
                    ('gradient_checkpointing', True),  # need this on unless you have a ton of vram
                    ('noise_scheduler', 'flowmatch'),  # for training only
                    ('optimizer', 'adamw8bit'),
                    ('lr', 1e-4),

                    # uncomment this to skip the pre training sample
                    # ('skip_first_sample', True),

                    # uncomment to completely disable sampling
                    # ('disable_sampling', True),

                    # uncomment to use new bell curved weighting. Experimental but may produce better results
                    # ('linear_timesteps', True),

                    # ema will smooth out learning, but could slow it down. Recommended to leave on.
                    ('ema_config', OrderedDict([
                        ('use_ema', True),
                        ('ema_decay', 0.99)
                    ])),

                    # will probably need this if gpu supports it for flux, other dtypes may not work correctly
                    ('dtype', 'bf16')
                ])),
                ('model', OrderedDict([
                    # huggingface model name or path
                    ('name_or_path', 'black-forest-labs/FLUX.1-dev'),
                    ('is_flux', True),
                    ('quantize', True),  # run 8bit mixed precision
                    #('low_vram', True),  # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
                ])),
                ('sample', OrderedDict([
                    ('sampler', 'flowmatch'),  # must match train.noise_scheduler
                    ('sample_every', 250),  # sample every this many steps
                    ('width', 1024),
                    ('height', 1024),
                    ('prompts', [
                        # you can add [trigger] to the prompts here and it will be replaced with the trigger word
                        # keep the placeholder spelled exactly as [trigger] (lower case) so every occurrence uses the documented token
                        #'[trigger] holding a sign that says \'I LOVE PROMPTS!\'',
                        '[trigger] is standing in a dimly lit street in Japan, wearing a sharp suit and tie, exuding a mysterious and confident aura. The background features glowing storefront signs with Japanese text, a parked car, and a quiet urban atmosphere, perfectly blending modernity with a cinematic noir vibe',
                        'A professional portrait of [trigger], wearing a navy blue suit and red tie. The background features the Indian flag and the Vatican flag, symbolizing an official or governmental setting. The lighting is soft, highlighting [trigger] face and creating a polished, formal appearance. [trigger] has a confident and calm demeanor.',
                    ]),
                    ('neg', ''),  # not used on flux
                    ('seed', 42),
                    ('walk_seed', True),
                    ('guidance_scale', 4),
                    ('sample_steps', 20)
                ]))
            ])
        ])
    ])),
    # you can add any additional meta info here. [name] is replaced with config name at top
    ('meta', OrderedDict([
        ('name', '[name]'),
        ('version', '1.0')
    ]))
])

In [None]:
# Start the training job described by job_to_run. Checkpoints and the optimizer
# state are written under the configured training_folder, in a sub-folder named
# after the config name (/content/output/my_first_flux_lora_v1).
run_job(job_to_run)

{
    "type": "sd_trainer",
    "training_folder": "/content/output",
    "performance_log_every": 1000,
    "device": "cuda:0",
    "network": {
        "type": "lora",
        "linear": 16,
        "linear_alpha": 16
    },
    "save": {
        "dtype": "float16",
        "save_every": 250,
        "max_step_saves_to_keep": 4
    },
    "datasets": [
        {
            "folder_path": "/content/dataset",
            "caption_ext": "txt",
            "caption_dropout_rate": 0.05,
            "shuffle_tokens": false,
            "cache_latents_to_disk": true,
            "resolution": [
                512,
                768,
                1024
            ]
        }
    ],
    "train": {
        "batch_size": 1,
        "steps": 2000,
        "gradient_accumulation_steps": 1,
        "train_unet": true,
        "train_text_encoder": false,
        "content_or_style": "balanced",
        "gradient_checkpointing": true,
        "noise_scheduler": "flowmatch",
        "optimizer"

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Quantizing T5
Loading clip
making pipe
preparing
create LoRA network. base dim (rank): 16, alpha: 16
neuron dropout: p=None, rank dropout: p=None, module dropout: p=None
create LoRA for Text Encoder: 0 modules.
create LoRA for U-Net: 494 modules.
enable LoRA for U-Net
Dataset: /content/dataset
  -  Preprocessing image dimensions


100%|██████████| 13/13 [00:00<00:00, 71.60it/s]

  -  Found 13 images
Bucket sizes for /content/dataset:
448x576: 7 files
576x448: 5 files
384x576: 1 files
3 buckets made
Caching latents for /content/dataset
 - Saving latents to disk



Caching latents to disk: 100%|██████████| 13/13 [00:01<00:00,  6.88it/s]


Dataset: /content/dataset
  -  Preprocessing image dimensions


100%|██████████| 13/13 [00:00<00:00, 21862.85it/s]

  -  Found 13 images
Bucket sizes for /content/dataset:
640x832: 7 files
832x640: 4 files
704x512: 1 files
576x832: 1 files
4 buckets made
Caching latents for /content/dataset
 - Saving latents to disk



Caching latents to disk: 100%|██████████| 13/13 [00:01<00:00, 12.85it/s]


Dataset: /content/dataset
  -  Preprocessing image dimensions


100%|██████████| 13/13 [00:00<00:00, 24931.85it/s]

  -  Found 13 images
Bucket sizes for /content/dataset:
640x832: 5 files
1152x832: 4 files
832x1152: 2 files
704x512: 1 files
576x832: 1 files
5 buckets made
Caching latents for /content/dataset
 - Saving latents to disk



Caching latents to disk: 100%|██████████| 13/13 [00:00<00:00, 15.72it/s]


Generating baseline samples before training


my_first_flux_lora_v1:  12%|█▏        | 249/2000 [08:49<59:02,  2.02s/it, lr: 1.0e-04 loss: 4.822e-01]
Generating Images:   0%|          | 0/4 [00:00<?, ?it/s][A
Generating Images:  25%|██▌       | 1/4 [00:18<00:56, 18.80s/it][A
Generating Images:  50%|█████     | 2/4 [00:37<00:37, 18.76s/it][A
Generating Images:  75%|███████▌  | 3/4 [00:56<00:18, 18.73s/it][A
Generating Images: 100%|██████████| 4/4 [01:14<00:00, 18.73s/it][A
my_first_flux_lora_v1:  12%|█▏        | 249/2000 [08:49<59:02,  2.02s/it, lr: 1.0e-04 loss: 4.822e-01]

Saving at step 250


my_first_flux_lora_v1:  12%|█▏        | 249/2000 [08:51<59:02,  2.02s/it, lr: 1.0e-04 loss: 4.822e-01]

Saved to /content/output/my_first_flux_lora_v1/optimizer.pt


my_first_flux_lora_v1:  25%|██▍       | 499/2000 [17:36<47:02,  1.88s/it, lr: 1.0e-04 loss: 4.260e-01]
Generating Images:   0%|          | 0/4 [00:00<?, ?it/s][A
Generating Images:  25%|██▌       | 1/4 [00:18<00:56, 18.78s/it][A
Generating Images:  50%|█████     | 2/4 [00:37<00:37, 18.75s/it][A
Generating Images:  75%|███████▌  | 3/4 [00:56<00:18, 18.73s/it][A
Generating Images: 100%|██████████| 4/4 [01:14<00:00, 18.72s/it][A
my_first_flux_lora_v1:  25%|██▍       | 499/2000 [17:36<47:02,  1.88s/it, lr: 1.0e-04 loss: 4.260e-01]

Saving at step 500


my_first_flux_lora_v1:  25%|██▍       | 499/2000 [17:38<47:02,  1.88s/it, lr: 1.0e-04 loss: 4.260e-01]

Saved to /content/output/my_first_flux_lora_v1/optimizer.pt


my_first_flux_lora_v1:  37%|███▋      | 749/2000 [26:21<44:01,  2.11s/it, lr: 1.0e-04 loss: 3.849e-01]
Generating Images:   0%|          | 0/4 [00:00<?, ?it/s][A
Generating Images:  25%|██▌       | 1/4 [00:18<00:56, 18.77s/it][A
Generating Images:  50%|█████     | 2/4 [00:37<00:37, 18.73s/it][A
Generating Images:  75%|███████▌  | 3/4 [00:56<00:18, 18.72s/it][A
Generating Images: 100%|██████████| 4/4 [01:14<00:00, 18.70s/it][A
my_first_flux_lora_v1:  37%|███▋      | 749/2000 [26:21<44:01,  2.11s/it, lr: 1.0e-04 loss: 3.849e-01]

Saving at step 750


my_first_flux_lora_v1:  37%|███▋      | 749/2000 [26:24<44:01,  2.11s/it, lr: 1.0e-04 loss: 3.849e-01]

Saved to /content/output/my_first_flux_lora_v1/optimizer.pt


my_first_flux_lora_v1:  50%|████▉     | 999/2000 [35:07<34:19,  2.06s/it, lr: 1.0e-04 loss: 5.727e-01]
Generating Images:   0%|          | 0/4 [00:00<?, ?it/s][A
Generating Images:  25%|██▌       | 1/4 [00:18<00:56, 18.81s/it][A
Generating Images:  50%|█████     | 2/4 [00:37<00:37, 18.75s/it][A
Generating Images:  75%|███████▌  | 3/4 [00:56<00:18, 18.74s/it][A
Generating Images: 100%|██████████| 4/4 [01:14<00:00, 18.73s/it][A
my_first_flux_lora_v1:  50%|████▉     | 999/2000 [35:07<34:19,  2.06s/it, lr: 1.0e-04 loss: 5.727e-01]

Saving at step 1000


my_first_flux_lora_v1:  50%|████▉     | 999/2000 [35:10<34:19,  2.06s/it, lr: 1.0e-04 loss: 5.727e-01]

Saved to /content/output/my_first_flux_lora_v1/optimizer.pt

Timer 'my_first_flux_lora_v1 Timer':
 - 2.1755s avg - train_loop, num = 10
 - 1.2697s avg - backward, num = 10
 - 0.6455s avg - predict_unet, num = 10
 - 0.2669s avg - reset_batch, num = 10
 - 0.1079s avg - optimizer_step, num = 10
 - 0.0682s avg - encode_prompt, num = 10
 - 0.0616s avg - calculate_loss, num = 10
 - 0.0023s avg - get_batch, num = 10
 - 0.0016s avg - preprocess_batch, num = 10
 - 0.0011s avg - prepare_noise, num = 10
 - 0.0005s avg - batch_cleanup, num = 10
 - 0.0003s avg - prepare_latents, num = 10
 - 0.0000s avg - scheduler_step, num = 10
 - 0.0000s avg - grad_setup, num = 10
 - 0.0000s avg - prepare_prompt, num = 10
 - 0.0000s avg - log_to_tensorboard, num = 10



my_first_flux_lora_v1:  62%|██████▏   | 1249/2000 [43:54<26:44,  2.14s/it, lr: 1.0e-04 loss: 4.723e-01]
Generating Images:   0%|          | 0/4 [00:00<?, ?it/s][A
Generating Images:  25%|██▌       | 1/4 [00:18<00:56, 18.80s/it][A
Generating Images:  50%|█████     | 2/4 [00:37<00:37, 18.75s/it][A
Generating Images:  75%|███████▌  | 3/4 [00:56<00:18, 18.74s/it][A
Generating Images: 100%|██████████| 4/4 [01:14<00:00, 18.73s/it][A
my_first_flux_lora_v1:  62%|██████▏   | 1249/2000 [43:54<26:44,  2.14s/it, lr: 1.0e-04 loss: 4.723e-01]

Saving at step 1250


my_first_flux_lora_v1:  62%|██████▏   | 1249/2000 [43:56<26:44,  2.14s/it, lr: 1.0e-04 loss: 4.723e-01]

Saved to /content/output/my_first_flux_lora_v1/optimizer.pt
Removing old save: /content/output/my_first_flux_lora_v1/my_first_flux_lora_v1_000000250.safetensors


my_first_flux_lora_v1:  75%|███████▍  | 1499/2000 [52:43<17:04,  2.05s/it, lr: 1.0e-04 loss: 4.338e-01]
Generating Images:   0%|          | 0/4 [00:00<?, ?it/s][A
Generating Images:  25%|██▌       | 1/4 [00:18<00:56, 18.78s/it][A
Generating Images:  50%|█████     | 2/4 [00:37<00:37, 18.75s/it][A
Generating Images:  75%|███████▌  | 3/4 [00:56<00:18, 18.73s/it][A
Generating Images: 100%|██████████| 4/4 [01:14<00:00, 18.73s/it][A
my_first_flux_lora_v1:  75%|███████▍  | 1499/2000 [52:43<17:04,  2.05s/it, lr: 1.0e-04 loss: 4.338e-01]

Saving at step 1500


my_first_flux_lora_v1:  75%|███████▍  | 1499/2000 [52:45<17:04,  2.05s/it, lr: 1.0e-04 loss: 4.338e-01]

Saved to /content/output/my_first_flux_lora_v1/optimizer.pt
Removing old save: /content/output/my_first_flux_lora_v1/my_first_flux_lora_v1_000000500.safetensors


my_first_flux_lora_v1:  87%|████████▋ | 1749/2000 [1:01:30<09:15,  2.21s/it, lr: 1.0e-04 loss: 4.383e-01]
Generating Images:   0%|          | 0/4 [00:00<?, ?it/s][A
Generating Images:  25%|██▌       | 1/4 [00:18<00:56, 18.77s/it][A
Generating Images:  50%|█████     | 2/4 [00:37<00:37, 18.74s/it][A
Generating Images:  75%|███████▌  | 3/4 [00:56<00:18, 18.73s/it][A
Generating Images: 100%|██████████| 4/4 [01:14<00:00, 18.72s/it][A
my_first_flux_lora_v1:  87%|████████▋ | 1749/2000 [1:01:30<09:15,  2.21s/it, lr: 1.0e-04 loss: 4.383e-01]

Saving at step 1750


my_first_flux_lora_v1:  87%|████████▋ | 1749/2000 [1:01:32<09:15,  2.21s/it, lr: 1.0e-04 loss: 4.383e-01]

Saved to /content/output/my_first_flux_lora_v1/optimizer.pt
Removing old save: /content/output/my_first_flux_lora_v1/my_first_flux_lora_v1_000000750.safetensors


my_first_flux_lora_v1: 100%|█████████▉| 1999/2000 [1:10:15<00:02,  2.11s/it, lr: 1.0e-04 loss: 5.631e-01]



Saved to /content/output/my_first_flux_lora_v1/optimizer.pt


In [None]:

# Interactive Hugging Face CLI login: prompts for a token and offers to store it
# as a git credential.
# NOTE(review): HF_TOKEN is already exported earlier in this notebook; presumably this
# extra login is only needed for pushing results to the Hub -- confirm or drop.
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineG

In [32]:
# Install required libraries
!pip install transformers safetensors pyyaml

# Import libraries
import yaml
import json
import os
from transformers import AutoModel, AutoTokenizer
from safetensors.torch import load_file
import torch

# Paths
model_dir = "/content/output/my_first_flux_lora_v1"
config_yaml_path = f"{model_dir}/config.yaml"
config_json_path = f"{model_dir}/config.json"
checkpoint_file = f"{model_dir}/my_first_flux_lora_v1_000001750.safetensors"

# Convert config.yaml to config.json
with open(config_yaml_path, "r") as yaml_file:
    config_data = yaml.safe_load(yaml_file)

# Create a Hugging Face-compatible JSON configuration
hf_config = {
    "model_type": "flux",  # Specify the type of your model; adjust as needed
    "name_or_path": config_data["config"]["process"][0]["model"]["name_or_path"],
    "quantize": config_data["config"]["process"][0]["model"]["quantize"],
    "use_ema": config_data["config"]["process"][0]["train"]["ema_config"]["use_ema"],
    "ema_decay": config_data["config"]["process"][0]["train"]["ema_config"]["ema_decay"],
    "dtype": config_data["config"]["process"][0]["train"]["dtype"],
    "training_steps": config_data["config"]["process"][0]["train"]["steps"],
}

# Save as config.json
with open(config_json_path, "w") as json_file:
    json.dump(hf_config, json_file, indent=4)
print(f"Converted config.yaml to {config_json_path}")

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_dir)
print("Tokenizer loaded successfully.")

# Load the model
print("Loading model...")
model = AutoModel.from_pretrained(model_dir, low_cpu_mem_usage=True)
state_dict = load_file(checkpoint_file)  # Load the fine-tuned checkpoint
model.load_state_dict(state_dict, strict=False)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded successfully.")

# Example: Run inference
input_text = "Describe a serene morning by the lake."
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)

print("Inference complete. Outputs:", outputs)


Converted config.yaml to /content/output/my_first_flux_lora_v1/config.json


ValueError: The checkpoint you are trying to load has model type `flux` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.