In [1]:
import torch
from diffusers import FluxPipeline

class LatentHolder:
    def __init__(self):
        self.latents = None

latent_holder = LatentHolder()

class StopDiffusionException(Exception):
    pass

def stop_at_step_callback(pipe, step_index, timestep, callback_kwargs):
    if step_index == stop_at_step - 1:
        print(f"\nStopping at step {step_index + 1} and capturing latents...")
        latent_holder.latents = callback_kwargs['latents']
        raise StopDiffusionException()
    return callback_kwargs

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", 
    torch_dtype=torch.float16,
    device_map="cuda"
)

prompt = "A cinematic shot of a black hole."

total_steps = 50
stop_at_step = total_steps // 4

print(f"Generation with {total_steps} total steps, will capture latents at step {stop_at_step}.")

try:
    pipe(
        prompt=prompt, 
        num_inference_steps=total_steps,
        callback_on_step_end=stop_at_step_callback,
    )
except StopDiffusionException:
    if latent_holder.latents is not None:
        print("Latents captured successfully.")
        print(f"Latent tensor shape: {latent_holder.latents.shape}")
        print(f"Latent tensor dtype: {latent_holder.latents.dtype}")
    else:
        print("Diffusion was stopped, but no latents were captured.")

  from .autonotebook import tqdm as notebook_tqdm
Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 3/3 [04:23<00:00, 87.73s/it]s/it]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]9, 87.27s/it] 
Loading pipeline components...:  43%|████▎     | 3/7 [04:33<06:05, 91.27s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 23.52 GiB of which 22.38 MiB is free. Process 954 has 552.00 MiB memory in use. Including non-PyTorch memory, this process has 22.94 GiB memory in use. Of the allocated memory 22.55 GiB is allocated by PyTorch, and 10.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

ckpt = "flux1-krea-dev-Q4_K_M.gguf"

class LatentHolder:
    def __init__(self):
        self.latents = None

latent_holder = LatentHolder()

class StopDiffusionException(Exception):
    pass

def stop_at_step_callback(pipe, step_index, timestep, callback_kwargs):
    if step_index == stop_at_step - 1:
        print(f"\nStopping at step {step_index + 1} and capturing latents...")
        latent_holder.latents = callback_kwargs['latents']
        raise StopDiffusionException()
    return callback_kwargs

transformer = FluxTransformer2DModel.from_single_file(
    ckpt,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16)
).to('cuda')

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Krea-dev",
    transformer=transformer,
    torch_dtype=torch.bfloat16
).to('cuda')

#if torch.cuda.is_available():
#    pipe.enable_sequential_cpu_offload()

prompt = "A cinematic shot of a black hole."

total_steps = 50
stop_at_step = total_steps // 4

print(f"Generation with {total_steps} total steps, will capture latents at step {stop_at_step}.")

try:
    pipe(
        prompt=prompt, 
        num_inference_steps=total_steps,
        callback_on_step_end=stop_at_step_callback,
    )
except StopDiffusionException:
    if latent_holder.latents is not None:
        print("Latents captured successfully.")
        print(f"Latent tensor shape: {latent_holder.latents.shape}")
        print(f"Latent tensor dtype: {latent_holder.latents.dtype}")
    else:
        print("Diffusion was stopped, but no latents were captured.")

  from .autonotebook import tqdm as notebook_tqdm
Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
Loading pipeline components...:  29%|██▊       | 2/7 [00:03<00:08,  1.74s/it]You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading pipeline components...: 100%|██████████| 7/7 [00:04<00:00,  1.58it/s]


Generation with 50 total steps, will capture latents at step 12.


  0%|          | 0/50 [00:04<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x128 and 64x3072)

In [1]:
from diffusers import FluxPipeline, AutoencoderKL
from diffusers.image_processor import VaeImageProcessor
from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
import torch
import gc

ckpt_id = "black-forest-labs/FLUX.1-dev"

class LatentHolder:
    def __init__(self):
        self.latents = None

latent_holder = LatentHolder()

class StopDiffusionException(Exception):
    pass

def stop_at_step_callback(pipe, step_index, timestep, callback_kwargs):
    if step_index == stop_at_step - 1:
        print(f"\nStopping at step {step_index + 1} and capturing latents...")
        latent_holder.latents = callback_kwargs['latents']
        raise StopDiffusionException()
    return callback_kwargs

text_encoder = CLIPTextModel.from_pretrained(
    ckpt_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
)
text_encoder_2 = T5EncoderModel.from_pretrained(
    ckpt_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
)
tokenizer = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer")
tokenizer_2 = T5TokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer_2")

pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=text_encoder,
    text_encoder_2=text_encoder_2,
    tokenizer=tokenizer,
    tokenizer_2=tokenizer_2,
    transformer=None,
    vae=None,
).to("cuda")
prompt = "A cinematic shot of a black hole."

total_steps = 50
stop_at_step = total_steps // 4

with torch.no_grad():
    print("Encoding prompts.")
    prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(
        prompt=prompt, prompt_2=None, max_sequence_length=256
    )

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 162.99it/s]
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading pipeline components...: 100%|██████████| 5/5 [00:00<00:00, 4123.38it/s]


Encoding prompts.


In [2]:
def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_peak_memory_stats()

del text_encoder
del text_encoder_2
del tokenizer
del tokenizer_2
del pipeline

flush()

pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
    tokenizer_2=None,
    vae=None,
    torch_dtype=torch.bfloat16,
).to("cuda")

Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.22s/it]
Loading pipeline components...: 100%|██████████| 2/2 [00:12<00:00,  6.36s/it]


In [3]:
class LatentHolder:
    def __init__(self):
        self.latents = None

latent_holder = LatentHolder()

class StopDiffusionException(Exception):
    pass

def stop_at_step_callback(pipe, step_index, timestep, callback_kwargs):
    if step_index == stop_at_step - 1:
        print(f"\nStopping at step {step_index + 1} and capturing latents...")
        latent_holder.latents = callback_kwargs['latents']
        raise StopDiffusionException()
    return callback_kwargs

print("Running denoising.")
height, width = 768, 1360
# No need to wrap it up under `torch.no_grad()` as pipeline call method
# is already wrapped under that.
latents = pipeline(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    num_inference_steps=50,
    guidance_scale=0.0,
    height=height,
    width=width,
    output_type="latent",
    callback_on_step_end=stop_at_step_callback
).images
print(f"{latents.shape=}")

del pipeline.transformer
del pipeline

flush()

vae = AutoencoderKL.from_pretrained(ckpt_id, revision="refs/pr/1", subfolder="vae", torch_dtype=torch.bfloat16).to(
    "cuda"
)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

with torch.no_grad():
    print("Running decoding.")
    latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor

    image = vae.decode(latents, return_dict=False)[0]
    image = image_processor.postprocess(image, output_type="pil")
    image[0].save("image.png")

Running denoising.


  0%|          | 0/50 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 52.00 MiB. GPU 0 has a total capacity of 23.52 GiB of which 22.38 MiB is free. Process 954 has 552.00 MiB memory in use. Including non-PyTorch memory, this process has 22.94 GiB memory in use. Of the allocated memory 22.39 GiB is allocated by PyTorch, and 104.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
from diffusers import FluxPipeline, AutoencoderKL
from diffusers.image_processor import VaeImageProcessor
from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
import torch
import gc


def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_peak_memory_stats()


def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024


flush()

ckpt_id = "black-forest-labs/FLUX.1-dev"
prompt = "A cinematic shot of a black hole."

text_encoder = CLIPTextModel.from_pretrained(
    ckpt_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
)
text_encoder_2 = T5EncoderModel.from_pretrained(
    ckpt_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
)
tokenizer = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer")
tokenizer_2 = T5TokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer_2")

pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=text_encoder,
    text_encoder_2=text_encoder_2,
    tokenizer=tokenizer,
    tokenizer_2=tokenizer_2,
    transformer=None,
    vae=None,
).to("cuda")

with torch.no_grad():
    print("Encoding prompts.")
    prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(
        prompt=prompt, prompt_2=None, max_sequence_length=256
    )

del text_encoder
del text_encoder_2
del tokenizer
del tokenizer_2
del pipeline

flush()

pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
    tokenizer_2=None,
    vae=None,
    torch_dtype=torch.bfloat16,
).to("cuda")

print("Running denoising.")
height, width = 768, 1360
# No need to wrap it up under `torch.no_grad()` as pipeline call method
# is already wrapped under that.
latents = pipeline(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    num_inference_steps=50,
    guidance_scale=0.0,
    height=height,
    width=width,
    output_type="latent",
    callback_on_step_end=12
).images
print(f"{latents.shape=}")

del pipeline.transformer
del pipeline

flush()

vae = AutoencoderKL.from_pretrained(ckpt_id, revision="refs/pr/1", subfolder="vae", torch_dtype=torch.bfloat16).to(
    "cuda"
)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

with torch.no_grad():
    print("Running decoding.")
    latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor

    image = vae.decode(latents, return_dict=False)[0]
    image = image_processor.postprocess(image, output_type="pil")
    image[0].save("image.png")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.69s/it]
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading pipeline components...: 100%|██████████| 5/5 [00:00<00:00, 497.21it/s]


Encoding prompts.


Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.24s/it]
Loading pipeline components...: 100%|██████████| 2/2 [00:12<00:00,  6.39s/it]


Running denoising.


  0%|          | 0/50 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 52.00 MiB. GPU 0 has a total capacity of 23.52 GiB of which 22.38 MiB is free. Process 954 has 552.00 MiB memory in use. Including non-PyTorch memory, this process has 22.94 GiB memory in use. Of the allocated memory 22.39 GiB is allocated by PyTorch, and 104.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
import kagglehub
path = kagglehub.dataset_download("mohankrishnathalla/medical-insurance-cost-prediction")

import pandas as pd
data = pd.read_csv(path + "/medical_insurance.csv")
data.loc[1]
data.dropna()

results = []
import numpy as np

X = data.drop('risk_score', axis=1)
y = data['risk_score']
y.dropna(inplace=True)
y.describe()



count    100000.000000
mean          0.519849
std           0.250669
min           0.000000
25%           0.329700
50%           0.505500
75%           0.703300
max           1.000000
Name: risk_score, dtype: float64