In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Read the local "key.env" file into the process environment, then hand the
# Hugging Face token found there to `login`.
load_dotenv(dotenv_path="key.env")

# `hf_api_key` is None when HUGGINGFACE_API_KEY is not defined.
hf_api_key = os.environ.get("HUGGINGFACE_API_KEY")
login(token=hf_api_key)

## Base Model

In [None]:
from diffusers import StableDiffusion3Pipeline
import torch

# Load the base Stable Diffusion 3.5 Medium pipeline with bfloat16 weights.
model_id = "stabilityai/stable-diffusion-3.5-medium"
pipe = StableDiffusion3Pipeline.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
)

# Choose the execution device rather than assuming CUDA is present, so the
# cell also runs on Apple-silicon (MPS) and CPU-only machines.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Kept as the cell's last expression so the pipeline configuration is displayed.
pipe.to(device)

Fetching 26 files: 100%|██████████| 26/26 [00:17<00:00,  1.49it/s]
Loading pipeline components...:  22%|██▏       | 2/9 [00:03<00:11,  1.69s/it]You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading pipeline components...:  56%|█████▌    | 5/9 [00:06<00:05,  1.33s/it]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
Loading checkpoint shards:  50%|█████     | 1/2 [00:04<00:04,  4.99s/it][A
Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.89s/it][A
Loading pipeline components...: 100%|██████████| 9/9 [00:29<00:00,  3.24s/it]


StableDiffusion3Pipeline {
  "_class_name": "StableDiffusion3Pipeline",
  "_diffusers_version": "0.35.0.dev0",
  "_name_or_path": "stabilityai/stable-diffusion-3.5-medium",
  "feature_extractor": [
    null,
    null
  ],
  "image_encoder": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "FlowMatchEulerDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "text_encoder_2": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "text_encoder_3": [
    "transformers",
    "T5EncoderModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "tokenizer_2": [
    "transformers",
    "CLIPTokenizer"
  ],
  "tokenizer_3": [
    "transformers",
    "T5TokenizerFast"
  ],
  "transformer": [
    "diffusers",
    "SD3Transformer2DModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

## Fine-tuned Model

In [None]:
import torch
from diffusers import DiffusionPipeline

model_id = 'stabilityai/stable-diffusion-3.5-medium'
adapter_id = 'mingyu-oo/stable-diffusion-3.5-medium-HC'

# Load the base weights directly in bf16, then attach the LoRA adapter.
pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipeline.load_lora_weights(adapter_id)

# Text conditioning: what the image should show, and what it should avoid.
prompt = "designed by Hyundai, front 4 by 3 view, long sleek silhouette, aggressive LED headlamps, sculpted hood, parametric grille pattern, dynamic side character lines, floating roof, frameless windows, flush door handles, wide stance, concept lighting, premium metallic blue finish, high-tech minimalism"
negative_prompt = (
    "cartoon, illustration, sketch, anime, cgi, 3d render, "
    "side view, rear view, top view, back view, cropped, truncated, incomplete, out of frame, "
    "deformed, extra wheels, extra doors, text, watermark, logo, "
    "outdoor, street, landscape, colored background, "
    "shadow, reflection, frame, border, blurry, low quality"
)

## Optional: quantise the transformer to reduce VRAM usage.
## Note: the model was not quantised during training, so quantising it at
## inference time is not required.
#from optimum.quanto import quantize, freeze, qint8
#quantize(pipeline.transformer, weights=qint8)
#freeze(pipeline.transformer)

# Resolve the execution device once: CUDA first, then MPS, otherwise CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# The pipeline is already in its target precision, so only the device changes.
pipeline.to(device)

# Random generator created on the selected device and seeded with 42.
generator = torch.Generator(device=device).manual_seed(42)

result = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=1024,
    height=1024,
    num_inference_steps=20,
    guidance_scale=7.5,
    generator=generator,
)

# Keep the first image from the result and write it to disk as a PNG.
model_output = result.images[0]
model_output.save("output.png", format="PNG")