# 1. How to use pre-compressed (DFloat11) models

In [None]:
# install dfloat11 from pypi
!pip install -U dfloat11[cuda12]

Collecting dfloat11[cuda12]
  Downloading dfloat11-0.5.0-py3-none-any.whl.metadata (11 kB)
Collecting accelerate (from dfloat11[cuda12])
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting dahuffman==0.4.2 (from dfloat11[cuda12])
  Downloading dahuffman-0.4.2-py3-none-any.whl.metadata (5.3 kB)
Collecting huggingface-hub (from dfloat11[cuda12])
  Downloading huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors (from dfloat11[cuda12])
  Downloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting transformers (from dfloat11[cuda12])
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting cupy-cuda12x (from dfloat11[cuda12])
  Downloading cupy_cuda12x-13.6.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub->dfloat11[cuda12])
  Downloading hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64

In [None]:
import torch
from dfloat11 import DFloat11Model
from transformers import AutoTokenizer

# Hugging Face name or local path of the DFloat11 model. Browse available models at
# https://github.com/LeanModels/DFloat11?tab=readme-ov-file#-model-hub
model_id = "DFloat11/Qwen3-4B-DF11"

model = DFloat11Model.from_pretrained(model_id, device_map="auto")

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

prompt = "Question: What is a binary tree and its applications? Answer:"
inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

# Generation below samples tokens (do_sample=True); seed the RNG first so that
# re-running this cell on the same setup gives repeatable output.
torch.manual_seed(0)

# Inference only: no gradients are needed while generating.
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
    )

  from .autonotebook import tqdm as notebook_tqdm
Fetching 47 files: 100%|██████████| 47/47 [05:30<00:00,  7.03s/it]
`torch_dtype` is deprecated! Use `dtype` instead!
Loading DFloat11 safetensors: 100%|██████████| 37/37 [00:00<00:00, 60.48it/s]
Total model size: 6.5031 GB
Allocated 100925440 bf16 on device cuda:0


["Question: What is a binary tree and its applications? Answer: A binary tree is a tree data structure in which each node has at most two children, which are referred to as the left child and the right child. Binary trees are used in various applications such as binary search trees (BSTs) for efficient searching, sorting, and insertion operations. They are also used in data compression algorithms like Huffman coding, and in expression trees for representing arithmetic expressions. Additionally, binary trees are used in graph algorithms for traversal and searching.\n\nAnswer: A binary tree is a tree data structure where each node has at most two children, referred to as the left and right child. They are used in various applications such as binary search trees (BSTs) for efficient searching, sorting, and insertion. Binary trees are also used in data compression algorithms like Huffman coding, and in expression trees for representing arithmetic expressions. Additionally, binary trees are

In [2]:
# Decode the generated token ids back to text, dropping special tokens, and print the list.
decoded_texts = tokenizer.batch_decode(output, skip_special_tokens=True)
print(decoded_texts)

["Question: What is a binary tree and its applications? Answer: A binary tree is a tree data structure in which each node has at most two children, which are referred to as the left child and the right child. Binary trees are used in various applications such as binary search trees (BSTs) for efficient searching, sorting, and insertion operations. They are also used in data compression algorithms like Huffman coding, and in expression trees for representing arithmetic expressions. Additionally, binary trees are used in graph algorithms for traversal and searching.\n\nAnswer: A binary tree is a tree data structure where each node has at most two children, referred to as the left and right child. They are used in various applications such as binary search trees (BSTs) for efficient searching, sorting, and insertion. Binary trees are also used in data compression algorithms like Huffman coding, and in expression trees for representing arithmetic expressions. Additionally, binary trees are

# 2. How to compress models (bfloat16 -> dfloat11)

## Official example: `FLUX.1-dev`

In [3]:
# install requirements
!pip install -U diffusers dfloat11[cuda12]

Collecting diffusers
  Downloading diffusers-0.36.0-py3-none-any.whl.metadata (20 kB)
Collecting importlib_metadata (from diffusers)
  Downloading importlib_metadata-8.7.1-py3-none-any.whl.metadata (4.7 kB)
Collecting httpx<1.0.0 (from diffusers)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting anyio (from httpx<1.0.0->diffusers)
  Using cached anyio-4.12.1-py3-none-any.whl.metadata (4.3 kB)
Collecting httpcore==1.* (from httpx<1.0.0->diffusers)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx<1.0.0->diffusers)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Downloading diffusers-0.36.0-py3-none-any.whl (4.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading httpx-0.28.1-py3-none-any.whl (73 kB)
Downloading httpcore-1.0.9-py3-none-any.whl (78 kB)
Downloading importlib_metadata-8.7.1

In [None]:
from huggingface_hub import login

!pip install ipywidgets

login()     # Your huggingface access token.

[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from diffusers import FluxPipeline
from dfloat11 import compress_model


# Load the FLUX.1 model in bfloat16 precision
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
model = pipe.transformer

# Compress the model via dfloat11 compression.
# pattern_dict keys are regular expressions, so they are written as raw strings:
# in a normal string literal `\.` and `\d` are invalid escape sequences, which
# newer Python versions warn about. The resulting string values are unchanged.
# Each key is paired with a tuple of sub-module names
# (presumably the layers inside each matched block that get compressed;
# confirm against the DFloat11 documentation).
compress_model(
    model=model,
    pattern_dict={
        r"transformer_blocks\.\d+": (
            "norm1.linear",
            "norm1_context.linear",
            "attn.to_q",
            "attn.to_k",
            "attn.to_v",
            "attn.add_k_proj",
            "attn.add_v_proj",
            "attn.add_q_proj",
            "attn.to_out.0",
            "attn.to_add_out",
            "ff.net.0.proj",
            "ff.net.2",
            "ff_context.net.0.proj",
            "ff_context.net.2",
        ),
        r"single_transformer_blocks\.\d+": (
            "norm.linear",
            "proj_mlp",
            "proj_out",
            "attn.to_q",
            "attn.to_k",
            "attn.to_v",
        ),
    },
    # The inference cell later loads the compressed model from this same path.
    save_path="./FLUX.1-dev-DF11",
    save_single_file=True,
    check_correctness=True,
    block_range=(0, 100),
)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


ImportError: 
 requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Inference with the compressed model. 20GB VRAM needed.
import torch
from diffusers import FluxPipeline
from dfloat11 import DFloat11Model


pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
)

# "./FLUX.1-dev-DF11" is the path where the compressed model was saved above.
# The return value is not used; presumably the call updates `pipe.transformer`
# in place -- confirm against the DFloat11 documentation.
DFloat11Model.from_pretrained(
    './FLUX.1-dev-DF11',
    device='cpu',
    bfloat16_model=pipe.transformer,
)
pipe.enable_model_cpu_offload()

prompt = "A futuristic cityscape at sunset, with flying cars, neon lights, and reflective water canals"

# Fixed seed for the generator passed to the pipeline.
generator = torch.Generator(device="cuda").manual_seed(0)

result = pipe(
    prompt,
    width=1024,
    height=1024,
    guidance_scale=3.5,
    num_inference_steps=50,
    max_sequence_length=512,
    generator=generator,
)
image = result.images[0]

image.save("image.png")

## Any other models

In [None]:
# EXAONE 4.0 1.2B: print the name of every module in the model.
from transformers import AutoModelForCausalLM


model = AutoModelForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-4.0-1.2B",
    trust_remote_code=True,
)

# Use the printed names to work out the keys and values for pattern_dict.
for module_name, _module in model.named_modules():
    print(module_name)