# Preparation

In [1]:
"""
# Download model
huggingface-cli download tiiuae/Falcon3-1B-Instruct-1.58bit --local-dir ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit
# Compile
python setup_env.py -md ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit -q i2_s

#.build/bin/llama-server -m /home/pathfinder/models/tiiuae/Falcon3-1B-Instruct-1.58bit/ggml-model-i2_s.gguf --host 127.0.0.1 --port 8080

# Download Falcon3-1B-Instruct
huggingface-cli download tiiuae/Falcon3-1B-Instruct --local-dir ~/models/tiiuae/Falcon3-1B-Instruct
"""

'\n# Download model\nhuggingface-cli download tiiuae/Falcon3-1B-Instruct-1.58bit --local-dir ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit\n# Compile\npython setup_env.py -md ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit -q i2_s\n\n#.build/bin/llama-server -m /home/pathfinder/models/tiiuae/Falcon3-1B-Instruct-1.58bit/ggml-model-i2_s.gguf --host 127.0.0.1 --port 8080\n\n# Download Falcon3-1B-Instruct\nhuggingface-cli download tiiuae/Falcon3-1B-Instruct --local-dir ~/models/tiiuae/Falcon3-1B-Instruct\n'

# Importing Libraries

In [2]:
import os
from dataclasses import dataclass
import torch
from transformers import AutoModelForCausalLM
from utils import set_seed
from speculative_decoding import BitNet
#from evaluate import LlmEvaluator

# Configuration

In [3]:
@dataclass
class CONFIG:
    # Debug
    debug: bool = False
    verbose: bool = True

    # Model
    ## Tokenizer
    tokenizer_id: str = "tiiuae/Falcon3-1B-Instruct"
    ## HuggingFace
    model_path: str = "/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct"  # 1B
    four_bit_path: str = "/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct-GPTQ-Int4" # 1B (4bit)
    model_large_path: str = "/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct" # 3B
    ## GGUF (1bit)
    bitnet_path: str = "/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct-1.58bit/ggml-model-i2_s.gguf"

    ctx_size: int = 1024

    # Generation
    max_new_tokens: int = 25
    ## Speculative Decoding
    num_assistant_tokens: int = 5
    assistant_confidence_threshold: float = 0.4

    # Device
    n_threads: int = 12

    # Seed
    seed = 42

config = CONFIG()

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
set_seed(config.seed)

Random seed set to 42


In [6]:
system_prompt = "You are an helpful assistant."
user_prompt = "Explain quantum mechanics in detail please."
assistant_response = "Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons."

# Model

In [7]:
bitnet = BitNet()
bitnet.start_server(
    bitnet_path=config.bitnet_path,
    ctx_size=config.ctx_size,
    n_threads=config.n_threads,
    verbose=config.verbose
)
bitnet.init_tokenizer(
    tokenizer_id=config.tokenizer_id,
    verbose=False
)
bitnet.init_model(
    model_path=config.model_path,
    verbose=True
)

🚀 Starting llama-server on 127.0.0.1:8080
✅ Server is ready.


The following generation flags are not valid and may be ignored: ['output_attentions']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(131072, 2048)
    (layers): ModuleList(
      (0-17): 18 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-06)
    (

In [8]:
text = bitnet.format_falcon_prompt(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    assistant_response=assistant_response
)
print(text)

<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons.


In [9]:
# 32bit-2bit
bitnet.generate_hf(
    assistant_model=AutoModelForCausalLM.from_pretrained(
        config.four_bit_path,
        device_map="cuda",
        output_attentions=True,
    ),
    text=text,
    max_new_tokens=10,
    num_assistant_tokens=5,
    assistant_confidence_threshold=0.25,
    stream=True,
    verbose=True
)

  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons. It is a complex and counterintuitive theory that has been

--- outputs 객체 디버깅 정보 ---
outputs 타입: <class 'transformers.generation.utils.GenerateDecoderOnlyOutput'>
outputs 속성: ['__annotations__', '__class__', '__class_getitem__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__si

' It is a complex and counterintuitive theory that has been'

# Generation

In [10]:
# 1B (bf16)
bitnet.generate_hf(
    text=text,
    max_new_tokens=config.max_new_tokens,
    stream=True,
    verbose=config.verbose
)

<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons. It is a complex and counterintuitive theory that has been developed over the past two centuries, and it has revolutionized our understanding of the

--- outputs 객체 디버깅 정보 ---
outputs 타입: <class 'transformers.generation.utils.GenerateDecoderOnlyOutput'>
outputs 속성: ['__annotations__', '__class__', '__class_getitem__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__post_init__', '__reduce__', '

' It is a complex and counterintuitive theory that has been developed over the past two centuries, and it has revolutionized our understanding of the'

In [11]:
# 1B (1bit)
bitnet.generate_gguf(
    text=text,
    max_new_tokens=config.max_new_tokens,
    verbose=config.verbose
)


[95m──────────────────────────────────────────────────
🧠 Generation Info (BitNet GGUF)
──────────────────────────────────────────────────[0m
[94m💬 User Input:[0m
<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons.

[92m🟢 Generated Text:[0m
 At its core, quantum mechanics is based on the idea that not all electromagnetic fields can be described by classical mechanics through classical fields

[94m📊 Timings:[0m
├─ Prefill: 2.12 ms/token, 470.98 tokens/s
└─ Decode: 27.56 ms/token, 36.29 tokens/s
[94m📦 Tokens:[0m
├─ Prefilled: 64
└─ Decoded: 25
[94m🛑 Stop Reason:[0m Limit

[95m──────────────────────────────────────────────────
💡 Token Probabilities
──────────────────────────────────────────────────[0m
| Step | Token           |  Prob

' At its core, quantum mechanics is based on the idea that not all electromagnetic fields can be described by classical mechanics through classical fields'

## Speculative Decoding

In [12]:
# 3B-1B
bitnet.speculative_decoding_hf(
    large_model=AutoModelForCausalLM.from_pretrained(
        config.model_large_path,
        device_map="cpu",
        dtype=torch.float32
    ),
    text=text,
    max_new_tokens=10,
    num_assistant_tokens=5,
    assistant_confidence_threshold=0.25,
    stream=True,
    verbose=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and 



RuntimeError: Expected all tensors to be on the same device, but got mat2 is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA_bmm)

In [None]:
# 32bit-1bit
bitnet.speculative_decoding(
    text=text,
    max_new_tokens=100,
    num_assistant_tokens=5,
    confidence_threshold=0.25,
    verbose=True
)

# Evaluation

## HellaSwag