# Preparation

In [1]:
"""
# Download model
huggingface-cli download tiiuae/Falcon3-1B-Instruct-1.58bit --local-dir ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit
# Compile
python setup_env.py -md ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit -q i2_s

#.build/bin/llama-server -m /home/pathfinder/models/tiiuae/Falcon3-1B-Instruct-1.58bit/ggml-model-i2_s.gguf --host 127.0.0.1 --port 8080

# Download Falcon3-1B-Instruct
huggingface-cli download tiiuae/Falcon3-1B-Instruct --local-dir ~/models/tiiuae/Falcon3-1B-Instruct
"""

'\n# Download model\nhuggingface-cli download tiiuae/Falcon3-1B-Instruct-1.58bit --local-dir ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit\n# Compile\npython setup_env.py -md ~/models/tiiuae/Falcon3-1B-Instruct-1.58bit -q i2_s\n\n#.build/bin/llama-server -m /home/pathfinder/models/tiiuae/Falcon3-1B-Instruct-1.58bit/ggml-model-i2_s.gguf --host 127.0.0.1 --port 8080\n\n# Download Falcon3-1B-Instruct\nhuggingface-cli download tiiuae/Falcon3-1B-Instruct --local-dir ~/models/tiiuae/Falcon3-1B-Instruct\n'

# Importing Libraries

In [2]:
import os
from dataclasses import dataclass
from utils import set_seed, BitNet
#from speculative_decoding import generate_draft_response, speculative_decoding, verify_with_target



# Configuration

In [3]:
@dataclass
class CONFIG:
    """Central settings for this notebook.

    Every option is a dataclass field, so any of them can be overridden at
    construction time, e.g. ``CONFIG(debug=True, seed=0)``.
    """

    # Debug
    # Gates the optional `if config.debug:` cells further down.
    debug: bool = False
    # Forwarded as `verbose=` to several generation helpers.
    verbose: bool = True

    # Model
    # Hugging Face repo id handed to BitNet(model_id=...).
    model_id: str = "tiiuae/Falcon3-1B-Instruct"
    # NOTE(review): absolute, user-specific paths; consider deriving them from
    # a configurable models directory.
    model_path: str = "/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct-1.58bit/ggml-model-f32.gguf"
    quantized_path: str = "/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct-1.58bit/ggml-model-i2_s.gguf"
    # Context window size handed to BitNet(ctx_size=...).
    ctx_size: int = 1024

    # Generation
    max_new_tokens: int = 256
    ## Speculative Decoding
    num_assistant_tokens: int = 5
    assistant_confidence_threshold: float = 0.4

    # Device
    n_threads: int = 12

    # Seed
    # Must carry a type annotation: without one, @dataclass treats the name as
    # a plain class attribute, so it is missing from __init__/__repr__/__eq__
    # and `CONFIG(seed=...)` raises TypeError.
    seed: int = 42

config = CONFIG()

In [4]:
# Device label for this run; no other visible cell reads it.
device = "cpu"
# Exported before any tokenizer is created. Presumably this silences the
# Hugging Face tokenizers parallelism warning; confirm against its docs.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [5]:
# Apply the configured seed through the project's `set_seed` helper (from
# utils). Which RNGs it seeds is not visible in this notebook.
set_seed(config.seed)

Random seed set to 42


In [6]:
# Chat turns shared by every generation / verification cell below.
system_prompt = "You are an helpful assistant."
user_prompt = "Explain quantum mechanics in detail please."
# Seed text for the assistant turn; the generation cells continue from here.
assistant_response = (
    "Quantum mechanics is a fundamental theory in physics that describes "
    "the physical properties of nature at the scale of atoms and subatomic "
    "particles, such as electrons, protons, and photons."
)

# Model

In [7]:
# Build the project's BitNet wrapper (imported from utils) from the shared
# config. Host and port are hard-coded to a local llama-server endpoint;
# n_gpu_layers=0 presumably keeps the run CPU-only (confirm in utils).
bitnet = BitNet(
    model_id=config.model_id,
    quantized_path=config.quantized_path,
    host="127.0.0.1",
    port=8080,
    ctx_size=config.ctx_size,
    n_threads=config.n_threads,
    n_gpu_layers=0,
    batch_size=1,
    slot_id=1
)
# Start the server first, then initialise tokenizer and model. The order is
# kept exactly as written; whether later steps depend on earlier ones is
# defined in utils and not visible here.
bitnet.start_server(verbose=False)
bitnet.init_tokenizer(verbose=False)
bitnet.init_model(verbose=True)

🚀 Starting llama-server on 127.0.0.1:8080
✅ Server is ready.
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(131072, 2048)
    (layers): ModuleList(
      (0-17): 18 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
   

# Generation

In [8]:
# Render the chat prompt once, then let the Hugging Face model continue the
# seeded assistant turn for a short (10-token) sample.
hf_prompt = bitnet.format_falcon_prompt(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    assistant_response=assistant_response,
)
bitnet.generate_hf(text=hf_prompt, max_new_tokens=10, verbose=True)

The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons. It is a complex and counterintuitive theory that has been

[95m──────────────────────────────────────────────────
🧠 Generation Info (Hugging Face)
──────────────────────────────────────────────────[0m
[94m💬 User Input:[0m
<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons.

[92m🟢 Generated Text:[0m
 It is a complex and counterintuitive theory that has been

[94m📊 Timings:[0m
  - Total Time: 3.41s
  - Decode: 341.33 ms/token, 2.93 tokens/s
[94m

' It is a complex and counterintuitive theory that has been'

In [9]:
# Same prompt as the Hugging Face cell, this time continued through the
# project's GGUF generation path for up to 100 new tokens.
gguf_prompt = bitnet.format_falcon_prompt(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    assistant_response=assistant_response,
)
bitnet.generate_gguf(text=gguf_prompt, max_new_tokens=100, verbose=True)


[95m──────────────────────────────────────────────────
🧠 Generation Info
──────────────────────────────────────────────────[0m
[94m💬 User Input:[0m
<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons.

[92m🟢 Generated Text:[0m
 At its core, quantum mechanics is based on the idea that not all electromagnetic fields can be described by classical mechanics through classical fields like electromagnetic tension and dipole moments. Instead, these fields are described by the concept of quantum fields, and these fields can be generated by pairs of entangled particles. The light color of the particle contributes to the field's quantum state, and the system’s qubit relies on superposition to maintain this quantum state over time to change states.



" At its core, quantum mechanics is based on the idea that not all electromagnetic fields can be described by classical mechanics through classical fields like electromagnetic tension and dipole moments. Instead, these fields are described by the concept of quantum fields, and these fields can be generated by pairs of entangled particles. The light color of the particle contributes to the field's quantum state, and the system’s qubit relies on superposition to maintain this quantum state over time to change states."

## Speculative Decoding

In [12]:
# Score tokens of the seeded assistant text with the Hugging Face model.
# In the recorded output the 10 verified tokens are the tail of the prompt,
# each marked Accepted against the 0.4 threshold.
verify_prompt = bitnet.format_falcon_prompt(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    assistant_response=assistant_response,
)
bitnet.verify_hf(
    text=verify_prompt,
    num_verify=10,
    confidence_threshold=0.4,
    verbose=True,
)


[95m──────────────────────────────────────────────────
🔍 Verification Info (Hugging Face)
──────────────────────────────────────────────────[0m
[94m📝 Full Text:[0m
'<|system|>
You are an helpful assistant.
<|user|>
Explain quantum mechanics in detail please.
<|assistant|>
Quantum mechanics is a fundamental theory in physics that describes the physical properties of nature at the scale of atoms and subatomic particles, such as electrons, protons, and photons.'
[94m🎯 Confidence Threshold:[0m 40.00%

──────────────────────────────────────────────────
| Step | Token           |  Probability | Status     |
|------|-----------------|--------------|------------|
|   55 | ,               |       56.61% | [92mAccepted  [0m |
|   56 |  such           |       89.17% | [92mAccepted  [0m |
|   57 |  as             |      100.00% | [92mAccepted  [0m |
|   58 |  electrons      |       98.04% | [92mAccepted  [0m |
|   59 | ,               |       95.45% | [92mAccepted  [0m |
|   60 | 

[{'token': ',', 'prob': 0.5661389231681824, 'status': 'Accepted'},
 {'token': ' such', 'prob': 0.8916944265365601, 'status': 'Accepted'},
 {'token': ' as', 'prob': 0.9999815225601196, 'status': 'Accepted'},
 {'token': ' electrons', 'prob': 0.9804161190986633, 'status': 'Accepted'},
 {'token': ',', 'prob': 0.9544870853424072, 'status': 'Accepted'},
 {'token': ' protons', 'prob': 0.5462462306022644, 'status': 'Accepted'},
 {'token': ',', 'prob': 0.9995439648628235, 'status': 'Accepted'},
 {'token': ' and', 'prob': 0.861538290977478, 'status': 'Accepted'},
 {'token': ' photons', 'prob': 0.9768282175064087, 'status': 'Accepted'},
 {'token': '.', 'prob': 0.9809023141860962, 'status': 'Accepted'}]

In [11]:
# Draft a response with the quantized model.
# NOTE(review): `generate_draft_response`, `format_falcon_prompt` and
# `tokenizer` are not defined in the visible notebook (the
# `speculative_decoding` import is commented out), so this cell raises
# NameError until they are restored.
res = generate_draft_response(
    model_path=config.quantized_path,
    prompt=format_falcon_prompt(
        tokenizer=tokenizer,
        system_prompt=system_prompt,
        user_prompt=user_prompt
    ),
    # CONFIG declares `ctx_size`; there is no `n_ctx` attribute.
    ctx_size=config.ctx_size,
    n_threads=config.n_threads,
    max_new_tokens=config.max_new_tokens,
    seed=config.seed,
    verbose=config.verbose
)

NameError: name 'generate_draft_response' is not defined

In [None]:
# Run speculative decoding against the local server endpoint.
# NOTE(review): `speculative_decoding`, `tokenizer` and `model` are not
# defined in the visible notebook (the `speculative_decoding` import is
# commented out); this cell has no recorded execution.
result = speculative_decoding(
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    target_model=model,
    max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    confidence_threshold=config.assistant_confidence_threshold,
    seed=config.seed,
    verbose=True,
    server_url="http://localhost:8080"
)

In [None]:
# NOTE(review): `asdf` is an undefined name, so this cell raises NameError.
# Presumably a deliberate barrier to halt "Run All" before the legacy cells
# below; confirm, and prefer an explicit guard or deleting the dead cells.
asdf

In [None]:
# Debug only: print the fully formatted prompt.
# NOTE(review): the free function `format_falcon_prompt` and `tokenizer` are
# not defined in the visible notebook; the live cells above use
# `bitnet.format_falcon_prompt` instead.
if config.debug:
    print(format_falcon_prompt(
        tokenizer=tokenizer,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        assistant_response=assistant_response
    ))

In [None]:
# Plain generation with `model`; the speculative-decoding arguments are
# commented out.
# NOTE(review): `generate_response`, `tokenizer` and `model` are not defined
# in the visible notebook; this cell has no recorded execution.
generate_response(
    tokenizer=tokenizer,
    model=model,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    max_new_tokens=config.max_new_tokens,
    #num_assistant_tokens=config.num_assistant_tokens,
    #assistant_confidence_threshold=config.assistant_confidence_threshold,
    verbose=config.verbose
)

In [None]:
# Debug only: the same generation call, with `quantized_model` passed as the
# model.
# NOTE(review): `generate_response`, `tokenizer` and `quantized_model` are
# not defined in the visible notebook.
if config.debug:
    generate_response(
        tokenizer=tokenizer,
        model=quantized_model,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        max_new_tokens=config.max_new_tokens,
        #num_assistant_tokens=config.num_assistant_tokens,
        #assistant_confidence_threshold=config.assistant_confidence_threshold,
        verbose=config.verbose
    )

In [None]:
# Debug only: generation with `model` plus `quantized_model` passed as
# `assistant_model`, using the speculative-decoding settings from config.
# NOTE(review): `generate_response`, `tokenizer`, `model` and
# `quantized_model` are not defined in the visible notebook.
if config.debug:
    generate_response(
        tokenizer=tokenizer,
        model=model,
        assistant_model=quantized_model,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        max_new_tokens=config.max_new_tokens,
        num_assistant_tokens=config.num_assistant_tokens,
        assistant_confidence_threshold=config.assistant_confidence_threshold,
        verbose=config.verbose
    )

# C++

## Speculative Decoding

In [None]:
# Draft 10 tokens with the quantized model, then pass the draft to
# `verify_with_target` with a 0.9 confidence threshold.
# NOTE(review): `generate_draft_response`, `verify_with_target`,
# `format_falcon_prompt`, `tokenizer` and `model` are not defined in the
# visible notebook (the `speculative_decoding` import is commented out).
sample = generate_draft_response(
    model_path=config.quantized_path,
    prompt=format_falcon_prompt(
        tokenizer=tokenizer,
        system_prompt=system_prompt,
        user_prompt=user_prompt
    ),
    # CONFIG declares `ctx_size`; there is no `n_ctx` attribute.
    ctx_size=config.ctx_size,
    n_threads=config.n_threads,
    max_new_tokens=10,
    seed=config.seed,
    verbose=False
)
res = verify_with_target(
    context=sample,
    tokenizer=tokenizer,
    target_model=model,
    num_assistant_tokens=10,
    confidence_threshold=0.9,
    verbose=True
)

In [None]:
# Speculative decoding with the quantized GGUF file passed as the draft model
# and `model` as the target.
# NOTE(review): `speculative_decoding`, `tokenizer` and `model` are not
# defined in the visible notebook (the `speculative_decoding` import is
# commented out).
result = speculative_decoding(
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    target_model=model,
    draft_model=config.quantized_path,
    # CONFIG declares `ctx_size`; there is no `n_ctx` attribute.
    ctx_size=config.ctx_size,
    n_threads=config.n_threads,
    max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    confidence_threshold=config.assistant_confidence_threshold,
    seed=config.seed,
    verbose=True
)