# Preparation

In [None]:
"""
# Download model
huggingface-cli download tiiuae/Falcon3-3B-Instruct-1.58bit --local-dir ~/models/tiiuae/Falcon3-3B-Instruct-1.58bit
# Compile
python setup_env.py -md ~/models/tiiuae/Falcon3-3B-Instruct-1.58bit -q i2_s
"""

# Importing Libraries

In [None]:
import os
import gc
from dataclasses import dataclass
import torch
from transformers import AutoModelForCausalLM
from utils import set_seed
from speculative_decoding import BitNet

# Configuration

In [None]:
@dataclass
class CONFIG:
    """Central settings for this notebook: model locations, generation limits, runtime options.

    Every setting is an annotated dataclass field, so any of them can be
    overridden per instance, e.g. ``CONFIG(seed=0, max_new_tokens=32)``.
    """

    # Debug
    debug: bool = False
    verbose: bool = True  # forwarded as `verbose=` to the server start and the plain generation calls

    # Model
    ## Tokenizer
    tokenizer_id: str = "tiiuae/Falcon3-1B-Instruct"
    ## HuggingFace
    # NOTE(review): the paths below are absolute and specific to one machine;
    # override them via CONFIG(...) (or edit them) when running elsewhere.
    model_path: str       = "/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct"  # 3B
    model_small_path: str = "/home/pathfinder/models/tiiuae/Falcon3-1B-Instruct"  # 1B
    ## GGUF (1bit)
    bitnet_path: str = "/home/pathfinder/models/tiiuae/Falcon3-3B-Instruct-1.58bit/ggml-model-i2_s.gguf" # 3B
    ctx_size: int = 1024  # passed to bitnet.start_server together with bitnet_path

    # Generation
    max_new_tokens: int = 100
    ## Speculative Decoding
    num_assistant_tokens: int = 5
    confidence_threshold: float = 0.4

    # Device
    n_threads: int = 12

    # Seed
    # Annotated so that `seed` is a real dataclass field: without the annotation
    # it is only a class attribute, CONFIG(seed=...) raises TypeError, and the
    # value is left out of repr() and equality comparisons.
    seed: int = 42

config = CONFIG()

In [None]:
# Set in the process environment before any tokenizer work happens in this notebook.
# NOTE(review): presumably turns off HuggingFace tokenizers' internal parallelism
# (and the related fork warning) — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
# Seed through the project helper `set_seed`, using the value held in CONFIG.
# NOTE(review): which RNGs get seeded is decided inside utils.set_seed — not visible here.
set_seed(config.seed)

In [None]:
# Chat messages used for every generation run below.
# NOTE(review): "an helpful" is reproduced verbatim; correcting the grammar
# would change the prompt the model sees.
system_prompt, user_prompt = (
    "You are an helpful assistant.",
    "Explain quantum mechanics in detail please.",
)

# Model

In [None]:
# Build the BitNet wrapper and bring it up in three steps; the order
# (server -> tokenizer -> model) is kept exactly as written.
bitnet = BitNet()

# 1) Start the server for the GGUF file, honouring the global verbosity flag.
bitnet.start_server(
    bitnet_path=config.bitnet_path, ctx_size=config.ctx_size,
    n_threads=config.n_threads, verbose=config.verbose,
)

# 2) Tokenizer: loaded quietly (verbose is fixed to False here).
bitnet.init_tokenizer(tokenizer_id=config.tokenizer_id, verbose=False)

# 3) HuggingFace model from `model_path` (verbose is fixed to True here).
bitnet.init_model(model_path=config.model_path, verbose=True)

In [None]:
# Combine the system and user messages into one prompt string via
# `format_falcon_prompt`, then print it for inspection.
text = bitnet.format_falcon_prompt(system_prompt=system_prompt, user_prompt=user_prompt)
print(text)

# Generation

In [None]:
# 3B (fp32)
# Baseline run through the HuggingFace path, with streaming turned on.
bitnet.generate_hf(
    text=text, max_new_tokens=config.max_new_tokens,
    stream=True, verbose=config.verbose,
)

In [None]:
# 3B (1bit)
# Same prompt and token budget, this time through the GGUF path.
bitnet.generate_gguf(text=text, max_new_tokens=config.max_new_tokens, verbose=config.verbose)

In [None]:
# Housekeeping between the plain generation runs and the speculative-decoding runs.
gc.collect()                # run a Python garbage-collection pass
torch.cuda.empty_cache()    # ask PyTorch to release cached, currently unused CUDA memory

## Speculative Decoding

In [None]:
# 3B-1B
# Speculative decoding with a second HuggingFace model as the `small_model`.
# The 1B checkpoint is constructed inline (CPU, float32) rather than bound to a
# notebook variable, so this cell itself keeps no reference to it afterwards.
bitnet.speculative_decoding_hf(
    text=text,
    small_model=AutoModelForCausalLM.from_pretrained(
        config.model_small_path, device_map="cpu", dtype=torch.float32
    ),
    max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    confidence_threshold=config.confidence_threshold,
    verbose=True,
)

# Clean up once the run has finished.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# 32bit-1bit
# Speculative decoding using only what `bitnet` already holds; no extra model
# is passed in. The speculative parameters come from CONFIG.
bitnet.speculative_decoding(
    text=text, max_new_tokens=config.max_new_tokens,
    num_assistant_tokens=config.num_assistant_tokens,
    confidence_threshold=config.confidence_threshold,
    verbose=True,
)

# Clean up once the run has finished.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# TODO: 32bit-2bit — placeholder cell, no code yet

# Evaluation

## HellaSwag