In [1]:
import sys
sys.path.insert(1, "..")
sys.path.insert(2, "../..")

import tqdm
import torch
import pandas as pd;
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from huggingface_hub import login
import time
import os
import re
import random

from patch_gpu import *

In [2]:
# Device the model is loaded onto: "cpu" or "cuda".
DEVICE = "cuda"
# Method name handed to apply_patch: "RSR" or "Naive".
METHOD = "RSR"
# Checkpoint to load. Alternatives noted by the author:
#   HF1BitLLM/Llama3-8B-1.58-100B-tokens, tiiuae/Falcon3-3B-Instruct-1.58bit
MODEL = "tiiuae/Falcon3-10B-Instruct-1.58bit"
# Tokenizer to load alongside MODEL. Alternatives noted by the author:
#   meta-llama/Meta-Llama-3-8B-Instruct, tiiuae/Falcon3-3B-Instruct-1.58bit
TOKENIZER = "tiiuae/Falcon3-10B-Instruct-1.58bit"

In [3]:
# Don't run this cell if you want to see the 'Optimized Standard Inference',
# i.e. skip it to benchmark the model without the patch applied.
# Otherwise the configured METHOD is handed to apply_patch (presumably provided
# by the wildcard import from patch_gpu — verify).
apply_patch(method=METHOD)

In [None]:
# Hide every GPU from CUDA when a CPU run is requested.
# NOTE(review): this presumably only takes effect if CUDA has not been
# initialised yet in this process — confirm nothing earlier touches the GPU.
if DEVICE == "cpu":
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Keep a token that is already exported in the environment instead of
# overwriting it with an empty string. Provide the token through the HF_TOKEN
# environment variable rather than pasting it into the notebook.
os.environ.setdefault("HF_TOKEN", "")
print("cuda available? ", torch.cuda.is_available())

In [None]:
hf_token = os.getenv("HF_TOKEN")
# An unset or empty HF_TOKEN carries no credential, so only call login() when
# a token is actually configured instead of passing it an empty value.
if hf_token:
    login(token=hf_token)
device = torch.device(DEVICE)

# Load the checkpoint in bfloat16 on the configured device. The trailing
# .to(device) is kept from the original cell; with device_map already set it
# is presumably redundant — TODO confirm before removing.
model = LlamaForCausalLM.from_pretrained(
    MODEL,
    device_map=DEVICE,
    torch_dtype=torch.bfloat16,
).to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER)

### Inference

In [6]:
def infer(prompt, max_length=20, tokens_to_generate=1):
    """Generate a greedy completion for ``prompt`` and time the call.

    Relies on the notebook globals ``tokenizer``, ``model``, ``device``,
    ``DEVICE`` and ``METHOD``.

    Args:
        prompt: Text to tokenize and feed to ``model.generate``.
        max_length: Value forwarded as ``max_length``. Only used when
            ``tokens_to_generate`` is ``None``.
        tokens_to_generate: Value forwarded as ``max_new_tokens``. When set
            (the default) it is the only length limit passed to ``generate``.

    Returns:
        dict with ``"time"`` (elapsed seconds spanning tokenization,
        generation and decoding), ``"response"`` (the decoded text),
        ``"device"`` (``DEVICE``) and ``"method"`` (``METHOD``).
    """
    start_time = time.perf_counter()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Pass exactly one length limit. Supplying both makes transformers warn
    # and let max_new_tokens win, which silently ignored max_length before.
    if tokens_to_generate is not None:
        length_limit = {"max_new_tokens": tokens_to_generate}
    else:
        length_limit = {"max_length": max_length}

    output = model.generate(input_ids, do_sample=False, **length_limit)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Stop the clock before printing so console I/O is not part of the timing.
    elapsed = time.perf_counter() - start_time
    print(f"{prompt} --> {generated_text}")
    return {
        "time": elapsed,
        "response": generated_text,
        "device": DEVICE,
        "method": METHOD,
    }

In [None]:
# Smoke test: one generation with the default limits; the returned dict
# (time, response, device, method) is shown as the cell output.
infer(prompt="France")

### Extract Report

In [4]:
# Matches log fragments such as "RSR time: 0.000256" or "Standard time: 6.3e-05":
# group 1 is the method label, group 2 the number (optionally in exponent form).
pattern = r'(RSR|Standard) time: (\d*\.?\d+(?:[eE][-+]?\d+)?)'

with open("output.txt", "r") as f: # output.txt is the output log of 'inference' cell
    lines = f.readlines()

result = []

# The matched values are used inline on purpose: binding them to the globals
# `time` and `type` would shadow the `time` module (breaking every later
# time.time() call in this kernel) and the `type` builtin.
for line in lines:
    match = re.search(pattern, line)
    if match:
        result.append({
            "method": match.group(1),
            "time": float(match.group(2)),
        })

In [5]:
# Name the columns explicitly so the frame has "method" and "time" even when
# the log produced no matches; without them, selecting result["method"] on an
# empty frame raises KeyError.
result = pd.DataFrame(result, columns=["method", "time"])
result.head()

Unnamed: 0,method,time
0,RSR,0.000256
1,Standard,0.000263
2,RSR,0.000182
3,Standard,6.3e-05
4,RSR,0.00021


In [None]:
# Summary statistics of the timings recorded for the "RSR" method.
result.loc[result["method"] == "RSR", "time"].describe()

In [None]:
# Summary statistics of the timings recorded for the "Standard" method.
result.loc[result["method"] == "Standard", "time"].describe()

In [7]:
# Write the parsed timings to CSV without the row-index column.
result.to_csv("falcon_times_report.csv", index=False)