In [1]:
import sys
sys.path.insert(1, "..")
sys.path.insert(2, "../..")

import tqdm
import torch
import pandas as pd;
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from huggingface_hub import login
import time
import os

from patch import *

In [2]:
DEVICE = "cuda" # cpu or cuda
METHOD = "Naive" # RSR or Naive

In [3]:
# Don't run this cell if you want to see the 'Optimized Standard Inference'
apply_patch(method=METHOD)

In [4]:
if DEVICE == "cpu":
    os.environ["CUDA_VISIBLE_DEVICES"] = ''
    
os.environ["HF_TOKEN"]="YOUR TOKEN"
print("cuda available? ", torch.cuda.is_available())

cuda available?  True


In [5]:
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)
device = torch.device(DEVICE)

model = LlamaForCausalLM.from_pretrained("HF1BitLLM/Llama3-8B-1.58-100B-tokens", 
                                         device_map=DEVICE, 
                                         torch_dtype=torch.bfloat16).to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


HF_Quantizer: <transformers.quantizers.quantizer_bitnet.BitNetHfQuantizer object at 0x7f5d6034ec20>
** hf_quantizer.preprocess_model
** hf_quantizer.postprocess_model




Loading tensors: 224 / 224

### Inference

In [7]:
def infer(prompt):
    start_time = time.time()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)#.cuda()
    output = model.generate(input_ids, max_length=20, do_sample=False, max_new_tokens=1);
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True);
    print(f"{prompt} --> {generated_text}")
    return {
        "time": time.time() - start_time,
        "response": generated_text,
        "device": DEVICE,
        "method": METHOD
    }

In [6]:
df = pd.read_csv("../dataset.csv")
df.head()

Unnamed: 0,q
0,What is the capital of France?
1,Who wrote Romeo and Juliet?
2,What is 2 + 2?
3,What is the largest planet in the solar system?
4,Who painted the Mona Lisa?


In [7]:
result = []

for i, question in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    result.append(infer(prompt=question["q"])) 

  0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Input ID: torch.Size([1, 8]), Input Embed:torch.Size([1, 8, 4096])


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
 10%|█         | 1/10 [00:49<07:29, 49.91s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


What is the capital of France? --> What is the capital of France? Paris
Input ID: torch.Size([1, 7]), Input Embed:torch.Size([1, 7, 4096])


 20%|██        | 2/10 [01:30<05:57, 44.66s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Who wrote Romeo and Juliet? --> Who wrote Romeo and Juliet? William
Input ID: torch.Size([1, 9]), Input Embed:torch.Size([1, 9, 4096])


 30%|███       | 3/10 [02:21<05:30, 47.18s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


What is 2 + 2? --> What is 2 + 2? 
Input ID: torch.Size([1, 11]), Input Embed:torch.Size([1, 11, 4096])


 40%|████      | 4/10 [03:21<05:14, 52.44s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


What is the largest planet in the solar system? --> What is the largest planet in the solar system? What
Input ID: torch.Size([1, 7]), Input Embed:torch.Size([1, 7, 4096])


 50%|█████     | 5/10 [04:01<03:59, 47.98s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Who painted the Mona Lisa? --> Who painted the Mona Lisa? Leonardo
Input ID: torch.Size([1, 10]), Input Embed:torch.Size([1, 10, 4096])


 60%|██████    | 6/10 [04:56<03:21, 50.30s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


What is the square root of 16? --> What is the square root of 16? 
Input ID: torch.Size([1, 9]), Input Embed:torch.Size([1, 9, 4096])


 70%|███████   | 7/10 [05:46<02:31, 50.35s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


What is the tallest mountain on Earth? --> What is the tallest mountain on Earth?|

Input ID: torch.Size([1, 13]), Input Embed:torch.Size([1, 13, 4096])


 80%|████████  | 8/10 [06:56<01:53, 56.56s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


What is the name of the first man on the moon? --> What is the name of the first man on the moon? What
Input ID: torch.Size([1, 10]), Input Embed:torch.Size([1, 10, 4096])


 90%|█████████ | 9/10 [07:52<00:56, 56.45s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Both `max_new_tokens` (=1) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


What is the currency of the United States? --> What is the currency of the United States?
Input ID: torch.Size([1, 7]), Input Embed:torch.Size([1, 7, 4096])


100%|██████████| 10/10 [08:32<00:00, 51.22s/it]

How many continents are there? --> How many continents are there? How





In [8]:
pd.DataFrame(result).to_csv("result_Naive_optimized.csv", index=0)

In [13]:
pd.read_csv("result_Naive_optimized.csv", index_col=False)["time"].describe()

count    10.000000
mean     51.215071
std       9.714081
min      39.175900
25%      43.215917
50%      50.315896
75%      55.860463
max      69.866591
Name: time, dtype: float64

In [12]:
pd.read_csv("result_Naive.csv", index_col=False)["time"].describe()

count     10.000000
mean     418.087706
std       86.416195
min      306.833085
25%      344.506441
50%      417.863952
75%      456.791743
max      578.312830
Name: time, dtype: float64

In [11]:
pd.read_csv("result_RSR.csv", index_col=False)["time"].describe()

count    10.000000
mean     50.314275
std       9.682739
min      38.136520
25%      43.882524
50%      49.962099
75%      54.579380
max      69.389142
Name: time, dtype: float64