In [15]:
# Model identifiers and Hugging Face credentials.
import os  # imported here so this cell runs on a fresh kernel

# Hub repository id of the base checkpoint to load.
base_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Local model path; not referenced by the cells visible here (presumably the
# destination of the fine-tuned model -- TODO confirm).
new_model = "/project/models/NV-llama3.1-8b-Arxiv"
# SECURITY: access tokens must never be stored in the notebook. The token is
# read from the HF_TOKEN environment variable instead; when it is unset this is
# None and is passed through unchanged as `token=None` to the loaders.
# NOTE(review): a token was previously committed here in clear text -- it
# should be revoked on the Hugging Face account.
api_key = os.environ.get("HF_TOKEN")

In [16]:
# Marker strings used to delimit the prompt and the conversation turns.
bos_token, eos_token, pad_token = "<bos>", "<eos>", "<pad>"
user_start, user_end = "<user>", "</user>"
assistant_start, assistant_end = "<assistant>", "</assistant>"

# Bundle of all markers, later handed to `tokenizer.add_special_tokens`.
# Key order is kept as-is: bos, eos, pad, then the four turn markers.
special_tokens = dict(
    bos_token=bos_token,
    eos_token=eos_token,
    pad_token=pad_token,
    additional_special_tokens=[
        user_start,
        user_end,
        assistant_start,
        assistant_end,
    ],
)

In [17]:
# Set up the BitsAndBytesConfig for 4-bit quantization of the base model.
# The names are imported in this cell because it runs before the notebook's
# main import cell; without them a fresh kernel fails here with NameError.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the model weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation on the 4-bit weights
)

In [18]:
from transformers import AutoModelForCausalLM
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, PeftModel

# --- Tokenizer ---------------------------------------------------------------
# Start from the base model's tokenizer, then register the custom marker tokens
# and make the custom pad marker the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=api_key)
tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token = pad_token

# --- Base model --------------------------------------------------------------
# Loading options: authenticated download, quantization per `bnb_config`,
# files cached under /project/models, automatic device placement.
model_load_kwargs = dict(
    token=api_key,
    quantization_config=bnb_config,
    cache_dir="/project/models",
    device_map="auto",
)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, **model_load_kwargs)

# The tokenizer grew by the added markers, so the embedding table is resized to
# the new vocabulary size. Kept as the last expression so the cell displays the
# resulting embedding module.
base_model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Embedding(128263, 4096)

In [19]:
# Attach the adapter stored on disk to the quantized base model, then place the
# wrapped model on the GPU.
# NOTE(review): the base model is loaded with device_map="auto"; confirm that an
# explicit move to "cuda" is still wanted on multi-GPU machines.
adapter_dir = "/project/models/arxiv_model"
peft_model = PeftModel.from_pretrained(base_model, adapter_dir)
peft_model = peft_model.to("cuda")

In [23]:
# Prompt builder
def format_example(instruction, response=""):
    """Build the newline-separated prompt for one exchange.

    The result consists of the bos marker, the user-turn opening marker, the
    instruction, the user-turn closing marker, the assistant-turn opening
    marker and finally the response, joined by single newlines.

    Args:
        instruction: Text placed inside the user turn.
        response: Text placed after the assistant opening marker. Defaults to
            an empty string, which leaves the assistant turn open.

    Returns:
        The assembled prompt string.
    """
    pieces = (
        bos_token,
        user_start,
        instruction,
        user_end,
        assistant_start,
        response,
    )
    # format() applies the same conversion an f-string placeholder would.
    return "\n".join(format(piece) for piece in pieces)

# Query to send to the model, wrapped in the prompt layout with an empty
# assistant turn.
instruction = "I am looking for a paper discussing You Only Read Once(YORO)."
input_text = format_example(instruction=instruction)

In [24]:
# Encode the prompt as PyTorch tensors (cut off at 512 tokens) and move the
# encoded batch to the GPU.
tokenizer_kwargs = dict(
    return_tensors="pt",
    truncation=True,
    max_length=512,
    padding=True,
)
inputs = tokenizer(input_text, **tokenizer_kwargs).to("cuda")

# Pull out the two tensors that are passed to `generate`.
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]

In [25]:
# Decoding settings for generation: beam search combined with sampling.
# The eos and pad ids are looked up from the custom marker tokens.
generation_kwargs = dict(
    attention_mask=attention_mask,
    max_length=512,  # this limit is passed as `max_length`, not `max_new_tokens`
    num_beams=5,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.5,
    num_return_sequences=1,
    eos_token_id=tokenizer.convert_tokens_to_ids(eos_token),
    pad_token_id=tokenizer.convert_tokens_to_ids(pad_token),
)

# Run generation without tracking gradients.
with torch.no_grad():
    output = peft_model.generate(input_ids, **generation_kwargs)

# Decode the first returned sequence in full, keeping the special tokens in
# the text, and show it.
generated_text = tokenizer.decode(output[0], skip_special_tokens=False)

print(generated_text)

<|begin_of_text|><bos>
<user>
I am looking for a paper discussing You Only Read Once(YORO).
</user>
<assistant>
One paper that discusses this topic is 'You Only Read Once (YORO): Exploring the Paradox of Reading Without Improving Comprehension". While it is widely believed that reading improves language model
comprehension and general knowledge, a recent study has surprisingly found that
this assumption may be a misconception. Based on this finding, this work
proposes a new reading strategy that only requires a model to read an article
once without any subsequent testing or evaluation, i.e. YORO. We investigate
the effectiveness of YORO by evaluating models' comprehension, knowledge, and
performance in related tasks both before and after the reading process.
Surprisingly, our results show that YORO is effective in improving models'
comprehension, general knowledge, and performance in related tasks. However,
under different evaluation settings, YORO only leads to positive changes in
lan