In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model 
import transformers
import torch
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = "meta-llama/Llama-2-7b-chat-hf" 


In [3]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit = True,
    device_map="auto",

)


Downloading config.json: 100%|██████████| 614/614 [00:00<00:00, 1.26MB/s]
Downloading (…)fetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 5.22MB/s]
Downloading (…)of-00002.safetensors: 100%|██████████| 9.98G/9.98G [05:15<00:00, 31.6MB/s]
Downloading (…)of-00002.safetensors: 100%|██████████| 3.50G/3.50G [01:53<00:00, 30.9MB/s]
Downloading shards: 100%|██████████| 2/2 [07:10<00:00, 215.40s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.44s/it]
Downloading generation_config.json: 100%|██████████| 188/188 [00:00<00:00, 457kB/s]


In [4]:
ds = load_dataset(path="facebook/belebele",split="spa_Latn")

In [5]:
# Select the first five rows of the dataset for example prompts
ds_examples=ds.select(range(0,5))
ds_prompts=ds.select(range(5,len(ds)))

prompt_template="""{flores_passage}
Question: {question}
Answer A: {mc_answer1}
Answer B: {mc_answer2}
Answer C: {mc_answer3}
Answer D: {mc_answer4}
Correct answer: {correct_answer}"""

# Prepare example prompts for 5-shot prompting
choices=["A","B","C","D"]
prompt_examples = "\n\n".join([ prompt_template.format(**d,correct_answer=choices[int(d["correct_answer_num"])-1]) for d in ds_examples])

In [6]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
 

In [7]:
tokenizer_path = "meta-llama/Llama-2-7b-chat-hf" 

In [8]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)


Downloading tokenizer_config.json: 100%|██████████| 1.62k/1.62k [00:00<00:00, 4.51MB/s]
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 54.3MB/s]
Downloading tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 4.41MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 973kB/s]


In [9]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [10]:
# parse model response and extract the model'schoice
def parse_choice(response):
    choices=["A","B","C","D"]
    
    if len(response)==1:
        return choices.index(response[0]) + 1 if response[0] in choices else None
    elif response[0] in choices and not response[1].isalpha():
        return choices.index(response[0]) + 1
    else:
        return None

# sampling parameters: llama-precise
gen_config = {
    "temperature": 0.7,
    "top_p": 0.1,
    "repetition_penalty": 1.18,
    "top_k": 40,
    "do_sample": True,
    "max_new_tokens": 5,
    "pad_token_id": pipeline.tokenizer.eos_token_id,
}

# Loop through prompts and evaluate model responses
q_correct = q_total = 0
for rowNo, row in enumerate(tqdm(ds_prompts)):        
    # Construct the prompt by combining the example prompts and the current row's question
    prompt=(prompt_examples + "\n\n" + prompt_template.format(**row, correct_answer="")).strip()

    # Generate a response from the model
    response=pipeline(prompt, **gen_config)[0]["generated_text"][len(prompt):]
    if "\n" in response:
        response=response.split("\n")[0]

    # Parse the model's choice and compare it to the correct answer
    choice=parse_choice(response.strip())
    if choice==int(row["correct_answer_num"]):
        q_correct+=1 
    q_total+=1

print(f"{q_total} questions, {q_correct} correct ({round(q_correct/q_total*100,1)}%)")  

  0%|          | 0/895 [00:00<?, ?it/s]

100%|██████████| 895/895 [14:37<00:00,  1.02it/s]

895 questions, 489 correct (54.6%)





In [11]:
#model.load_adapter("/home/cslab02/Desktop/TesisSebasMena/text-generation-webui/loras/llama2-chat-7b-spanish-256-saved/")

In [12]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Exception: Impossible to guess which tokenizer to use. Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer.