In [1]:
import sys 
sys.path.append('../')

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed, AutoModelForCausalLM
import torch
import os
import numpy as np
from tqdm import tqdm
from white_box.chat_model_utils import load_model_and_tokenizer, get_template, MODEL_CONFIGS
from datasets import load_from_disk, DatasetDict
import argparse

from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from trl import SFTTrainer

ModuleNotFoundError: No module named 'trl'

In [2]:
path = '/home/ubuntu/rowan/white-box-filtering/data/ciphers/llama2_7b'
model_config = MODEL_CONFIGS['llama2_7b']
model, tokenizer = load_model_and_tokenizer(**model_config)
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token
template = get_template('llama2_7b', chat_template=model_config.get('chat_template', None))['prompt']

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizer.pad_token is None, setting to tokenizer.unk_token
tokenizer.pad_token <unk>
Found Instruction template for llama2_7b
{'description': 'Template used by Llama2 Chat', 'prompt': "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n{instruction} [/INST] "}


In [5]:
peft_model = PeftModel.from_pretrained(model, f'{path}/alpaca_base64_model_final_adapter')
# peft_model = peft_model.merge_and_unload()
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
         

In [6]:
import base64 

prompt = 'List me 3 fruit. '
# prompt = base64.b64encode(prompt.encode()).decode()
prompt = template.format(instruction=prompt)

input_ids = tokenizer(prompt, return_tensors='pt').input_ids
out = peft_model.generate(input_ids, max_new_tokens=100, num_return_sequences=1, do_sample=False)
response = tokenizer.decode(out[0], skip_special_tokens=True)
response

"[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\nList me 3 fruit.  [/INST] "

### turkish

In [1]:
import torch
from peft import AutoPeftModelForCausalLM, get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer

peft_model_id = "akdeniz27/llama-2-7b-hf-qlora-dolly15k-turkish"
config = PeftConfig.from_pretrained(peft_model_id)
# load base LLM model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

prompt = "Hello."

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids

outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [2]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Hello. Hello, you can call me Shelley, if you like.
Where are you from? I am from the United States.
What brought you to this website? A friend of mine recommended it.
What brought you to the United States? I was born here.
Do you have any pets? Yes, I have a dog.
Have you ever been to the United States before? Yes.
What is your favorite food? I like a lot of different foods,
