In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,3,4,6"
os.environ['TRANSFORMERS_CACHE'] = '/raid/rabikov/hf_cache/'
os.environ['HF_HOME'] = '/raid/rabikov/hf_cache/'

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaTokenizer, LlamaForCausalLM
import torch





  from .autonotebook import tqdm as notebook_tqdm


In [2]:

model_id = "decapoda-research/llama-30b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


In [3]:

tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id,
                                            quantization_config=bnb_config,
                                            device_map="auto")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /raid/rabikov/anaconda3/envs/conda_taxonomy/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so
CUDA SETUP: CUDA runtime path found: /raid/rabikov/anaconda3/envs/conda_taxonomy/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /raid/rabikov/anaconda3/envs/conda_taxonomy/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
  warn(msg)
Loading checkpoint shards: 100%|██████████| 61/61 [01:50<00:00,  1.81s/it]


In [4]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=16, 
   # target_modules=["query_key_value"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [6]:
model.print_trainable_parameters()

trainable params: 12,779,520 || all params: 16,490,646,016 || trainable%: 0.07749556923119148


In [7]:
txt = tokenizer('hello how are you doing?')

In [8]:
label = tokenizer.encode('i am doing well', return_tensors='pt', add_special_tokens=False)

In [26]:
label[0]

tensor([ 474,  626, 2599, 1532])

In [9]:
input_seq = torch.concat([txt['input_ids'], label], dim=1)

TypeError: expected Tensor as element 0 in argument 0, but got list

In [10]:
processed_term  = 'how are you?'
target = "great!"


encoded_term = tokenizer.encode(
    processed_term, return_tensors='pt'
)
encoded_target = tokenizer.encode(target, return_tensors='pt', add_special_tokens=False)

input_seq = torch.concat([encoded_term, encoded_target], dim=1)
labels = input_seq.clone()
labels[0, : encoded_term.size()[1]] = -100

att_mask_inputs = torch.zeros_like(input_seq)
att_mask_inputs[input_seq != 0] = 1

In [11]:
labels, input_seq

(tensor([[ -100,  -100,  -100,  -100,  -100,  2107, 29991]]),
 tensor([[    0,   920,   526,   366, 29973,  2107, 29991]]))

In [12]:
out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)

In [13]:
loss = out.loss

In [14]:
loss

tensor(5.7479, grad_fn=<ToCopyBackward0>)

In [43]:
loss.backward()

In [44]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [45]:
optimizer.step()

In [46]:
out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)

In [48]:
out.loss.backward()

In [49]:
optimizer.zero_grad()
optimizer.step()

In [50]:
out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)

In [52]:
for i in range(10):
    optimizer.zero_grad()

    out = model.forward(input_ids=input_seq, attention_mask=att_mask_inputs, labels=labels)
    loss = out.loss
    print(loss)
    loss.backward()
    optimizer.step()

tensor(5.7083, grad_fn=<ToCopyBackward0>)
tensor(5.6528, grad_fn=<ToCopyBackward0>)
tensor(5.5960, grad_fn=<ToCopyBackward0>)
tensor(5.5263, grad_fn=<ToCopyBackward0>)
tensor(5.4481, grad_fn=<ToCopyBackward0>)
tensor(5.3696, grad_fn=<ToCopyBackward0>)
tensor(5.2882, grad_fn=<ToCopyBackward0>)
tensor(5.1871, grad_fn=<ToCopyBackward0>)
tensor(5.0791, grad_fn=<ToCopyBackward0>)
tensor(4.9481, grad_fn=<ToCopyBackward0>)
