In [1]:
import os, sys
import warnings


warnings.filterwarnings(
    "ignore", category=UserWarning, module="torchvision.io.image", lineno=13
)
warnings.filterwarnings(
    "ignore",
    message="The installed version of bitsandbytes was compiled without GPU support.*",
    category=UserWarning,
    module='bitsandbytes.cextension'
)
warnings.filterwarnings("ignore")
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="This implementation of AdamW is deprecated",
)

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["NUMEXPR_MAX_THREADS"] = "28"
os.environ["ENABLE_SDP_FUSION"] = "true"
os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"]="1"

In [2]:
from datasets import load_dataset
import torch
device = torch.device("cpu")
emotions = load_dataset("dair-ai/emotion", trust_remote_code=True , download_mode='force_redownload')

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoTokenizer

model_name = "openlm-research/open_llama_3b_v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

In [4]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSequenceClassification
num_labels = 6
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_3b_v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
emotions_encoded["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [7]:
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
emotions_encoded["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [8]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [9]:
from transformers import Trainer, TrainingArguments
import torch.optim as optim

batch_size = 16
logging_steps = len(emotions_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=1,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  #optim='adamw_hf',
                                  report_to=[],
                                  bf16=True,
                                  use_ipex=True)

2024-07-15 07:47:24.426499: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 07:47:25.507094: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


/home/u79bece8b01d17b507a6b09a7919d56b/.local/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
[2024-07-15 07:47:42,536] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cpu (auto detect)


In [10]:
from peft import get_peft_model, LoraConfig, PeftModel

LORA_CONFIG = LoraConfig(
    r=8, 
    lora_alpha=16, 
    #target_modules=["query", "value"], 
    target_modules = ["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

model = get_peft_model(model, LORA_CONFIG)

In [11]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"])
trainer.train(); 

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9643,0.830996,0.705,0.672208


In [12]:
results = trainer.evaluate() #without lora 0.935
results

{'eval_loss': 0.8309964537620544,
 'eval_accuracy': 0.705,
 'eval_f1': 0.6722076898511364,
 'eval_runtime': 108.7362,
 'eval_samples_per_second': 18.393,
 'eval_steps_per_second': 1.15,
 'epoch': 1.0}

In [None]:
preds_output = trainer.predict(emotions_encoded["test"])
preds_output.metrics #without lora 0.935

In [None]:
import numpy as np
from sklearn.metrics import plot_confusion_matrix
y_valid = np.array(emotions_encoded["validation"]["label"])
y_preds = np.argmax(preds_output.predictions, axis=1)
labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
plot_confusion_matrix(y_preds, y_valid, labels)

In [13]:
model.save_pretrained('./llama_model')
tokenizer.save_pretrained('./llama_model')

('./llama_model/tokenizer_config.json',
 './llama_model/special_tokens_map.json',
 './llama_model/tokenizer.model',
 './llama_model/added_tokens.json',
 './llama_model/tokenizer.json')

In [3]:
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, LlamaTokenizer
import torch

model_pat = "./llama_model"

model = AutoModel.from_pretrained(model_pat)

# Load state_dict manually
checkpoint = torch.load(model_pat + "/adapter_model.bin")
model.load_state_dict(checkpoint, strict=False)  # `strict=False` allows partial loading

model = AutoModelForSequenceClassification.from_pretrained(model_pat, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(model_pat)
tokenizer.pad_token = tokenizer.eos_token


# inputs = "i think someone is outside the house"
# inputs = tokenizer(inputs, padding='max_length', truncation=True, return_tensors='pt')
# with torch.no_grad():
#     outputs = model(**inputs)

# logits = outputs.logits
# predicted_class = logits.argmax(dim=-1).item()
# print(outputs)
# print("Predicted class:", predicted_class)

Loading adapter weights from ./llama_model led to unexpected keys not found in the model:  ['model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.layers.0.self_attn.k_proj.lora_A.default.weight', 'model.layers.0.self_attn.k_proj.lora_B.default.weight', 'model.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.layers.1.self_attn.q_proj.lora_A.default.weight', 'model.layers.1.self_attn.q_proj.lora_B.default.weight', 'model.layers.1.self_attn.k_proj.lora_A.default.weight', 'model.layers.1.self_attn.k_proj.lora_B.default.weight', 'model.layers.1.self_attn.v_proj.lora_A.default.weight', 'model.layers.1.self_attn.v_proj.lora_B.default.weight', 'model.layers.2.self_attn.q_proj.lora_A.default.weight', 'model.layers.2.self_attn.q_proj.lora_B.default.weight', 'model.layers.2.self_attn.k_proj.lora_A.default.weight', 'model.layers.2.self_attn.k_proj.lora_B.default.weigh

RuntimeError: Error(s) in loading state_dict for LlamaForSequenceClassification:
	size mismatch for model.layers.0.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.0.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.0.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.0.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.0.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.0.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.1.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.1.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.1.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.1.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.1.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.1.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.2.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.2.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.2.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.2.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.2.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.2.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.3.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.3.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.3.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.3.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.3.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.3.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.4.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.4.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.4.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.4.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.4.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.4.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.5.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.5.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.5.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.5.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.5.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.5.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.6.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.6.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.6.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.6.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.6.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.6.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.7.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.7.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.7.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.7.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.7.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.7.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.8.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.8.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.8.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.8.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.8.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.8.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.9.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.9.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.9.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.9.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.9.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.9.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.10.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.10.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.10.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.10.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.10.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.10.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.11.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.11.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.11.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.11.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.11.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.11.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.12.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.12.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.12.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.12.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.12.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.12.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.13.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.13.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.13.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.13.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.13.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.13.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.14.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.14.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.14.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.14.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.14.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.14.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.15.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.15.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.15.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.15.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.15.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.15.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.16.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.16.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.16.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.16.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.16.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.16.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.17.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.17.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.17.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.17.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.17.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.17.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.18.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.18.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.18.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.18.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.18.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.18.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.19.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.19.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.19.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.19.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.19.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.19.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.20.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.20.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.20.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.20.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.20.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.20.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.21.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.21.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.21.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.21.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.21.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.21.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.22.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.22.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.22.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.22.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.22.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.22.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.23.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.23.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.23.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.23.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.23.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.23.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.24.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.24.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.24.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.24.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.24.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.24.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.25.self_attn.q_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.25.self_attn.q_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.25.self_attn.k_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.25.self_attn.k_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for model.layers.25.self_attn.v_proj.lora_A.default.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([8, 3200]).
	size mismatch for model.layers.25.self_attn.v_proj.lora_B.default.weight: copying a param with shape torch.Size([50, 1, 16, 64, 2]) from checkpoint, the shape in current model is torch.Size([3200, 8]).
	size mismatch for score.weight: copying a param with shape torch.Size([1, 100, 16, 16, 2]) from checkpoint, the shape in current model is torch.Size([6, 3200]).