In [3]:
import pandas as pd
import json
# from unsloth import FastLanguageModel
# from unsloth.chat_templates import get_chat_template
# from unsloth import is_bfloat16_supported
# from trl import SFTTrainer
from transformers import TrainingArguments, HfArgumentParser
from dataclasses import dataclass
from datasets import Dataset
import torch_optimizer
import torch

In [10]:
a = torch.Tensor()
b = torch.Tensor([1, 2])
a = torch.cat([a, b])
a

tensor([1., 2.])

In [25]:
from peft import LoraConfig, WeightLoraConfig
target_modules=['up_proj', 'down_proj', 'gate_proj', 
                    'k_proj', 'q_proj', 'v_proj', 'o_proj']
peft_config = WeightLoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

In [26]:
peft_config.__class__.__name__

'WeightLoraConfig'

In [2]:
@dataclass
class ModelArguments:
    #model_name: str = "/media/ssd-3t/akazakov/llama31instr/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693"
    # model_name: str = "unsloth/Meta-Llama-3.1-8B" 
    #model_name: str = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
    model_name: str = "FacebookAI/roberta-base"
    max_seq_length: int = 1000
    dtype: str = None
    load_in_4bit: bool = False

@dataclass
class TrainingArguments(TrainingArguments):
    per_device_train_batch_size: int = 16
    gradient_accumulation_steps: int = 4
    warmup_steps: int = 5
    num_train_epochs: int = 5
    learning_rate: float = 1e-10
    logging_steps: int = 1
    optim: str = "adamw_hf"
    weight_decay: float = 0.01
    lr_scheduler_type: str = "linear"
    seed: int = 18
    output_dir: str = "train_outputs"
    # output_dir: str = None
    sign_step: int = 5000
    max_grad_norm: float = 1.0
    max_steps: int = 2 # overrides num_train_epochs
    report_to: str = "none" # "none" or "wandb"
 
@dataclass
class DataArguments:
    train_file: str = 'data/train_ft_short_system.jsonl'

In [3]:
import pipelines.adapters as adapters
import pipelines.optimizers as optimizers
import pipelines.utils as utils
%load_ext autoreload
%autoreload 2

In [4]:
model_args = ModelArguments
training_args = TrainingArguments
utils.set_seed(18)
device = utils.set_device(3)

There are 8 GPU(s) available.
We will use the GPU: NVIDIA A100-PCIE-40GB


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name,
    load_in_8bit=model_args.load_in_4bit,
    device_map=device
)
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name)
# print(model)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [6]:
from peft import WeightLoraConfig, LoraConfig, LoKrConfig
from peft import LoKrModel, LoraModel, WeightLoraModel
from peft import get_peft_model
from peft import PeftConfig, PeftType
target_modules = ["query", "key", "value"]
weight_lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules=target_modules,
    # rank_dropout=0.0,
    # module_dropout=0.0,
)
# model = WeightLoraModel(model, weight_lora_config, adapter_name="default")
# model = LoraModel(model, weight_lora_config, adapter_name="default")
model = get_peft_model(model, weight_lora_config)
model.print_trainable_parameters()
print(model)

trainable params: 442,368 || all params: 125,139,801 || trainable%: 0.3535
PeftModel(
  (base_model): LoraModel(
    (model): RobertaForCausalLM(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    

In [7]:
import datasets
dataset_name = 'cais/mmlu'
dataset_config_name = 'philosophy'
dataset = datasets.load_dataset(dataset_name, dataset_config_name)
train = utils.make_mlm_dataset_form_mmlu(dataset['test'])
test = utils.make_mlm_dataset_form_mmlu(dataset['validation'])
dataset = datasets.DatasetDict({"test" : test, "train" : train})
def tokenize_function(examples):
    return tokenizer(examples['text'], return_special_tokens_mask=True)
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True
)
print(tokenized_dataset)

Map: 100%|██████████| 34/34 [00:00<00:00, 1837.36 examples/s]
Map: 100%|██████████| 311/311 [00:00<00:00, 10784.86 examples/s]

DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 34
    })
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 311
    })
})





In [50]:
# for name, param in model.named_parameters():
#     if "lm_head" in name or "embed" in name:
#         param.requires_grad = True

In [14]:
model.named_parameters()

<generator object Module.named_parameters at 0x7f8760328140>

In [15]:
from transformers import get_scheduler
optimizer = optimizers.AdamW(model.parameters(), 
                                 lr=training_args.learning_rate,
                                 weight_decay=training_args.weight_decay)
scheduler = get_scheduler(name=training_args.lr_scheduler_type, 
                            optimizer=optimizer, 
                            num_warmup_steps=training_args.warmup_steps,
                            num_training_steps=training_args.max_steps)

[autoreload of pipelines.optimizers failed: Traceback (most recent call last):
  File "/home/shkodnik/Sber_Lora/ShkodnikVenv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/home/shkodnik/Sber_Lora/ShkodnikVenv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/home/shkodnik/Sber_Lora/ShkodnikVenv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/home/shkodnik/Sber_Lora/ShkodnikVenv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 349, in update_class
    if update_generic(old_obj, new_obj):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/shkodnik/Sber_Lora/ShkodnikVenv/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/home/shkodnik/Sber_Lora/ShkodnikVenv/li

In [16]:
import transformers
run_name = "test"
trainer = transformers.Trainer(
    model=model,
    tokenizer=tokenizer,
    #train_dataset=dataset,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    args=TrainingArguments(
        max_steps=10,
        # learning_rate=1e-3, 
        # fp16=True, 
        output_dir=training_args.output_dir, 
        use_cpu=False, 
        save_safetensors=False,
        # report_to=report_to,
        report_to="none",
        logging_steps=1,
        # run_name=f"prob={int(prob*100)}/100_k={k}",
        run_name=run_name,
        # run_name="test",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15),
    optimizers=[optimizer, scheduler]
)

trainer_stats = trainer.train()

max_steps is given, it will override any value given in num_train_epochs


274


Step,Training Loss
1,10.8251
2,10.8251
3,10.8251
4,10.8251
5,10.8251
6,10.8251
7,10.8251
8,10.8251
9,10.8251
10,10.8251


274
274
274
274
274
274
274
274
274




: 

In [17]:
from trl import SFTTrainer
?SFTTrainer

[0;31mInit signature:[0m
[0mSFTTrainer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mmodeling_utils[0m[0;34m.[0m[0mPreTrainedModel[0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mmodules[0m[0;34m.[0m[0mmodule[0m[0;34m.[0m[0mModule[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0margs[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtraining_args[0m[0;34m.[0m[0mTrainingArguments[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_collator[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0mdata_collator[0m[0;34m.[0m[0mDataCollator[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain_dataset[0m[0;34m:[0m [0mOpti

In [None]:
5e-4

In [71]:
for name, param in model.named_parameters():
    if "lora_A" in name:
        print(name, "; sum = ", param.sum().item())

base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.0.attention.self.key.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.1.attention.self.key.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.2.attention.self.key.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.2.attention.self.value.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.layer.3.attention.self.query.lora_A.default.weight ; sum =  nan
base_model.model.roberta.encoder.lay