In [1]:
# To crate venv use 
# python -m venv `/path/to/new/virtual/environment`
import torch
import numpy
import transformers
import datasets 
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# прикол для сервера
import random
import numpy as np
import torch
import os
os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID'
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = 'false'

def set_seed(seed): # ставит сид
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def set_device(device_no: int): # выбирает GPU-шку и выводит название
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{device_no}")
        print("There are %d GPU(s) available." % torch.cuda.device_count())
        print("We will use the GPU:", torch.cuda.get_device_name(device_no))
    else:
        print("No GPU available, using the CPU instead.")
        device = torch.device("cpu")

    return device

set_seed(18)
device = set_device(2)
print(f'GPU available? {torch.cuda.is_available()}')

There are 8 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-80GB
GPU available? True


## Model

### Downloading the model

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "FacebookAI/roberta-base"
weight_type = torch.float16 # in gpu can be torch.float16
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=weight_type,
    load_in_8bit=False,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(model)
device = model.device
print(f'model device = {device}')

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 



In [5]:
model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

# class CastOutputToFloat(nn.Sequential):
#     def forward(self, x): return super().forward(x).to(weight_type)
# model.lm_head = CastOutputToFloat(model.lm_head)

### Choose trainable parameters

In [6]:
layer_filter = lambda name : ('attention' in name) and \
    (('query' in name) or ('value' in name)) and \
    ('bias' not in name)
    
print(f"Chosen layers: ")
chosen_layers = [] 
for name, param in model.named_modules():
    if layer_filter(name):
        right_name = name
        if model_name == "FacebookAI/roberta-base":
            chosen_layers.append(name)
        print(f"#{len(chosen_layers)}:  name = {name}")

print(f"Overall chosen layers: {len(chosen_layers)}")

Chosen layers: 
#1:  name = roberta.encoder.layer.0.attention.self.query
#2:  name = roberta.encoder.layer.0.attention.self.value
#3:  name = roberta.encoder.layer.1.attention.self.query
#4:  name = roberta.encoder.layer.1.attention.self.value
#5:  name = roberta.encoder.layer.2.attention.self.query
#6:  name = roberta.encoder.layer.2.attention.self.value
#7:  name = roberta.encoder.layer.3.attention.self.query
#8:  name = roberta.encoder.layer.3.attention.self.value
#9:  name = roberta.encoder.layer.4.attention.self.query
#10:  name = roberta.encoder.layer.4.attention.self.value
#11:  name = roberta.encoder.layer.5.attention.self.query
#12:  name = roberta.encoder.layer.5.attention.self.value
#13:  name = roberta.encoder.layer.6.attention.self.query
#14:  name = roberta.encoder.layer.6.attention.self.value
#15:  name = roberta.encoder.layer.7.attention.self.query
#16:  name = roberta.encoder.layer.7.attention.self.value
#17:  name = roberta.encoder.layer.8.attention.self.query
#18:  n

In [7]:
chosen_layers = chosen_layers[:2] # baby steps

In [8]:
import torch.nn as nn
for name, param in model.named_parameters():
    # if name not in chosen_layers:
    #     param.requires_grad = False
    param.requires_grad = False
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32) 
for param in model.lm_head.parameters():
    param.requires_grad = False

In [9]:
import pipelines.utils as utils
utils.print_trainable_parameters(model)

trainable params: 0 || all params: 124697433 || trainable%: 0.0


In [10]:
# add adapter
for i in range(0, len(chosen_layers) // 2):
    model.roberta.encoder.layer[i].attention.self.query = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.query)
    model.roberta.encoder.layer[i].attention.self.value = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.value)

In [11]:
utils.print_trainable_parameters(model)

trainable params: 1179648 || all params: 125877081 || trainable%: 0.9371427988547018


In [12]:
model

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): AdapterLayer(
                (module): Linear(in_features=768, out_features=768, bias=True)
                (adapter): Linear(in_features=768, out_features=768, bias=False)
              )
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): AdapterLayer(
                (module): Linear(in_features=768, out_features=768, bias=True)
                (adapter): Linear(in_feature

### Inference of the model

In [10]:
# from transformers import pipeline
# sentence = "I enjoy to <mask> in sberbank."
# unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer, device=device)
# print(f"Original sentence: {sentence}")
# predictions = unmasker(sentence)
# for i in range(len(predictions)):
#     print(f"#{i+1}: {predictions[i]['sequence']} || score = {predictions[i]['score']}")

## Dataset

### Downloading the dataset

In [13]:
from datasets import load_dataset

dataset_name = 'cais/mmlu'
dataset_config_name = 'philosophy'

# Can be changed to wiki (as in Micrisoft):
# dataset_name = 'wikitext'
# dataset_config_name = 'wikitext-2-raw-v1'

dataset = load_dataset(dataset_name, dataset_config_name)
dataset

Downloading readme: 100%|██████████| 53.2k/53.2k [00:00<00:00, 483kB/s]
Downloading metadata: 100%|██████████| 138k/138k [00:00<00:00, 3.85MB/s]


DatasetDict({
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 311
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 34
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 5
    })
})

### Preprocessing

In [14]:
# only for MMLU
num = 1
print(f"question: {dataset['test']['question'][num]}")
print(f"subject: {dataset['test']['subject'][num]}")
print(f"choices: {dataset['test']['choices'][num]}")
print(f"answer: {dataset['test']['answer'][num]}")

question: For Socrates, an unexamined life is a tragedy because it results in grievous harm to _____.
subject: philosophy
choices: ['the state', 'the justice system', 'the body', 'the soul']
answer: 3


In [15]:
import datasets
def make_mlm_dataset_form_mmlu(dataset, head=3):
    dataset_list = []
    for a in dataset:
        q = a['question']
        q = q.replace('_', '')
        q += ' ' + a['choices'][a['answer']]
        q = q.replace('.', '')
        q = q.replace('  ', ' ')
        q += '.'
        dataset_list.append({"text" : q})

    if head > 0:
        print("Examples:")
    for i, a in enumerate(dataset_list[:head]):
        print(f"#{i+1}: {a['text']}")

    return_dataset = datasets.Dataset.from_list(dataset_list)
    return return_dataset

In [16]:
print('TRAIN')
train = make_mlm_dataset_form_mmlu(dataset['test'])
print(f'NUM ROWS = {len(train)}', '-'*100, sep='\n')
print('TEST')
test = make_mlm_dataset_form_mmlu(dataset['validation'])
print(f'NUM ROWS = {len(test)}', '-'*100, sep='\n')
print('VALIDATION')
val = make_mlm_dataset_form_mmlu(dataset['dev'])
print(f'NUM ROWS = {len(val)}', '-'*100, sep='\n')
dataset = datasets.DatasetDict({"test" : test,
                                "train" : train,
                                "validation" : val})
dataset

TRAIN
Examples:
#1: Aesthetics deals with objects that are not essential to our existence.
#2: For Socrates, an unexamined life is a tragedy because it results in grievous harm to the soul.
#3: According to Kant, nothing can be called “good” without qualification except a good will.
NUM ROWS = 311
----------------------------------------------------------------------------------------------------
TEST
Examples:
#1: One of the aims of philosophy is to think critically about whether there are good reasons for adopting our beliefs Reasons are considered "good reasons" if they are consistent with everyday experience and: take into account objections, are acceptable to impartial third parties, and avoid undesirable consequences.
#2: The existence of a form of mental illness known as multiple personality disorder seems to suggest that the mind is divisible.
#3: Singer’s argument begins with the assumption that: suffering and death from lack of food, shelter, and medical care are bad.
NUM ROW

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 34
    })
    train: Dataset({
        features: ['text'],
        num_rows: 311
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5
    })
})

In [17]:
type(model)

transformers.models.roberta.modeling_roberta.RobertaForCausalLM

### Applying tokenizer

In [18]:
def tokenize_function(examples):
    return tokenizer(examples['text'], return_special_tokens_mask=True)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True
)

Map: 100%|██████████| 34/34 [00:00<00:00, 1604.34 examples/s]
Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map: 100%|██████████| 311/311 [00:00<00:00, 11654.80 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 1354.92 examples/s]


In [25]:
tokenized_dataset

DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 34
    })
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 311
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 5
    })
})

## Train

### Wandb

In [17]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mshkodnik[0m ([33mshkodnik-mipt[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [18]:
import os
os.environ["WANDB_PROJECT"] = "SBER_LORA"

In [139]:
# reload model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=weight_type,
    load_in_8bit=False,
    device_map=device
)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
for name, param in model.named_parameters():
    param.requires_grad = False
    if param.ndim == 1:
        param.data = param.data.to(torch.float32) 
for param in model.lm_head.parameters():
    param.requires_grad = False

for i in range(0, len(chosen_layers) // 2):
    model.roberta.encoder.layer[i].attention.self.query = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.query)
    model.roberta.encoder.layer[i].attention.self.value = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.value)
utils.print_trainable_parameters(model)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


trainable params: 14155776 || all params: 138853209 || trainable%: 10.194777709458627


In [19]:
wandb_config = {"model_name" : model_name,
                "optimizer" : "StoIHT",
                "prob" : 1., # c какой вероятностью делать approx
                "k" : 50,
                "approx_func" : utils.approx_0,
                "proj_func" : utils.proj_0,
                "lr" : 0.1,
                "max_steps" : 100}
report_to = "none" # "none" or "wandb"

AttributeError: module 'pipelines.utils' has no attribute 'approx_0'

In [20]:
if wandb_config["optimizer"] == "StoIHT":    
    optimizer = utils.StoIHT(model.parameters(), k=wandb_config["k"], 
                             approx=wandb_config["approx_func"], 
                             proj=wandb_config["proj_func"], 
                             lr=wandb_config["lr"], 
                             prob=wandb_config["prob"])

NameError: name 'wandb_config' is not defined

In [23]:
import transformers
import time
Time = str(time.ctime()).replace("  ", " ").replace(" ", "_").replace(':', '-')

ars = transformers.TrainingArguments(
    # per_device_train_batch_size=1, 
    # gradient_accumulation_steps=16, 
    # warmup_steps=10, 
    # max_steps=wandb_config["max_steps"], 
    max_steps=10,
    # learning_rate=1e-3, 
    fp16=True, 
    output_dir=f"my_lora/outputs/{Time}", 
    use_cpu=False, 
    save_safetensors=False,
    # report_to=report_to,
    report_to="none",
    logging_steps=1,
    # run_name=f"prob={int(prob*100)}/100_k={k}",
    run_name=f"{Time}",
    # run_name="test"
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    args=ars,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15),
    optimizers=[None, None],
)

max_steps is given, it will override any value given in num_train_epochs


In [24]:
ret = trainer.train()

Step,Training Loss
1,17.5679
2,15.3683
3,17.1707
4,18.4098
5,16.9691
6,16.1388
7,16.2351
8,14.2836
9,15.1389
10,15.6612


In [41]:
try:
    ret = trainer.train()
    if report_to == "wandb":
        wandb.config.update(wandb_config)
        wandb.finish()
except Exception as err:
    print("ERROR!")
    print(err)
    if report_to == "wandb":
        wandb.finish()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,7.3786
2,8.0805
3,6.9909
4,8.505
5,7.6886
6,6.4699
7,7.3129
8,7.2248
9,7.6534
10,6.7475


: 

In [203]:
trainer.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 18.2847900390625,
 'eval_runtime': 1.8194,
 'eval_samples_per_second': 18.687,
 'eval_steps_per_second': 2.748,
 'epoch': 0.05128205128205128}