In [1]:
%load_ext autoreload
%autoreload 2

import utils

In [2]:
# прикол для сервера
import random
import numpy as np
import torch
import os
os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID'
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = 'false'

def set_seed(seed): # ставит сид
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def set_device(device_no: int): # выбирает GPU-шку и выводит название
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{device_no}")
        print("There are %d GPU(s) available." % torch.cuda.device_count())
        print("We will use the GPU:", torch.cuda.get_device_name(device_no))
    else:
        print("No GPU available, using the CPU instead.")
        device = torch.device("cpu")

    return device

set_seed(18)
# device = set_device(7)
print(f'GPU available? {torch.cuda.is_available()}')

GPU available? False


## Model

### Downloading the model

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "FacebookAI/roberta-base"
weight_type = torch.float32 # in gpu can be torch.float16
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=weight_type,
    load_in_8bit=False,
    device_map='cpu',
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(model)
device = model.device
print(f'model device = {device}')

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [15]:
model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

# class CastOutputToFloat(nn.Sequential):
#     def forward(self, x): return super().forward(x).to(weight_type)
# model.lm_head = CastOutputToFloat(model.lm_head)

### Choose trainable parameters

In [16]:
layer_filter = lambda name : ('attention' in name) and \
    (('query' in name) or ('value' in name)) and \
    ('bias' not in name)
    
print(f"Chosen layers: ")
chosen_layers = []
for name, param in model.named_modules():
    if layer_filter(name):
        right_name = name
        if model_name == "FacebookAI/roberta-base":
            chosen_layers.append(name)
        print(f"#{len(chosen_layers)}:  name = {name}")

print(f"Overall chosen layers: {len(chosen_layers)}")

Chosen layers: 
#1:  name = roberta.encoder.layer.0.attention.self.query
#2:  name = roberta.encoder.layer.0.attention.self.value
#3:  name = roberta.encoder.layer.1.attention.self.query
#4:  name = roberta.encoder.layer.1.attention.self.value
#5:  name = roberta.encoder.layer.2.attention.self.query
#6:  name = roberta.encoder.layer.2.attention.self.value
#7:  name = roberta.encoder.layer.3.attention.self.query
#8:  name = roberta.encoder.layer.3.attention.self.value
#9:  name = roberta.encoder.layer.4.attention.self.query
#10:  name = roberta.encoder.layer.4.attention.self.value
#11:  name = roberta.encoder.layer.5.attention.self.query
#12:  name = roberta.encoder.layer.5.attention.self.value
#13:  name = roberta.encoder.layer.6.attention.self.query
#14:  name = roberta.encoder.layer.6.attention.self.value
#15:  name = roberta.encoder.layer.7.attention.self.query
#16:  name = roberta.encoder.layer.7.attention.self.value
#17:  name = roberta.encoder.layer.8.attention.self.query
#18:  n

In [17]:
chosen_layers = chosen_layers[:2] # baby steps

In [18]:
import torch.nn as nn
for name, param in model.named_parameters():
    # if name not in chosen_layers:
    #     param.requires_grad = False
    param.requires_grad = False
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32) 
for param in model.lm_head.parameters():
    param.requires_grad = False

In [19]:
utils.print_trainable_parameters(model)

trainable params: 0 || all params: 124697433 || trainable%: 0.0


In [21]:
# add adapter
for i in range(0, len(chosen_layers) // 2):
    model.roberta.encoder.layer[i].attention.self.query = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.query)
    model.roberta.encoder.layer[i].attention.self.value = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.value)

In [23]:
utils.print_trainable_parameters(model)

trainable params: 1179648 || all params: 125877081 || trainable%: 0.9371427988547018


### Inference of the model

In [24]:
from transformers import pipeline
sentence = "I enjoy to <mask> in sberbank."
unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)
print(f"Original sentence: {sentence}")
predictions = unmasker(sentence)
for i in range(len(predictions)):
    print(f"#{i+1}: {predictions[i]['sequence']} || score = {predictions[i]['score']}")

  Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/andrey/miniconda3/lib/python3.10/site-packages/torchvision/image.so
  Expected in:     <EACD001F-FCB9-380E-AD73-D522177FC040> /Users/andrey/miniconda3/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib
  warn(f"Failed to load image Python extension: {e}")


Original sentence: I enjoy to <mask> in sberbank.
#1: I enjoy to invest in sberbank. || score = 0.6629555225372314
#2: I enjoy to work in sberbank. || score = 0.13997869193553925
#3: I enjoy to trade in sberbank. || score = 0.02613656222820282
#4: I enjoy to participate in sberbank. || score = 0.02500171773135662
#5: I enjoy toiling in sberbank. || score = 0.020179422572255135


## Dataset

### Downloading the dataset

In [25]:
from datasets import load_dataset

dataset_name = 'cais/mmlu'
dataset_config_name = 'philosophy'

# Can be changed to wiki (as in Micrisoft):
# dataset_name = 'wikitext'
# dataset_config_name = 'wikitext-2-raw-v1'

dataset = load_dataset(dataset_name, dataset_config_name)
dataset

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 311
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 34
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 5
    })
})

### Preprocessing

In [26]:
# only for MMLU
num = 1
print(f"question: {dataset['test']['question'][num]}")
print(f"subject: {dataset['test']['subject'][num]}")
print(f"choices: {dataset['test']['choices'][num]}")
print(f"answer: {dataset['test']['answer'][num]}")

question: For Socrates, an unexamined life is a tragedy because it results in grievous harm to _____.
subject: philosophy
choices: ['the state', 'the justice system', 'the body', 'the soul']
answer: 3


In [27]:
import datasets
def make_mlm_dataset_form_mmlu(dataset, head=3):
    dataset_list = []
    for a in dataset:
        q = a['question']
        q = q.replace('_', '')
        q += ' ' + a['choices'][a['answer']]
        q = q.replace('.', '')
        q = q.replace('  ', ' ')
        q += '.'
        dataset_list.append({"text" : q})

    if head > 0:
        print("Examples:")
    for i, a in enumerate(dataset_list[:head]):
        print(f"#{i+1}: {a['text']}")

    return_dataset = datasets.Dataset.from_list(dataset_list)
    return return_dataset

In [28]:
print('TRAIN')
train = make_mlm_dataset_form_mmlu(dataset['test'])
print(f'NUM ROWS = {len(train)}', '-'*100, sep='\n')
print('TEST')
test = make_mlm_dataset_form_mmlu(dataset['validation'])
print(f'NUM ROWS = {len(test)}', '-'*100, sep='\n')
print('VALIDATION')
val = make_mlm_dataset_form_mmlu(dataset['dev'])
print(f'NUM ROWS = {len(val)}', '-'*100, sep='\n')
dataset = datasets.DatasetDict({"test" : test,
                                "train" : train,
                                "validation" : val})
dataset

TRAIN
Examples:
#1: Aesthetics deals with objects that are not essential to our existence.
#2: For Socrates, an unexamined life is a tragedy because it results in grievous harm to the soul.
#3: According to Kant, nothing can be called “good” without qualification except a good will.
NUM ROWS = 311
----------------------------------------------------------------------------------------------------
TEST
Examples:
#1: One of the aims of philosophy is to think critically about whether there are good reasons for adopting our beliefs Reasons are considered "good reasons" if they are consistent with everyday experience and: take into account objections, are acceptable to impartial third parties, and avoid undesirable consequences.
#2: The existence of a form of mental illness known as multiple personality disorder seems to suggest that the mind is divisible.
#3: Singer’s argument begins with the assumption that: suffering and death from lack of food, shelter, and medical care are bad.
NUM ROW

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 34
    })
    train: Dataset({
        features: ['text'],
        num_rows: 311
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5
    })
})

### Applying tokenizer

In [29]:
def tokenize_function(examples):
    return tokenizer(examples['text'], return_special_tokens_mask=True)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

## Train

### Wandb

In [30]:
import wandb
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingfac

True

In [31]:
import os
os.environ["WANDB_PROJECT"] = "SBER_LORA"

In [50]:
# reload model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=weight_type,
    load_in_8bit=False,
    device_map='cpu',
)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
for name, param in model.named_parameters():
    param.requires_grad = False
    if param.ndim == 1:
        param.data = param.data.to(torch.float32) 
for param in model.lm_head.parameters():
    param.requires_grad = False

for i in range(0, len(chosen_layers) // 2):
    model.roberta.encoder.layer[i].attention.self.query = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.query)
    model.roberta.encoder.layer[i].attention.self.value = \
        utils.AdapterLayer(model.roberta.encoder.layer[i].attention.self.value)
utils.print_trainable_parameters(model)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


trainable params: 1179648 || all params: 125877081 || trainable%: 0.9371427988547018


In [51]:
prob = 0.
k = 400
optimizer = utils.StoIHT(model.parameters(), k=k, 
                         approx=utils.approx_0, proj=utils.proj_0, lr=1e-1, prob=prob)

In [52]:
import transformers
import time
Time = str(time.ctime()).replace("  ", " ").replace(" ", "_").replace(':', '-')

ars = transformers.TrainingArguments(
    # per_device_train_batch_size=1, 
    # gradient_accumulation_steps=16, 
    # warmup_steps=10, 
    max_steps=15, 
    # learning_rate=1e-3, 
    fp16=False, 
    output_dir=f"my_lora/outputs/{prob}_{k}", 
    use_cpu=True, 
    save_safetensors=False,
    # report_to="wandb",
    logging_steps=1,
    run_name=f"prob={prob}/k={k}",
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    args=ars,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15),
    optimizers=[optimizer, None],
)

max_steps is given, it will override any value given in num_train_epochs


In [53]:
# model.save_pretrained(output_dir, safe_serialization=False)
try:
    ret = trainer.train()
    wandb.finish()
except Exception as err:
    print("ERROR!")
    print(err)
    wandb.finish()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Artifact name may only contain alphanumeric characters, dashes, underscores, and dots. Invalid name: model-prob=0.0/k=400


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [203]:
trainer.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 18.2847900390625,
 'eval_runtime': 1.8194,
 'eval_samples_per_second': 18.687,
 'eval_steps_per_second': 2.748,
 'epoch': 0.05128205128205128}