In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
import numpy as np
import torch
from torch.nn.utils import parametrize
from torch import nn

# import wandb

# import os
# os.environ["WANDB_PROJECT"] = "lora-from-scratch"
# os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# wandb.login()

def get_device():
    import torch
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print(f"Using device {device}")
    return device

In [3]:
# device = get_device()
device = torch.device("cpu")
print(device)

cpu


In [18]:
class LoRAParametrization(nn.Module):
  def __init__(self, features_in, features_out, rank=1, alpha=1):
    super().__init__()

    self.lora_A = nn.Parameter(torch.zeros(rank, features_out))
    self.lora_B = nn.Parameter(torch.zeros(features_in, rank))
    nn.init.normal_(self.lora_A, mean=0, std=1)

    self.scale = alpha / rank
    self.enabled = True

  def forward(self, original_weights):
    if self.enabled:
      device = original_weights.device
      return original_weights + torch.matmul(self.lora_B, self.lora_A).view(original_weights.shape).to(device) * self.scale
    else:
      return original_weights

In [29]:
def linear_layer_parametrization(layer, rank=1, lora_alpha=1):
  features_in, features_out = layer.weight.shape
  return LoRAParametrization(features_in, features_out, rank, lora_alpha)

In [20]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(device)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
text = "This was a great movie"
encoded_input = tokenizer(text, return_tensors='pt').to(device)
output = model(**encoded_input)
print(output)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutput(loss=None, logits=tensor([[-0.2346,  0.0063]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [21]:
for param in model.parameters():
    param.requires_grad = False

model.pre_classifier.weight.requires_grad = True
model.classifier.weight.requires_grad = True
model.pre_classifier.bias.requires_grad = True
model.classifier.bias.requires_grad = True

In [22]:
imdb = load_dataset("imdb")

In [23]:
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])

In [24]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [26]:

repo_name = "lora-from-scratch"

training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=64,
   per_device_eval_batch_size=64,
   num_train_epochs=1,
   weight_decay=0.01,
   save_strategy="epoch"
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Network error (ConnectionError), entering retry loop.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01114450597777679, max=1.0)…

Problem at: /Users/pierrecounathe/Desktop/projects/simple-implementations/.env/lib/python3.11/site-packages/wandb/sdk/wandb_init.py 854 getcaller


wandb: Network error (ConnectionError), entering retry loop.


KeyboardInterrupt: 

In [14]:
trainer.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 1.47 GB, other allocations: 4.65 GB, max allowed: 6.77 GB). Tried to allocate 768.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [30]:
parametrize.register_parametrization(
    model.pre_classifier, "weight", linear_layer_parametrization(model.pre_classifier)
)

parametrize.register_parametrization(
    model.classifier, "weight", linear_layer_parametrization(model.classifier)
)

ParametrizedLinear(
  in_features=768, out_features=2, bias=True
  (parametrizations): ModuleDict(
    (weight): ParametrizationList(
      (0): LoRAParametrization()
    )
  )
)

In [31]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [34]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011119386044444783, max=1.0…

Problem at: /Users/pierrecounathe/Desktop/projects/simple-implementations/.env/lib/python3.11/site-packages/wandb/sdk/wandb_init.py 854 getcaller


wandb: Network error (ConnectionError), entering retry loop.


KeyboardInterrupt: 