In [None]:
#@title 1. Keep this tab alive to prevent Colab from disconnecting you { display-mode: "form" }

#@markdown Press play on the music player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls>

In [None]:
! pip uninstall transformers -y
! pip uninstall  adapter-transformers -y
!rm -rf ./adapters-master
!git clone --single-branch --branch master https://github.com/ReDASers/adapters-master.git
! pip install --no-cache torch==2.1 torchaudio torchvision torchtext datasets evaluate  ./adapters-master/.

[0mFound existing installation: adapter-transformers 3.2.1
Uninstalling adapter-transformers-3.2.1:
  Successfully uninstalled adapter-transformers-3.2.1
Cloning into 'adapters-master'...
remote: Enumerating objects: 123410, done.[K
remote: Counting objects: 100% (22327/22327), done.[K
remote: Compressing objects: 100% (1633/1633), done.[K
remote: Total 123410 (delta 21114), reused 20725 (delta 20668), pack-reused 101083[K
Receiving objects: 100% (123410/123410), 79.02 MiB | 13.18 MiB/s, done.
Resolving deltas: 100% (94823/94823), done.
Processing ./adapters-master
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: adapter-transformers
  Building wheel for adapter-transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for adapter-transformers: filename=adapter_transformers-3.2.1-py3-none-any.whl size=6430350 sha

In [None]:
import torch
torch.__version__

'2.1.0+cu121'

In [None]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the MIT-style license found in the
# LICENSE file in the root directory of this source tree.

# FAdam (Fisher Adam): an implentation in PyTorch of the paper:
# "FAdam: Adam is a natural gradient optimizer using diagonal empirical Fisher information"
# https://www.arxiv.org/abs/2405.12807

import torch
import math
from torch import nn
from transformers import AutoModelForSequenceClassification, Trainer
import dataclasses
import os
from dataclasses import dataclass, field
from typing import Dict, Optional
import numpy as np
from transformers.adapters import AdapterTrainer, BertModelWithHeads, BertAdapterModel
from transformers import BertConfig, AutoTokenizer, EvalPrediction, GlueDataset
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (
    TrainingArguments,
    glue_output_modes,
    glue_compute_metrics,
    glue_tasks_num_labels,
    set_seed,
)
from transformers.adapters import ConfigUnion, IA3Config, PrefixTuningConfig
from transformers.optimization import Adafactor, AdafactorSchedule


from transformers.adapters import (
    AdapterFusionConfig,
    IA3Config,
    CompacterConfig,
    AutoAdapterModel,
    LoRAConfig,
    ParallelConfig,
    PrefixTuningConfig,
    PromptTuningConfig,
    Parallel,
    MAMConfig,
    PfeifferInvConfig,
    CompacterPlusPlusConfig,
    AdapterConfig,
    AdapterCompositionBlock,
    PfeifferConfig,
    HoulsbyConfig,
)
import evaluate
from evaluate import load
from datasets import load_dataset


from torch.optim.optimizer import Optimizer
from typing import Tuple, Optional


class FAdam(Optimizer):
    def __init__(
        self,
        params,
        lr: float = 1e-3,
        weight_decay: float = 0.1,
        betas: Tuple[float, float] = (0.9, 0.999),
        clip: float = 1.0,
        p: float = 0.5,
        eps: float = 1e-8,
        momentum_dtype: torch.dtype = torch.float32,
        fim_dtype: torch.dtype = torch.float32,
    ):
        """
        Args:
            params (iterable): iterable of parameters to optimize or dicts defining
                parameter groups
            lr (float, optional): learning rate (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-15)
            clip (float, optional): maximum norm of the gradient (default: 1.0)
            TODO - explain p

            # Usage
            TODO
        """
        defaults = dict(
            lr=lr,
            betas=betas,
            weight_decay=weight_decay,
            eps=eps,
            momentum_dtype=momentum_dtype,
            fim_dtype=fim_dtype,
            clip=clip,
            p=p,
        )

        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure: Optional[callable] = None) -> Optional[float]:
        """Performs a single optimization step.
        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                # to fix linter, we do not keep the returned loss for use atm.
                loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group["betas"]
            lr = group["lr"]
            eps = group["eps"]
            clip = group["clip"]
            pval = group["p"]
            momentum_dtype = group["momentum_dtype"]
            fim_dtype = group["fim_dtype"]
            weight_decay = group["weight_decay"]

            for p in group["params"]:
                if p.grad is None:
                    continue

                if p.grad.is_sparse:
                    raise RuntimeError("FAdam does not support sparse gradients")

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state.setdefault("step", torch.tensor(0.0))
                    state.setdefault(
                        "momentum", torch.zeros_like(p, dtype=momentum_dtype)
                    )
                    state.setdefault("fim", torch.ones_like(p, dtype=fim_dtype))


                # main processing -------------------------

                # update the steps for each param group update
                state["step"] += 1
                step = state["step"]

                momentum = state["momentum"]
                fim = state["fim"]
                grad = p.grad

                # begin FAdam algo -------------------------
                # 6 - beta2 bias correction per Section 3.4.4
                curr_beta2 = beta2 * (1 - beta2 ** (step - 1)) / (1 - beta2**step)

                # 7 - update fim
                fim.mul_(curr_beta2).add_(grad * grad, alpha=1 - curr_beta2)

                # 8 - adaptive epsilon
                rms_grad = torch.sqrt(torch.mean((grad * grad)))
                curr_eps = eps * min(1, rms_grad)

                # 9 - compute natural gradient
                fim_base = fim**pval + curr_eps  # **(2*pval)

                grad_nat = grad / fim_base

                # 10 - clip the natural gradient
                rms = torch.sqrt(torch.mean(grad_nat**2))
                divisor = max(1, rms)
                divisor = divisor / clip
                grad_nat = grad_nat / divisor

                # 11 - update momentum
                momentum.mul_(beta1).add_(grad_nat, alpha=1 - beta1)

                # 12 - weight decay
                grad_weights = p / fim_base

                # 13 - clip weight decay
                rms = torch.sqrt(torch.mean(grad_weights**2))
                divisor = max(1, rms)
                divisor /= clip
                grad_weights = grad_weights / divisor

                # 14 - compute update
                full_step = momentum + (weight_decay * grad_weights)
                lr_step = lr * full_step

                # 15 - update weights
                p.sub_(lr_step)

        return loss

In [51]:


set_seed(42)

TASK_NAME = "sst-2"
K = 1000
MODEL_NAME = "bert-base-cased"
OUTPUT_DIR = "/content/output"

GLUE_METRICS = {
    'cola': 'matthews_correlation',
    'sst2': 'accuracy',
    'mrpc': 'f1',
    'qqp': 'accuracy',
    'stsb': 'pearsonr',
    'mnli': 'accuracy',
    'qnli': 'accuracy',
    'rte': 'accuracy',
    'wnli': 'accuracy',
}

def get_sentence_keys():
    task_to_keys = {
        "cola": ("sentence", None),
        "mnli": ("premise", "hypothesis"),
        "mnli-mm": ("premise", "hypothesis"),
        "mrpc": ("sentence1", "sentence2"),
        "qnli": ("question", "sentence"),
        "qqp": ("question1", "question2"),
        "rte": ("sentence1", "sentence2"),
        "sst2": ("sentence", None),
        "stsb": ("sentence1", "sentence2"),
        "wnli": ("sentence1", "sentence2"),
    }
    s1, s2 = task_to_keys[TASK_NAME.replace('-', '')]
    return s1, s2

def get_metric_name():
    return GLUE_METRICS[TASK_NAME.replace('-', '')]

def get_metric_for_task():
    return load('glue', TASK_NAME.replace('-', ''))

def get_output_mode():
    return glue_output_modes[TASK_NAME]

def get_num_labels():
    return glue_tasks_num_labels[TASK_NAME]

def configure_model():
    num_labels = get_num_labels()



    '''
    qconfig = BitsAndBytesConfig(load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
    )
    '''
    qbert_config = BertConfig.from_pretrained(
        MODEL_NAME,
        finetuning_task=TASK_NAME,
        num_labels=num_labels,
        device_map="auto",
        quantization_config=None,
        torch_dtype=torch.float16,
    )
    model = BertAdapterModel.from_pretrained(MODEL_NAME, config=qbert_config)

    r = 64
    alpha = float(int(r / math.sqrt(r)))
    print(alpha)

    config = ConfigUnion(
        LoRAConfig(selfattn_lora=True, dropout=0.01, attn_matrices=[ "q","v", "K"],
                   alpha=alpha, r=r, use_gating=True),
        PrefixTuningConfig(prefix_length=20,use_gating=True),

        #ParallelConfig(mh_adapter=False,
         #              output_adapter=False,
         #              use_gating=True,
         #              reduction_factor=3,
        #               scaling='learned',
        #               non_linearity="relu"),

        #PromptTuningConfig(
        #    prompt_length=64,
            #prompt_init="from_string",
            #prompt_init_text="Review sentiment (0=negative, 1=positive):",
        #    combine="prefix",
        #    scaling = 'learned',
        #    use_gating=True,
        #),

        #PfeifferInvConfig(use_gating=True, reduction_factor=16, output_adapter=False),
        #HoulsbyConfig(reduction_factor=4,
        #              use_gating=True,
        #              output_adapter=False,
        #              scaling=2.0),
        CompacterPlusPlusConfig(mh_adapter=True, reduction_factor=8, output_adapter=True, scaling='learned', use_gating=False),
        #PfeifferConfig(reduction_factor=16, use_gating=True)
    )

    model.add_adapter("prexia2", config=config)
    model.add_classification_head(
        f"prexia2",
        num_labels=num_labels,

    )
    model.train_adapter('prexia2')
    model.set_active_adapters("prexia2")
    '''
    config = ConfigUnion(
        LoRAConfig(selfattn_lora=True, intermediate_lora=True, output_lora=True,
                   attn_matrices=["q", "v", "k"], alpha=64, r=16, use_gating=False),
        #PrefixTuningConfig(prefix_length=10, bottleneck_size=800, use_gating=True),
        #ParallelConfig(reduction_factor=2, use_gating=True),
    )


    model.add_adapter("prexia2", config=config)

    model.add_classification_head(
        f"prexia2",
        num_labels=num_labels,

    )

    model.set_active_adapters("prexia2")

    model.train_adapter('prexia2')

    #model.adapter_to("prexia2", device="cuda")

    for param in model.parameters():
        if param.ndim == 1:
            param.data = param.data.to(torch.float32)
    '''

    return model

def get_dataset():
    ds = load_dataset("glue", TASK_NAME.replace('-', ''))
    if K == "all":
        k = len(ds["train"])
    else:
        k = K
        dev_k = ds["train"].train_test_split(test_size=k, seed=42)
        ds["train"] = dev_k['test']
        assert k == len(ds["train"]), "Training set must be of size K!"
    return ds

def encode_dataset(dataset):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, truncation=True, max_length=512)
    sentence1_key, sentence2_key = get_sentence_keys()

    def preprocess_function(examples):
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True, max_length=512)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, max_length=512)

    encoded = dataset.map(preprocess_function, batched=True)
    cols_to_remove = [sentence1_key]
    if sentence2_key is not None:
        cols_to_remove.append(sentence2_key)
    if "idx" in encoded.column_names:
        cols_to_remove.append("idx")
    encoded = encoded.remove_columns(cols_to_remove)
    if "label" in encoded.column_names:
      encoded = encoded.rename_column("label", "labels")
    encoded.set_format("torch")
    return encoded, tokenizer

def configure_trainer(model, tokenizer, encoded_dataset):
    output_mode = get_output_mode()
    num_labels = get_num_labels()
    metric = get_metric_for_task()
    metric_name = get_metric_name()

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1) if output_mode == "classification" else np.squeeze(p.predictions)
        return metric.compute(predictions=preds, references=p.label_ids)


    # replace AdamW with Adafactor
    '''
    optimizer = Adafactor(
        model.parameters(),
        lr=None,
        eps=(1e-30, 1e-3),
        clip_threshold=1.0,
        decay_rate=-0.8,
        beta1=None,
        weight_decay=1e-5,
        relative_step=True,
        scale_parameter=False,
        warmup_init=True,
    )
    '''
    #optimizer =  FAdam(model.parameters(), lr=1e-3)

    #lr_scheduler = AdafactorSchedule(optimizer)
    training_args = TrainingArguments(
        learning_rate=1e-3,
        warmup_steps=100,
        weight_decay=1e-3,
        adam_epsilon=1e-5,
        max_grad_norm=5.0,
        num_train_epochs=50,
        seed=42,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        logging_strategy="epoch",
        optim="adagrad",
        output_dir=OUTPUT_DIR,
        logging_dir=os.path.join(OUTPUT_DIR, "logs"),
        overwrite_output_dir=True,
        gradient_checkpointing=True,
        group_by_length=True,
        remove_unused_columns=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=5,

        #bf16=True,
        fp16=True,
        lr_scheduler_type="linear",
        dataloader_drop_last=True,
        gradient_accumulation_steps=1,
        label_smoothing_factor=0.01 if output_mode == "classification" else 0,
        load_best_model_at_end=True,
        metric_for_best_model=metric_name,
        greater_is_better=True if metric_name != "matthews_correlation" else False,
    )

    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        #optimizers=(optimizer, lr_scheduler),
    )

    return trainer

model = configure_model()
dataset = get_dataset()
encoded_dataset, tokenizer = encode_dataset(dataset)
#model = torch.compile(model)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "sst-2",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float16",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf

8.0


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 512,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/vocab.txt


Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [52]:
trainer = configure_trainer(model, tokenizer, encoded_dataset)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertAdapterModel.forward` and have been ignored: idx. If idx are not expected by `BertAdapterModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1000
  Num Epochs = 50
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 15919754
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method t

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6883,0.697388,0.509615
2,0.6569,0.650673,0.536058
3,0.6209,0.562414,0.71875
4,0.5363,0.511379,0.745192
5,0.4839,0.42451,0.8125
6,0.4306,0.449363,0.792067
7,0.4412,0.386442,0.817308
8,0.4565,0.3805,0.825721
9,0.4226,0.376851,0.834135
10,0.4105,0.390723,0.826923


The following columns in the evaluation set don't have a corresponding argument in `BertAdapterModel.forward` and have been ignored: idx. If idx are not expected by `BertAdapterModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 64
Saving model checkpoint to /content/output/checkpoint-15
Configuration saved in /content/output/checkpoint-15/prexia2/adapter_config.json
Module weights saved in /content/output/checkpoint-15/prexia2/pytorch_adapter.bin
Configuration saved in /content/output/checkpoint-15/prexia2/head_config.json
Module weights saved in /content/output/checkpoint-15/prexia2/pytorch_model_head.bin
Configuration saved in /content/output/checkpoint-15/prexia2/head_config.json
Module weights saved in /content/output/checkpoint-15/prexia2/pytorch_model_head.bin
tokenizer config file saved in /content/output/checkpoint-15/tokenizer_config.json
Special tokens file saved in /content/output/checkpoint-15/special_toke

TrainOutput(global_step=750, training_loss=0.4017338180541992, metrics={'train_runtime': 124.9812, 'train_samples_per_second': 400.06, 'train_steps_per_second': 6.001, 'total_flos': 736899449257728.0, 'train_loss': 0.4017338180541992, 'epoch': 50.0})

In [None]:
from torch import nn



def init_bert_weights(module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # std defaults to 0.02, this might need to be changed
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

class AdapterGate(nn.Module):
    def __init__(self, in_size):
        super().__init__()
        self.linear1 = nn.Linear(in_size, in_size)
        self.linear2 = nn.Linear(in_size, in_size)

    def forward(self, X):
        return torch.sigmoid(self.linear1(X) * self.linear2(X))

gate = AdapterGate(12)
x = torch.randn(12)
gate.apply(init_bert_weights)
print(gate(x))



tensor([0.4998, 0.5002, 0.5000, 0.5000, 0.5000, 0.5000, 0.5001, 0.5000, 0.5000,
        0.4998, 0.5000, 0.4995], grad_fn=<SigmoidBackward0>)


In [None]:
for i in range(12):
  model.bert.encoder.layer[i].attention.self.query = LinearWithDoRA(model.bert.encoder.layer[i].attention.self.query, rank=8, alpha=16)
  model.bert.encoder.layer[i].attention.self.key = LinearWithDoRA(model.bert.encoder.layer[i].attention.self.key, rank=8, alpha=16)
  model.bert.encoder.layer[i].attention.self.value = LinearWithDoRA(model.bert.encoder.layer[i].attention.self.value, rank=8, alpha=16)
  # model.bert.encoder.layer[i].intermediate.dense = LinearWithDoRA(model.bert.encoder.layer[i].intermediate.dense, rank=8, alpha=16)
  # model.bert.encoder.layer[i].output.dense = LinearWithDoRA(model.bert.encoder.layer[i].output.dense, rank=8, alpha=16)
model

def freeze_linear_layers(model):
    for child in model.children():
        if isinstance(child, nn.Linear):
            for param in child.parameters():
                param.requires_grad = False
        else:
            # Recursively freeze linear layers in children modules
            freeze_linear_layers(child)
freeze_linear_layers(model)

# Check if linear layers are frozen
for name, param in model.named_parameters():
    print(f"{name}: {param.requires_grad}")

def prefix_attention_mask(attention_mask):
    # Get the batch size and sequence length
    batch_size, seq_len = attention_mask.shape

    # Create a new attention mask with the correct dimensions
    prefix_attention_mask = torch.zeros(batch_size, seq_len, seq_len)

    # Set the first 8 tokens of each sequence to be visible
    prefix_attention_mask[:, :, :8] = 1

    return prefix_attention_mask

In [None]:
TASK_NAME = "sst-2"
# number of samples to use from the  set
K = 1000

GLUE_METRICS = {
    'cola': 'matthews_correlation',
    'sst2': 'accuracy',
    'mrpc': 'f1',
    'qqp': 'accuracy',
    'stsb': 'pearsonr',
    'mnli': 'accuracy',
    'qnli': 'accuracy',
    'rte': 'accuracy',
    'wnli': 'accuracy',
}
# base model name. Some use bass some large
MODEL_NAME = "bert-base-cased"



import torch
from transformers import BitsAndBytesConfig
import dataclasses
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np


from adapters import AdapterTrainer
from transformers import BertConfig, BertTokenizer,AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, GlueDataset
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (
    HfArgumentParser,
    TrainingArguments,
    glue_output_modes,
    glue_compute_metrics,
    glue_tasks_num_labels,
    set_seed,
)
from adapters import ConfigUnion, IA3Config, PrefixTuningConfig

from adapters import (
    AdapterFusionConfig,
    PromptTuningConfig,
    IA3Config,
    CompacterConfig,
    AutoAdapterModel,
    LoRAConfig,
    DoRAConfig,
    ParBnConfig,
    PrefixTuningConfig,
    DoubleSeqBnInvConfig,
)
import evaluate
from evaluate import load
from datasets import load_dataset



def get_sentence_keys():
  task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
  }

  s1, s2 = task_to_keys[datasets_task_name()]
  return s1, s2

def datasets_task_name():
  return TASK_NAME.replace('-','')

def get_metric_name():
  return GLUE_METRICS[datasets_task_name()]


def get_metric_for_task():
  return load('glue', get_metric_name())

def get_output_dir():
  return os.path.join(
        "/content/output/",
        f"prexia-{TASK_NAME}")

def get_output_mode():
  # Is this task classification or regression
  output_mode = glue_output_modes[TASK_NAME]
  return output_mode

def get_num_labels():
  return glue_tasks_num_labels[TASK_NAME]

OUTPUT_DIR = get_output_dir()




!mkdir /content/output



print("GLUE Task:",TASK_NAME,
      get_output_mode(),
      get_num_labels())


def get_metric_name():
    # Dynamic retrieval of metric name based on the dataset task name
    return GLUE_METRICS[datasets_task_name()]

def get_metric_for_task():
    # Load the appropriate metric from the evaluate library
    metric_name = get_metric_name()
    return load(metric_name)

mkdir: cannot create directory ‘/content/output’: File exists
GLUE Task: sst-2 classification 2


In [None]:
model.to("cuda")
model.adapter_to("prexia2", device="cuda")

In [None]:
model

BertAdapterModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttentionWithAdapters(
              (query): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict(
                  (prexia2): DoRA(
                    (lora_dropout): Dropout(p=0.05, inplace=False)
                    (gate): Linear(in_features=768, out_features=1, bias=True)
                  )
                )
                (last): DoRA(
                  (lora_dropout): Dropout(p=0.05, inplace=False)
             

In [None]:
trainer.train()



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [None]:
def configure_model():
  # Find out home many labels we must use for this task
  num_labels = get_num_labels()

  bert_config = BertConfig.from_pretrained(
    MODEL_NAME,
    finetuning_task=TASK_NAME,
    num_labels=num_labels,
    hidden_dropout_prob=0.05,  # Doropout probability for the hidden layers
    attention_probs_dropout_prob=0.05  # Dropout probability for the attention probabilities
  )
  tokenizer = BertTokenizer.from_pretrained(
    MODEL_NAME,
    truncation=True,
    padding='max_length',
    max_length=256)




def adapter_v2_state_from_state_dict(state_dict: dict) -> dict:
    """Returns the model state dict with only the adapter weights for saving."""
    return {name: param for name, param in state_dict.items()
            if any(s in name for s in get_adapter_substrings())}


def adapter_v2_new_forward(self, input: Tensor) -> Tensor:
    return self.adapter_scale * (
        F.linear(input, self.weight, self.bias) + self.adapter_bias
    )


def adapter_v2_linear_with_bias_and_scale(layer):
    layer.adapter_bias = torch.nn.Parameter(torch.zeros(layer.weight.shape[0]), requires_grad=True)
    layer.adapter_scale = torch.nn.Parameter(torch.ones(layer.weight.shape[0]), requires_grad=True)
    bound_method = adapter_v2_new_forward.__get__(layer, layer.__class__)
    setattr(layer, 'forward', bound_method)
    return layer


def add_adapter_v2_parameters_to_linear_layers(model):
    for module in model.modules():
        if isinstance(module, nn.Linear):
            adapter_v2_linear_with_bias_and_scale(module)
  '''
  model = BertModelWithHeads.from_pretrained(
    MODEL_NAME,
    config=bert_config,
  )

  config = ConfigUnion(
      IA3Config(use_gating=True),
      PrefixTuningConfig(prefix_length=30, bottleneck_size=800, use_gating=True),
      ParallelConfig(use_gating=True)
  )

  model.add_adapter("prexia2", config=config)

  model.add_classification_head(
      f"prexia2",
      num_labels=num_labels,

  )

  model.set_active_adapters( "prexia")

  model.train_adapter('prexia')
  return model, tokenizer
  '''
  config = ConfigUnion(
    LoRAConfig(selfattn_lora=True, intermediate_lora=True, output_lora=True, attn_matrices=["q", "k", "v"], alpha=16, r=64, dropout=0.1, use_gating=True),
    IA3Config(use_gating=True),
    PrefixConfig(prefix_length=30, dynamic=True, use_gating=True),
    BnConfig(levels=3, reduction_factor=16, use_gating=True),
  )
  model = AutoAdapterModel.from_pretrained("bert-base-cased")

  # Add adapters to the model
  model.add_adapter("prexia2", config=config)

  model.add_classification_head(
      f"prexia2",
      num_labels=num_labels,

  )

  model.set_active_adapters( "prexia2")

  model.train_adapter('prexia2')
  return model, tokenizer



# Activate and train adapters
#3model.train_adapter(["adaptive_prefix", "hierarchical_bottleneck", "qlora_adapter", "adapter_fusion", "prompt_tuning", "ia3_adapter", "compacter_adapter"])

#return model, tokenizer