In [1]:
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer, AutoConfig, AdapterConfig

In [12]:
from transformers.adapters.composition import Fuse


In [79]:
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [84]:
sum(p.numel() for p in model.base_model.parameters())

108891648

In [85]:
sum(p.numel() for p in model.cls.parameters())

24063546

In [83]:
model

BertForMaskedLM(
  (bert): BertModel(
    (invertible_adapters): ModuleDict()
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [71]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [72]:
model.config

BertConfig {
  "_name_or_path": "../models/fusion_test/fusion_test/",
  "adapters": {
    "adapters": {
      "atLocation": "eff32a71a397d9d8",
      "isA": "eff32a71a397d9d8",
      "usedFor": "eff32a71a397d9d8"
    },
    "config_map": {
      "eff32a71a397d9d8": {
        "adapter_residual_before_ln": false,
        "cross_adapter": false,
        "inv_adapter": null,
        "inv_adapter_reduction_factor": null,
        "leave_out": [],
        "ln_after": false,
        "ln_before": false,
        "mh_adapter": true,
        "non_linearity": "swish",
        "original_ln_after": true,
        "original_ln_before": false,
        "output_adapter": true,
        "reduction_factor": 12,
        "residual_before_ln": true
      }
    },
    "fusion_config_map": {},
    "fusions": {
      "isA,usedFor,atLocation": "dynamic"
    }
  },
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
 

In [73]:
adapter_setup = [
    [
        "atLocation",
        "usedFor",
        "isA"
    ]
]

In [77]:
model.set_active_adapters(Fuse(*model.config.adapters.adapters.keys()))

In [78]:
model.active_adapters

Fuse[atLocation, isA, usedFor]

In [75]:
list(model.config.adapters.adapters.keys())

['atLocation', 'isA', 'usedFor']

In [63]:
model.config

BertConfig {
  "_name_or_path": "../models/fusion_test/fusion_test/",
  "adapters": {
    "adapters": {
      "atLocation": "eff32a71a397d9d8",
      "isA": "eff32a71a397d9d8",
      "usedFor": "eff32a71a397d9d8"
    },
    "config_map": {
      "eff32a71a397d9d8": {
        "adapter_residual_before_ln": false,
        "cross_adapter": false,
        "inv_adapter": null,
        "inv_adapter_reduction_factor": null,
        "leave_out": [],
        "ln_after": false,
        "ln_before": false,
        "mh_adapter": true,
        "non_linearity": "swish",
        "original_ln_after": true,
        "original_ln_before": false,
        "output_adapter": true,
        "reduction_factor": 12,
        "residual_before_ln": true
      }
    },
    "fusion_config_map": {},
    "fusions": {
      "isA,usedFor,atLocation": "dynamic"
    }
  },
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
 

In [6]:
pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer, top_k=5)

In [11]:
"AdsdSASas".lower()

'adsdsasas'

In [9]:
pipe("It was a <mask> day")

[{'sequence': 'It was a beautiful day',
  'score': 0.2355542778968811,
  'token': 2721,
  'token_str': ' beautiful'},
 {'sequence': 'It was a good day',
  'score': 0.10204992443323135,
  'token': 205,
  'token_str': ' good'},
 {'sequence': 'It was a great day',
  'score': 0.05954313278198242,
  'token': 372,
  'token_str': ' great'},
 {'sequence': 'It was a nice day',
  'score': 0.051886625587940216,
  'token': 2579,
  'token_str': ' nice'},
 {'sequence': 'It was a long day',
  'score': 0.03153948113322258,
  'token': 251,
  'token_str': ' long'}]

In [15]:
adapter_config = AdapterConfig.load("houlsby", non_linearity="gelu", reduction_factor=12)
model.add_adapter("test_sondre", config=adapter_config)

In [9]:
model.active_adapters = None

In [20]:
111893562-109514298

2379264

In [21]:
2379264/111893562

0.021263636240304872

In [16]:
sum(p.numel() for p in model.parameters())

111893562

In [86]:
sum(p.numel() for p in model.base_model.parameters())

108891648

In [6]:
model.config.adapters.adapters

{'test_sondre': '5792d51e3161642d'}

In [None]:
model.train_adapter(["test_sondre"])

In [None]:
model.set_active_adapters(["test_sondre"])

In [None]:
model.freeze_model(False)

In [None]:
model

In [None]:
sum(p.numel() for p in model.parameters())

In [64]:
model.base_model.encoder.layer[1].output.adapter_fusion_layer[("isA,usedFor,atLocation")]

BertFusion(
  (dropout): Dropout(p=0.1, inplace=False)
  (query): Linear(in_features=768, out_features=768, bias=True)
  (key): Linear(in_features=768, out_features=768, bias=True)
  (value): Linear(in_features=768, out_features=768, bias=False)
)

In [31]:
for param in model.base_model.encoder.layer[2].output.adapters.parameters():
    i = 0
    while i < 10:
        i += 1
        print(param.requires_grad)
    

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
