In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "9" #6,7
from transformers import (
    PreTrainedModel,
    RobertaPreTrainedModel, 
    RobertaModel, 
    PretrainedConfig, 
    RobertaConfig, 
    RobertaForSequenceClassification,
    AutoModelForSequenceClassification,AutoModelForCausalLM,AutoModelForMaskedLM, 
    AutoConfig, 
    AutoModel,
    AutoTokenizer,
    RobertaPreTrainedModel,
    BertForPreTraining,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)

from transformers.trainer_utils import (
    PREFIX_CHECKPOINT_DIR,
    BestRun,
    EvalLoopOutput,
    EvalPrediction,
    FSDPOption,
    HPSearchBackend,
    HubStrategy,
    IntervalStrategy,
    PredictionOutput,
    RemoveColumnsCollator,
    ShardedDDPOption,
    TrainerMemoryTracker,
    TrainOutput,
    default_compute_objective,
    default_hp_space,
    denumpify_detensorize,
    enable_full_determinism,
    find_executable_batch_size,
    get_last_checkpoint,
    has_length,
    number_of_arguments,
    seed_worker,
    set_seed,
    speed_metrics,
)

from transformers.utils import logging

from transformers.configuration_utils import PretrainedConfig
from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled
from transformers.dependency_versions_check import dep_version_check
from transformers.modelcard import TrainingSummary
from transformers.modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES
from transformers.optimization import Adafactor, get_scheduler
# from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_10, is_torch_less_than_1_11
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.trainer_callback import (
    CallbackHandler,
    DefaultFlowCallback,
    PrinterCallback,
    ProgressCallback,
    TrainerCallback,
    TrainerControl,
    TrainerState,
)
from transformers.trainer_pt_utils import (
    DistributedLengthGroupedSampler,
    DistributedSamplerWithLoop,
    DistributedTensorGatherer,
    IterableDatasetShard,
    LabelSmoother,
    LengthGroupedSampler,
    SequentialDistributedSampler,
    ShardSampler,
    distributed_broadcast_scalars,
    distributed_concat,
    find_batch_size,
    get_module_class_from_name,
    get_parameter_names,
    nested_concat,
    nested_detach,
    nested_numpify,
    nested_truncate,
    nested_xla_mesh_reduce,
    reissue_pt_warnings,
)
import time

from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler


# from transformers.utils.generic import can_return_loss
# custom
from mlm_contrastive_transformer import TransformerForPreTraining

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from datasets import load_dataset, load_metric # list_datasets, load_from_disk, DatasetDict, Dataset, load_dataset_builder
import evaluate # this weirdly loads something onto the GPU and will cause OOM on python3.9
import pandas as pd
from tqdm import tqdm
import numpy as np
import copy
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
from sentence_transformers.losses import CosineSimilarityLoss


from utils.custom_hf_trainer import CustomHFTrainer


from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PeftConfig,
    PeftModel,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
    prepare_model_for_int8_training,
    # AutoPeftModel,
    prepare_model_for_kbit_training # only for latest dev version of peft
)

In [17]:
from transformers.trainer_utils import EvalPrediction

In [6]:
result = EvalPrediction([1],[1])

In [11]:
result.predictions

[1]

### first lets look at getting dataset in a suitable format

we simply need to create a dataset that can be used for both MLM and contrastive loss calculation. This actually may work well with HF new setfit library: https://github.com/huggingface/setfit, although we want to extend this to include MLM


In [2]:
# data_dir = "/mnt/sdg/niallt/mimic_iii/processed/HADM_ID_split/pseudo_classification/class_reduced_8/fewshot_16/"
data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/"



In [3]:
# load as a dataset
dataset = load_dataset("csv", 
                        data_files = {"train":f"{data_dir}/lm_pretraining_train_250000.csv",
                                        "valid":f"{data_dir}/lm_pretraining_test_1000.csv"},
                        cache_dir = "/mnt/sdc/niallt/.cache/")

Found cached dataset csv (/mnt/sdc/niallt/.cache/csv/default-e82d422e95c8b033/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
dataset['valid']

Dataset({
    features: ['TEXT', 'CATEGORY', 'label'],
    num_rows: 1000
})

In [7]:
dataset["train"][0]

{'TEXT': '9:27 pm ct low ext w/o c left; ct low ext w/o c right clip # ct reconstruction reason: please evaluate bilateral lower leg, ankles, feet for trauma admitting diagnosis: s/p fall medical condition: 37 year old man s/p fall from 70 feet reason for this examination: please evaluate bilateral lower leg, ankles, feet for trauma no contraindications for iv contrast final report indication: status post fall from 70 feet with bilateral foot fractures. technique: helically acquired contiguous axial images were obtained from the distal tibia and fibula through the proximal forefoot in both the right and left feet. coronal and sagittal reformatted images were performed for both feet. ct right foot with sagittal and coronal reformats: there is a markedly comminuted fracture involving the entire body of the calcaneus. in addition, the ankle mortise is completely disrupted with fracture through the neck of the talus and inferior medial displacement of the talar body. the talar body is also

In [6]:
dataset['train']

Dataset({
    features: ['TEXT', 'CATEGORY', 'label'],
    num_rows: 249994
})

In [4]:
dataset = dataset.rename_column('label', 'category_label')

In [9]:
dataset["train"][0]

{'TEXT': '9:27 pm ct low ext w/o c left; ct low ext w/o c right clip # ct reconstruction reason: please evaluate bilateral lower leg, ankles, feet for trauma admitting diagnosis: s/p fall medical condition: 37 year old man s/p fall from 70 feet reason for this examination: please evaluate bilateral lower leg, ankles, feet for trauma no contraindications for iv contrast final report indication: status post fall from 70 feet with bilateral foot fractures. technique: helically acquired contiguous axial images were obtained from the distal tibia and fibula through the proximal forefoot in both the right and left feet. coronal and sagittal reformatted images were performed for both feet. ct right foot with sagittal and coronal reformats: there is a markedly comminuted fracture involving the entire body of the calcaneus. in addition, the ankle mortise is completely disrupted with fracture through the neck of the talus and inferior medial displacement of the talar body. the talar body is also

In [19]:
# first we can try just modelling the task similar to that of the next sentence prediction task

tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# model = BertForPreTraining.from_pretrained('bert-base-uncased')
model = TransformerForPreTraining.from_pretrained('roberta-base', compute_contrastive = True)
normal_model = RobertaModel.from_pretrained('roberta-base')
# model = AutoModelForMaskedLM.from_pretrained('roberta-base')

sentence_A = "The sun is a huge ball of gases. It has a diameter of 1,392,000 km."



Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/niallt/.cache/hugg

In [29]:
# function for counting trainable params
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# function to unfreeze all layers of a model

def unfreeze_model(model):
    for param in model.parameters():
        param.requires_grad = True

In [4]:
count_parameters(normal_model)

124645632

In [20]:
count_parameters(model)

124706661

In [21]:
count_parameters(model.seq_classifier)

9228

In [6]:
model.config

MeanRobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "compute_contrastive": true,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [7]:
model.compute_contrastive

True

### sanity check the re-loading of pre-trained model with autoclasses

In [5]:
model_name_or_path = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/roberta-base-mimic-note-custom_pretraining_max_epoch_2_weighted/sampled_250000/07-07-2023--08-30/checkpoint-30000/"
model = TransformerForPreTraining.from_pretrained(model_name_or_path)
normal_model = AutoModel.from_pretrained(model_name_or_path)
auto_seq_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)


loading configuration file /mnt/sdc/niallt/saved_models/language_modelling/mimic/roberta-base-mimic-note-custom_pretraining_max_epoch_2_weighted/sampled_250000/07-07-2023--08-30/checkpoint-30000/config.json
Model config MeanRobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "TransformerForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "compute_contrastive": false,
  "contrastive_loss_weight": 0.1,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_pretraining_labels": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_

In [3]:
model

TransformerForPreTraining(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [4]:
normal_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [10]:
model.roberta.encoder.layer[0].attention.self.value.weight

Parameter containing:
tensor([[-0.0105,  0.0309, -0.0421,  ..., -0.0065, -0.0530,  0.0125],
        [ 0.0153,  0.0162, -0.0054,  ...,  0.0245,  0.0304, -0.0195],
        [ 0.0048,  0.0226,  0.0373,  ..., -0.0535, -0.0157, -0.0376],
        ...,
        [ 0.0049, -0.0030, -0.0031,  ...,  0.0020, -0.0003,  0.0297],
        [-0.0070, -0.0124,  0.0054,  ..., -0.0137, -0.0105,  0.0135],
        [ 0.0169,  0.0085,  0.0334,  ...,  0.0027,  0.0008,  0.0009]],
       requires_grad=True)

In [12]:
normal_model.encoder.layer[0].attention.self.value.weight

Parameter containing:
tensor([[-0.0105,  0.0309, -0.0421,  ..., -0.0065, -0.0530,  0.0125],
        [ 0.0153,  0.0162, -0.0054,  ...,  0.0245,  0.0304, -0.0195],
        [ 0.0048,  0.0226,  0.0373,  ..., -0.0535, -0.0157, -0.0376],
        ...,
        [ 0.0049, -0.0030, -0.0031,  ...,  0.0020, -0.0003,  0.0297],
        [-0.0070, -0.0124,  0.0054,  ..., -0.0137, -0.0105,  0.0135],
        [ 0.0169,  0.0085,  0.0334,  ...,  0.0027,  0.0008,  0.0009]],
       requires_grad=True)

In [9]:
auto_seq_model.roberta.encoder.layer[0].attention.self.value.weight

Parameter containing:
tensor([[-0.0105,  0.0309, -0.0421,  ..., -0.0065, -0.0530,  0.0125],
        [ 0.0153,  0.0162, -0.0054,  ...,  0.0245,  0.0304, -0.0195],
        [ 0.0048,  0.0226,  0.0373,  ..., -0.0535, -0.0157, -0.0376],
        ...,
        [ 0.0049, -0.0030, -0.0031,  ...,  0.0020, -0.0003,  0.0297],
        [-0.0070, -0.0124,  0.0054,  ..., -0.0137, -0.0105,  0.0135],
        [ 0.0169,  0.0085,  0.0334,  ...,  0.0027,  0.0008,  0.0009]],
       requires_grad=True)

# Peft

In [23]:
normal_model.encoder.layer[0].attention.self.key.weight

Parameter containing:
tensor([[ 0.0856,  0.0398, -0.1010,  ..., -0.1160, -0.0837, -0.1936],
        [ 0.0541, -0.2281,  0.1569,  ..., -0.1597,  0.0917, -0.0688],
        [-0.0638,  0.0168, -0.0046,  ..., -0.0151,  0.0552, -0.1130],
        ...,
        [ 0.1296, -0.0342,  0.0183,  ..., -0.0726, -0.0778, -0.0231],
        [ 0.1559,  0.1121, -0.0817,  ..., -0.0249,  0.0630, -0.2340],
        [ 0.0315,  0.0698, -0.0776,  ...,  0.1595,  0.0075,  0.0934]],
       requires_grad=True)

In [6]:
peft_type = PeftType.LORA
lr = 3e-4
peft_config = LoraConfig(task_type=None, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

In [22]:
# loroberta  = get_peft_model(normal_model, peft_config)
peft_roberta = get_peft_model(model, peft_config)

In [22]:
model.base_model.model.roberta.encoder.layer[0].attention.self.key.weight

Parameter containing:
tensor([[ 0.0856,  0.0398, -0.1010,  ..., -0.1160, -0.0837, -0.1936],
        [ 0.0541, -0.2281,  0.1569,  ..., -0.1597,  0.0917, -0.0688],
        [-0.0638,  0.0168, -0.0046,  ..., -0.0151,  0.0552, -0.1130],
        ...,
        [ 0.1296, -0.0342,  0.0183,  ..., -0.0726, -0.0778, -0.0231],
        [ 0.1559,  0.1121, -0.0817,  ..., -0.0249,  0.0630, -0.2340],
        [ 0.0315,  0.0698, -0.0776,  ...,  0.1595,  0.0075,  0.0934]])

In [23]:
peft_roberta.print_trainable_parameters()

trainable params: 294,912 || all params: 125,001,573 || trainable%: 0.23592663109927425


In [30]:
count_parameters(peft_roberta)

294912

In [32]:
peft_roberta.base_model.seq_classifier


MeanSequenceClassifier(
  (classifier): Linear(in_features=768, out_features=12, bias=True)
)

In [33]:
# need to unfreeze the classifier

unfreeze_model(peft_roberta.base_model.seq_classifier)

In [34]:
count_parameters(peft_roberta)

304140

In [35]:
peft_roberta.print_trainable_parameters()

trainable params: 304,140 || all params: 125,001,573 || trainable%: 0.24330893820032168


In [18]:
normal_model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [9]:
tokenized = tokenizer(sentence_A, return_tensors='pt')
tokenized.keys()


dict_keys(['input_ids', 'attention_mask'])

In [8]:
tokenized

{'input_ids': tensor([[    0,   133,  3778,    16,    10,  1307,  1011,     9, 20038,     4,
            85,    34,    10, 26089,     9,   112,     6, 36350,     6,   151,
          6301,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
labels = torch.LongTensor([0])
labels


tensor([0])

In [9]:
model

PeftModel(
  (base_model): LoraModel(
    (model): TransformerForPreTraining(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=76

In [9]:
outputs = model(**tokenized, category_label = labels)

outputs:
 BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0774,  0.1109, -0.0291,  ..., -0.0546, -0.0372, -0.0463],
         [-0.0203, -0.0831,  0.2454,  ..., -0.0613,  0.1672, -0.1087],
         [-0.0317, -0.0290,  0.0410,  ...,  0.0523, -0.1697, -0.0263],
         ...,
         [-0.0052,  0.3558, -0.0670,  ..., -0.1990,  0.0941,  0.0300],
         [-0.0686,  0.1020, -0.0595,  ..., -0.0820, -0.0333, -0.0822],
         [-0.0264,  0.0509,  0.0005,  ...,  0.1742, -0.0074, -0.0330]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=None, hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)
prior to being sent to the classifier the shape of sequence output is: torch.Size([1, 23, 768]), and pooled output is: torch.Size([1, 768])


In [10]:
outputs

TransformerForPreTrainingOutput([('loss', tensor(0., grad_fn=<SumBackward0>)),
                                 ('mlm_loss', None),
                                 ('cls_loss',
                                  tensor(0., grad_fn=<SumBackward0>)),
                                 ('prediction_logits',
                                  tensor([[[34.6068, -3.8174, 18.3210,  ...,  2.9154,  5.9354, 11.6600],
                                           [ 5.3773, -3.3239, 18.0196,  ...,  1.6447,  3.2221,  6.7053],
                                           [-2.3307, -4.8697,  6.7838,  ..., -5.5666, -5.9032, -1.2778],
                                           ...,
                                           [-0.1162, -3.9556, 11.1376,  ..., -1.6905, -2.2702,  1.9070],
                                           [18.5936, -4.3268, 19.8805,  ...,  0.9715,  3.8395,  7.2003],
                                           [14.5368, -4.1192, 31.6962,  ...,  0.1742, -1.7078,  9.5063]]],
                

In [6]:
normal_outputs = normal_model(**tokenized)

In [7]:
normal_outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0774,  0.1109, -0.0291,  ..., -0.0546, -0.0372, -0.0463],
         [-0.0203, -0.0831,  0.2454,  ..., -0.0613,  0.1672, -0.1087],
         [-0.0317, -0.0290,  0.0410,  ...,  0.0523, -0.1697, -0.0263],
         ...,
         [-0.0052,  0.3558, -0.0670,  ..., -0.1990,  0.0941,  0.0300],
         [-0.0686,  0.1020, -0.0595,  ..., -0.0820, -0.0333, -0.0822],
         [-0.0264,  0.0509,  0.0005,  ...,  0.1742, -0.0074, -0.0330]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 1.2005e-02, -2.2570e-01, -2.0211e-01, -8.3373e-02,  1.5201e-01,
          2.0418e-01,  2.6802e-01, -5.5586e-02, -9.9229e-02, -1.8181e-01,
          2.8494e-01,  5.7336e-03, -1.2715e-01,  1.4829e-01, -1.4396e-01,
          4.7950e-01,  1.9732e-01, -5.0816e-01,  3.4845e-02, -2.8383e-02,
         -2.7479e-01,  7.1890e-02,  4.8310e-01,  3.5491e-01,  1.0504e-01,
          4.9484e-02, -1.5675e-01, -2.3135e-02,  1.6638e-01,  2.640

now just need to ensure can batch data properly

In [8]:
def tokenize_function(examples):
    '''
    Function to return a tokenized version of the input text

    args:
        examples: datasets object obtained via load_datasets. 

    returns:
        dictionary of tokenized inputs with appropriate input_ids, attention_mask etc.
    '''
    return tokenizer(examples["TEXT"], truncation=True, padding = True)

def preprocess_function(examples):
    
    examples['labels'] = examples['input_ids'].copy()
    return examples
    

def group_texts(tokenized_examples, block_size = 512):
        '''
        Function to concatenate all texts together then split the result into smaller chunks of a specified block_size

        args:
            examples: tokenized dataset produced by the tokenizer_function
            block_size: int -> the chunk or block_size to divide the full concatenated text into
        '''
        examples = tokenized_examples.copy()
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # can use the following line to cut off tails
        total_length = (total_length // block_size) * block_size
        result = {
            k: [t[i:i+block_size] for i in range(0,total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        # for both causal and masked language modelling the "right shift" of input text is done by the model internally. Thus for now, labels=input_ids
        result['labels'] = result['input_ids'].copy()
        
        return result

In [9]:
# subsample the dataset
dataset["train"] = dataset["train"].select(range(1000))

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'CATEGORY', 'category_label'],
        num_rows: 1000
    })
    valid: Dataset({
        features: ['TEXT', 'CATEGORY', 'category_label'],
        num_rows: 1000
    })
})

In [10]:
sentence1_key = "TEXT"

encoded_dataset = dataset.map(tokenize_function, batched=True, remove_columns = ['TEXT', 'CATEGORY'])

Loading cached processed dataset at /mnt/sdc/niallt/.cache/csv/default-e82d422e95c8b033/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-754901f8e2dae3bb.arrow


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
lm_datasets  = encoded_dataset.map(preprocess_function, batched = True, batch_size = 1000)

Loading cached processed dataset at /mnt/sdc/niallt/.cache/csv/default-e82d422e95c8b033/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-e73aa4be21fbee76.arrow


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['category_label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    valid: Dataset({
        features: ['category_label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [20]:
lm_datasets['train']

Dataset({
    features: ['category_label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [12]:

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [22]:
model

PeftModel(
  (base_model): LoraModel(
    (model): TransformerForPreTraining(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=76

In [24]:
for i in lm_datasets['train']:
    # print(i)
    # print(data_collator([i]))
    batch = data_collator([i])
    print(batch)
    print(batch['input_ids'].shape)
    outputs = model(**batch)
    
    break

{'category_label': tensor([1]), 'input_ids': tensor([[    0,   466,    35,  2518,  4751,   740,    90,   614,  8935,   885,
            73,   139,   740,   314,   131,   740,    90,   614,  8935,   885,
            73,   139, 50264,   235, 50264,   849,   740,    90, 18228,  1219,
            35,  2540, 10516,  9526,   795,  2985,     6, 35713, 50264,  1730,
            13,  8795, 13874,  9726,    35,   579,    73,   642,  1136,  1131,
          1881, 43767,  2908,    76,   793,   313,   579,    73,   642,  1136,
            31,  1510,  1730,  1219,    13,    42,  9027, 50264,  2540, 10516,
          9526,   795,  2985,     6, 35713, 50264, 50264,    13,  8795,   117,
          8541, 50264,   417, 29758,    13, 40436,  5709,   507,   266,  7335,
            35,  2194,   618,  1136,    31,  1510,  1730,    19,  9526, 50264,
         28456,     4,  9205,    35, 50264,  2368,  3566, 41402, 18884,  2617,
          3156,    58,  4756,    31,     5,  7018,   337, 50264, 18739,     8,
       

In [25]:
outputs

TransformerForPreTrainingOutput([('loss',
                                  tensor(1.9225, grad_fn=<AddBackward0>)),
                                 ('mlm_loss',
                                  tensor(1.9225, grad_fn=<NllLossBackward0>)),
                                 ('cls_loss',
                                  tensor(0., grad_fn=<SumBackward0>)),
                                 ('prediction_logits',
                                  tensor([[[33.5637, -4.1026, 15.1689,  ...,  2.7973,  4.6324, 11.7745],
                                           [ 3.3638, -3.9661, 15.9210,  ...,  0.3801, -1.7152,  3.6887],
                                           [-2.6174, -4.3964, 14.8724,  ..., -1.6238, -2.5521,  1.4594],
                                           ...,
                                           [14.1691, -4.4073, 28.0812,  ..., -0.5447, -3.0653,  9.6223],
                                           [14.1691, -4.4073, 28.0812,  ..., -0.5447, -3.0653,  9.6223],
             

In [11]:
outputs.seq_embedding.shape

torch.Size([1, 768])

In [24]:
# look at freezing then unfreezing certain layers

model

PeftModel(
  (base_model): LoraModel(
    (model): TransformerForPreTraining(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=76

In [11]:
for name, param in model.base_model.named_modules():
    
    print(f"Name is: {name}")
    # for name, layer in model.base_model.encoder.layer[:13].named_modules():
    #     print(f"layer is: {name}")

Name is: 
Name is: embeddings
Name is: embeddings.word_embeddings
Name is: embeddings.position_embeddings
Name is: embeddings.token_type_embeddings
Name is: embeddings.LayerNorm
Name is: embeddings.dropout
Name is: encoder
Name is: encoder.layer
Name is: encoder.layer.0
Name is: encoder.layer.0.attention
Name is: encoder.layer.0.attention.self
Name is: encoder.layer.0.attention.self.query
Name is: encoder.layer.0.attention.self.key
Name is: encoder.layer.0.attention.self.value
Name is: encoder.layer.0.attention.self.dropout
Name is: encoder.layer.0.attention.output
Name is: encoder.layer.0.attention.output.dense
Name is: encoder.layer.0.attention.output.LayerNorm
Name is: encoder.layer.0.attention.output.dropout
Name is: encoder.layer.0.intermediate
Name is: encoder.layer.0.intermediate.dense
Name is: encoder.layer.0.intermediate.intermediate_act_fn
Name is: encoder.layer.0.output
Name is: encoder.layer.0.output.dense
Name is: encoder.layer.0.output.LayerNorm
Name is: encoder.layer.0.o

In [4]:
model.config

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

### test training 

In [13]:
# set up training arguments
training_args = TrainingArguments(
    output_dir=f"/mnt/sdd/niallt/saved_models/code_testing/note_mlm/",
    max_steps= 10,
    num_train_epochs=5,
    
    # per_device_train_batch_size=args.train_batch_size, # seems auto handeled by HF trainer now
    # per_device_eval_batch_size = args.eval_batch_size,

    learning_rate = 2e-5,
    weight_decay = 0.01,
    
    evaluation_strategy = "steps",
    eval_steps = 10,
    save_strategy="no",        
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end = False,
    
    warmup_steps=5,
    gradient_accumulation_steps=1,
    logging_steps=5,
    logging_first_step=True,
    logging_strategy = 'steps',
    logging_dir = f"/mnt/sdd/niallt/saved_models/code_testing/note_mlm/",
    remove_unused_columns=True
)

# set up the trainer
trainer = CustomHFTrainer(
    model=model,
    tokenizer = tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset = lm_datasets['valid'],
   
    # compute_metrics = compute_metrics,
    # preprocess_logits_for_metrics = preprocess_logits_for_metrics
)
# run trainer
trainer.train()

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set don't have a corresponding argument in `PeftModel.forward` and have been ignored: attention_mask, input_ids, labels, category_label. If attention_mask, input_ids, labels, category_label are not expected by `PeftModel.forward`,  you can safely ignore this message.


IndexError: Invalid key: 858 is out of bounds for size 0

AttributeError: 'TransformerForPreTraining' object has no attribute 'return_dict'

#### Test contrastive loss functions

In [50]:
from pytorch_metric_learning import losses
# loss_func = losses.TripletMarginLoss()
# loss_func = losses.SupConLoss(temperature=0.1)
loss_func = losses.NTXentLoss(temperature=0.1)

In [None]:
train_loader = torch.utils.data.DataLoader(
    lm_datasets['train'],  collate_fn = data_collator,batch_size=8, shuffle=False
)

In [54]:
for batch in train_loader:
    model.eval()
    outputs = model(**batch)
    seq_embeddings = outputs.seq_embedding
    print(f"labels are: {batch['category_label']}")
    loss = loss_func(seq_embeddings, batch['category_label'])
    break
    

labels are: tensor([1, 0, 3, 0, 0, 0, 0, 8])


In [52]:
loss

tensor(1.2314, grad_fn=<MeanBackward0>)

In [55]:
labels = [1,2,2,2,1,3,4,5,6,7,7,8,8]

In [57]:
num_classes = np.unique(labels)
label_to_idx = {x: i for i, x in enumerate(num_classes)}
positive_idxs = [np.where(labels == i)[0] for i in num_classes]
negative_idxs = [np.where(labels != i)[0] for i in num_classes]

In [58]:
label_to_idx

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}

In [59]:
positive_idxs

[array([0, 4]),
 array([1, 2, 3]),
 array([5]),
 array([6]),
 array([7]),
 array([8]),
 array([ 9, 10]),
 array([11, 12])]

In [60]:
negative_idxs

[array([ 1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12]),
 array([ 0,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
 array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12]),
 array([ 0,  1,  2,  3,  4,  5,  7,  8,  9, 10, 11, 12]),
 array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 11, 12]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])]

In [30]:
# encoder_model = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/mimic-roberta-base/sampled_250000/22-12-2022--12-45/checkpoint-100000"
encoder_model = "/mnt/sdc/niallt/saved_models/declutr/mimic/few_epoch/mimic-roberta-base/2_anch_2_pos_min_1024/transformer_format/"

In [31]:

 
if "saved_models" in encoder_model:
    if "declutr" in encoder_model:
        if "few_epoch" in encoder_model:
            if "span_comparison" in encoder_model:
                model_name = encoder_model.split("/")[9] + "/declutr/" + encoder_model.split("/")[-3]
            else:
                model_name = encoder_model.split("/")[8] + "/declutr/" + encoder_model.split("/")[-3]

        else:
            model_name = encoder_model.split("/")[7] + "/declutr/" + encoder_model.split("/")[-3]
    elif "contrastive" in encoder_model or "custom_pretraining" in encoder_model:
        print("contrastive or custom_pretraining")
        model_name = encoder_model.split("/")[7]
    else:
        print("mlm only")
        model_name = encoder_model.split("/")[7] + "/mlm_only/"
else:    
    model_name = encoder_model.split("/")[-1]

In [29]:
encoder_model.split("/")

['',
 'mnt',
 'sdc',
 'niallt',
 'saved_models',
 'declutr',
 'mimic',
 'few_epoch',
 'mimic-roberta-base',
 '2_anch_2_pos_min_1024',
 'transformer_format']

In [32]:
model_name

'mimic-roberta-base/declutr/2_anch_2_pos_min_1024'

## Test PEFT

In [15]:
# reload trained model

# peft_model_dir = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/roberta-base-mimic-note-custom_pretraining_max_epoch_1_lora/sampled_250000/19-07-2023--16-49/checkpoint-14000/"
peft_model_dir = "/mnt/sdc/niallt/saved_models/language_modelling/mimic/roberta-base-mimic-wecho-LORA/sampled_250000/28-08-2023--12-46/checkpoint-78000"
encoder_model = "roberta-base"
# load config
config = PeftConfig.from_pretrained(peft_model_dir)
# load base model 
original_model = AutoModelForMaskedLM.from_pretrained(encoder_model)
# original_model = TransformerForPreTraining.from_pretrained(config.base_model_name_or_path)
# load peft model
reloaded_peft_model = PeftModel.from_pretrained(original_model, peft_model_dir)



loading configuration file config.json from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file model.safetensors from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e22407

In [17]:
config

PeftConfig(peft_type='LORA', auto_mapping={'base_model_class': 'RobertaForMaskedLM', 'parent_library': 'transformers.models.roberta.modeling_roberta'}, base_model_name_or_path='roberta-base', revision=None, task_type=None, inference_mode=True)

In [19]:
reloaded_peft_model

PeftModel(
  (base_model): LoraModel(
    (model): RobertaForMaskedLM(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_

In [5]:
count_parameters(reloaded_peft_model)

In [4]:
# using pretrained class

reloaded_pretrained_model = PeftModel.from_pretrained(TransformerForPreTraining.from_pretrained("roberta-base"), peft_model_dir)

loading configuration file config.json from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config MeanRobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "compute_contrastive": null,
  "contrastive_loss_weight": 1.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_pretraining_labels": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file model.safetensors from cache at /home/niallt/.cache/huggingface/hub/

In [24]:
# view lora weights of model


reloaded_pretrained_model.base_model.model.roberta.encoder.layer[0].attention.self.query.lora_A.default.weight

Parameter containing:
tensor([[-0.0366, -0.0454,  0.0340,  ...,  0.0209, -0.0275,  0.0043],
        [-0.0124, -0.0309, -0.0112,  ...,  0.0028, -0.0017,  0.0118],
        [-0.0257,  0.0028,  0.0073,  ..., -0.0129, -0.0434,  0.0093],
        ...,
        [ 0.0148,  0.0063,  0.0246,  ...,  0.0279, -0.0199, -0.0034],
        [-0.0222,  0.0434,  0.0112,  ..., -0.0060, -0.0374, -0.0553],
        [-0.0138, -0.0138, -0.0105,  ...,  0.0176,  0.0340,  0.0087]])

In [26]:
# merge weights?

merged_model = reloaded_peft_model.merge_and_unload()

In [27]:
merged_model

TransformerForPreTraining(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

#### save merged model

In [None]:
merged_model.save_pretrained()

In [28]:
reloaded_peft_model.print_trainable_parameters()

trainable params: 0 || all params: 124,706,661 || trainable%: 0.0


In [34]:
# can we also reload this model into the automodel classes directly?


# load base model 
original_auto_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
# load peft model
reloaded_auto_peft_model = PeftModel.from_pretrained(original_auto_model, peft_model_dir)

loading configuration file config.json from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file model.safetensors from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e22407

In [35]:
reloaded_auto_peft_model

PeftModel(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_feat

In [37]:
reloaded_auto_peft_model.base_model.model.roberta.encoder.layer[0].attention.self.query.lora_A.default.weight

Parameter containing:
tensor([[-0.0366, -0.0454,  0.0340,  ...,  0.0209, -0.0275,  0.0043],
        [-0.0124, -0.0309, -0.0112,  ...,  0.0028, -0.0017,  0.0118],
        [-0.0257,  0.0028,  0.0073,  ..., -0.0129, -0.0434,  0.0093],
        ...,
        [ 0.0148,  0.0063,  0.0246,  ...,  0.0279, -0.0199, -0.0034],
        [-0.0222,  0.0434,  0.0112,  ..., -0.0060, -0.0374, -0.0553],
        [-0.0138, -0.0138, -0.0105,  ...,  0.0176,  0.0340,  0.0087]])

In [38]:
merged_auto_model = reloaded_auto_peft_model.merge_and_unload()

In [41]:
merged_auto_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [42]:
count_parameters(merged_auto_model)

0

In [44]:
auto_model = AutoModelForSequenceClassification.from_pretrained("roberta-base")

loading configuration file config.json from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file model.safetensors from cache at /home/niallt/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e22407

In [47]:
auto_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [51]:
count_parameters(auto_model)

124647170

In [14]:
name = "roberta-base-lora"
if "LORA" in name or "lora" in name:
    print(f"dfsad")

dfsad


In [49]:
unfreeze_model(merged_auto_model)

In [50]:
count_parameters(merged_auto_model)

124647170