In [1]:
from transformers import BertForSequenceClassification
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"]="3"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from tqdm import tqdm
import adapters
from adapters import init, AutoAdapterModel, BnConfig

In [3]:
model = torch.load('/data/zmengaf/5212/RCFDA/bert_model_arxiv_acc_0.8027633851468048.pt')
model.classifier = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device).classifier
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [4]:
adapters.init(model)
config = BnConfig(mh_adapter=True, output_adapter=True, reduction_factor=96, non_linearity="relu")
model.add_adapter("arxiv", config=config)
model.train_adapter("arxiv")
model.to(device)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttentionWithAdapters(
              (query): LoRALinear(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict()
              )
              (key): LoRALinear(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict()
              )
              (value): LoRALinear(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict()
              )
    

In [5]:
model.num_labels = 2

In [6]:
print(model.num_labels)

2


In [7]:
def print_trainable_status(model):
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name + " is trainable")
        else:
            print(name + " is frozen")

In [8]:
print_trainable_status(model)

bert.embeddings.word_embeddings.weight is frozen
bert.embeddings.position_embeddings.weight is frozen
bert.embeddings.token_type_embeddings.weight is frozen
bert.embeddings.LayerNorm.weight is frozen
bert.embeddings.LayerNorm.bias is frozen
bert.encoder.layer.0.attention.self.query.weight is frozen
bert.encoder.layer.0.attention.self.query.bias is frozen
bert.encoder.layer.0.attention.self.key.weight is frozen
bert.encoder.layer.0.attention.self.key.bias is frozen
bert.encoder.layer.0.attention.self.value.weight is frozen
bert.encoder.layer.0.attention.self.value.bias is frozen
bert.encoder.layer.0.attention.output.dense.weight is frozen
bert.encoder.layer.0.attention.output.dense.bias is frozen
bert.encoder.layer.0.attention.output.LayerNorm.weight is frozen
bert.encoder.layer.0.attention.output.LayerNorm.bias is frozen
bert.encoder.layer.0.attention.output.adapters.arxiv.adapter_down.0.weight is trainable
bert.encoder.layer.0.attention.output.adapters.arxiv.adapter_down.0.bias is tra

In [9]:
dataset = load_dataset("imdb")
print(dataset)
imdb_test = dataset['test']
imdb_train = dataset['train']

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [10]:
# Bert-base-cased, tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Train on IMDB dataset
imdb_train_encodings = tokenizer(list(imdb_train['text']), truncation=True, padding=True)


In [11]:
imdb_train_labels = torch.tensor(list(imdb_train['label']))
imdb_train_dataset = TensorDataset(torch.tensor(imdb_train_encodings['input_ids']),
                                   torch.tensor(imdb_train_encodings['attention_mask']),
                                   imdb_train_labels)
imdb_train_loader = DataLoader(imdb_train_dataset, batch_size=16, shuffle=True)
print(imdb_train_labels)

tensor([0, 0, 0,  ..., 1, 1, 1])


In [12]:
# Test on IMDB dataset
imdb_test_encodings = tokenizer(list(imdb_test['text']), truncation=True, padding=True)
imdb_test_labels = torch.tensor(list(imdb_test['label']))
imdb_test_dataset = TensorDataset(torch.tensor(imdb_test_encodings['input_ids']),
                                  torch.tensor(imdb_test_encodings['attention_mask']),
                                  imdb_test_labels)
imdb_test_loader = DataLoader(imdb_test_dataset, batch_size=16, shuffle=False)

In [13]:
optimizer = AdamW(model.parameters(), lr=1e-5)

model.to(device)
train_losses = []
train_accuracies = []
test_accuracies = []

for epoch in range(10):
    model.train()
    train_loss = 0
    train_correct = 0
    for batch in imdb_train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        train_loss += loss.item()
        train_correct += (logits.argmax(dim=1) == labels).float().sum().item()
        loss.backward()
        optimizer.step()
    train_loss /= len(imdb_train_loader)
    train_accuracy = train_correct / len(imdb_train)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    model.eval()
    with torch.no_grad():
        test_correct = 0
        for batch in imdb_test_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            test_correct += (logits.argmax(dim=1) == labels).float().sum().item()
        test_accuracy = test_correct / len(imdb_test)
        test_accuracies.append(test_accuracy)
    print(f"Epoch {epoch + 1}: train_loss={train_loss:.4f}, train_accuracy={train_accuracy:.4f}, test_accuracy={test_accuracy:.4f}")

torch.save(model, f"adapter_bert_arxiv.pt")



Epoch 1: train_loss=0.6583, train_accuracy=0.6057, test_accuracy=0.7216
Epoch 2: train_loss=0.4238, train_accuracy=0.8128, test_accuracy=0.8628
Epoch 3: train_loss=0.3284, train_accuracy=0.8617, test_accuracy=0.8808
Epoch 4: train_loss=0.2999, train_accuracy=0.8779, test_accuracy=0.8856
Epoch 5: train_loss=0.2824, train_accuracy=0.8857, test_accuracy=0.8958
Epoch 6: train_loss=0.2703, train_accuracy=0.8901, test_accuracy=0.8981
Epoch 7: train_loss=0.2618, train_accuracy=0.8962, test_accuracy=0.9047
Epoch 8: train_loss=0.2538, train_accuracy=0.8979, test_accuracy=0.9076
Epoch 9: train_loss=0.2457, train_accuracy=0.9011, test_accuracy=0.9088
Epoch 10: train_loss=0.2405, train_accuracy=0.9063, test_accuracy=0.9100


PicklingError: Can't pickle <class 'abc.BertForSequenceClassification'>: attribute lookup BertForSequenceClassification on abc failed

In [15]:
# For the sake of this demonstration an example path for loading and storing is given below
save_path = os.path.join(os.getcwd(), "adapter_bert")

# Save models
model.save_pretrained(save_path)
# Save adapter
model.save_adapter(save_path, adapter_name="arxiv")

In [17]:
saved_model = AutoAdapterModel.from_pretrained(save_path)
saved_model.load_adapter(save_path)

for param in model.bert.parameters():
    param.requires_grad = False
model.eval()
with torch.no_grad():
    imdb_correct = 0
    with tqdm(total=len(imdb_test_loader), desc='testing') as pbar:
        for batch in imdb_test_loader:
            input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            imdb_correct += (logits.argmax(dim=1) == labels).float().sum().item()
            pbar.update(1)
        imdb_accuracy = imdb_correct / len(imdb_test)

print('imdb bert vanilla accuracy:', imdb_accuracy)

RuntimeError: Error(s) in loading state_dict for BertAdapterModel:
	size mismatch for heads.default.1.weight: copying a param with shape torch.Size([2, 768]) from checkpoint, the shape in current model is torch.Size([10, 768]).
	size mismatch for heads.default.1.bias: copying a param with shape torch.Size([2]) from checkpoint, the shape in current model is torch.Size([10]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [None]:
# Modify the last layer to perform 2 label classification
model.classifier = torch.nn.Linear(768, 2)
# Freeze the parameters of the pre-trained layers

In [None]:
# Use the modified model for 2 label classification
outputs = model(input_ids, attention_mask)