In [39]:
!pip install git+https://github.com/huggingface/peft.git
import json
import gzip
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
import torch.nn as nn
from tqdm import tqdm


# Load pretrained model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/huggingface/peft.git
  Cloning https://github.com/huggingface/peft.git to /private/var/folders/7f/vfs9n6rn3m706bkv3y_rsfk40000gn/T/pip-req-build-8wze1x2d
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /private/var/folders/7f/vfs9n6rn3m706bkv3y_rsfk40000gn/T/pip-req-build-8wze1x2d
  Resolved https://github.com/huggingface/peft.git to commit 2ee02af9d445cda0410c43a6e72160df59ad971c
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


loading configuration file config.json from cache at /Users/shubing/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.49.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/shubing/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/vocab.txt
loading file tokenizer.json from cache at /Users/shubing/.cache/huggingface/hub/models--dis

In [40]:
# =============================
# Dataset Preparation
# =============================
class RedTeamDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = f"### Task:\n{row['task_description']}\n\n### Transcript:\n{row['transcript']}"

        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        item = {key: val.squeeze(0) for key, val in inputs.items()}
        item['rating'] = torch.tensor(row['rating'], dtype=torch.long)
        item['harmlessness_score'] = torch.tensor(row['task_descripton_harmlessness_score'], dtype=torch.float32)
        return item


# =============================
# Multi-task Loss
# =============================
class MultiTaskLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss_cls = nn.CrossEntropyLoss()
        self.loss_reg = nn.MSELoss()

    def forward(self, logits_cls, labels_cls, logits_reg, labels_reg):
        loss_c = self.loss_cls(logits_cls, labels_cls)
        loss_r = self.loss_reg(logits_reg.squeeze(), labels_reg)
        return 0.5 * loss_c + 0.5 * loss_r
    

# =============================
# Model Wrapper with Dual Head
# =============================
class DualHeadModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base = base_model
        hidden_size = base_model.config.hidden_size
        self.regression_head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state[:, 0, :]
        logits_cls = outputs.logits
        logits_reg = self.regression_head(last_hidden)
        return logits_cls, logits_reg

In [41]:
def load_data(file_path):
    data = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        f.read(1)
        buffer = ""
        for line in f:
            line = line.strip()
            if line == "]":
                break
            if line.endswith(','):
                line = line[:-1]
            buffer += line
            try:
                record = json.loads(buffer)
                data.append(record)
                buffer = ""
            except json.JSONDecodeError:
                continue
    return pd.DataFrame(data)


file_path = "data/red-team-attempts/red_team_attempts.jsonl.gz"
df = load_data(file_path)

In [42]:
# =============================
# Training Setup
# =============================
from transformers import logging
logging.set_verbosity_info()

file_path = "data/red-team-attempts/red_team_attempts.jsonl.gz"
df = load_data(file_path)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
base_model = AutoModel.from_pretrained("bert-base-uncased")

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none"
)

model_peft = get_peft_model(base_model, peft_config)
model = DualHeadModel(model_peft)

train_dataset = RedTeamDataset(df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = MultiTaskLoss()


loading configuration file config.json from cache at /Users/shubing/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/shubing/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/vocab.t

In [43]:
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


DualHeadModel(
  (base): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features

In [34]:
print(df.columns.tolist())

['transcript', 'min_harmlessness_score_transcript', 'num_params', 'model_type', 'rating', 'task_description', 'task_descripton_harmlessness_score', 'red_team_member_id', 'is_upworker', 'tags']


In [44]:
for epoch in range(3):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch")

    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        rating = batch['rating'].to(device)
        harmlessness_score = batch['harmlessness_score'].to(device)

        logits_cls, logits_reg = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_fn(logits_cls, rating, logits_reg, harmlessness_score)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} completed. Avg loss: {avg_loss:.4f}")

print("Training completed.")

Epoch 1:   0%|          | 0/2435 [00:11<?, ?batch/s]


AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'logits'