
Fine-tuning BERT (`BertModel.from_pretrained("bert-base-uncased")`) for humor score regression

amber pan
08/15/2025

In [None]:
!pip install transformers torch -q

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

from torch.optim import AdamW

In [None]:
# Toy record-level training data, written one row per record.
# Disabled transcript variant that includes a "[pause]" marker:
# "transcript": ["I like cat", "You are [pause] funny", "It rained"],
record_data = pd.DataFrame(
    [
        [0, "I like cats", 3, 2.0],
        [1, "You are funny", 1, 3.0],
        [2, "It rained", 2, 1.0],
    ],
    columns=["record_id", "transcript", "humor_type", "humor_score"],
)

# Two numeric audio features per record, keyed by the same record_id.
audio_features = pd.DataFrame(
    [
        [0, 0.08, 0.51],
        [1, 0.51, 0.19],
        [2, 0.17, 0.67],
    ],
    columns=["record_id", "audio_feature1", "audio_feature2"],
)

# Join text labels and audio features on the shared record_id key.
df = record_data.merge(audio_features, on="record_id")
df


In [None]:
# tokenizes the text and prepares the audio features
class HumorDataset(Dataset):
    """Map-style dataset that pairs a tokenized transcript with audio features.

    Each item is built from one row of ``df``, which must provide the columns
    ``transcript``, ``audio_feature1``, ``audio_feature2`` and ``humor_score``.

    Args:
        df: DataFrame holding one record per row.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose result
            exposes ``"input_ids"`` and ``"attention_mask"`` tensors with a
            leading dimension of size 1.
        max_len: Length every transcript is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len=32):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # One sample per DataFrame row.
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tensors for the record at positional index ``idx``."""
        record = self.df.iloc[idx]

        encoded = self.tokenizer(
            record["transcript"],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # Drop the leading size-1 dimension the tokenizer adds for a single text.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

        # Audio features keep a leading singleton axis: shape (1, 2).
        audio_values = [record["audio_feature1"], record["audio_feature2"]]
        sample["audio_feats"] = torch.tensor([audio_values], dtype=torch.float)

        # Regression target as a 0-dim float tensor.
        sample["score"] = torch.tensor(record["humor_score"], dtype=torch.float)
        return sample

In [None]:
# This cell defines a multimodal deep learning model for humor regression that combines
# text features (from BERT) with numerical audio features in order to predict a humor score.

# CrossModalAttention lets a query sequence from one modality attend over a key/value sequence
# from another (multi-head attention, then a residual connection and layer normalization).
# Note: HumorRegressor builds an instance of it, but its forward() does not call it.
class CrossModalAttention(nn.Module):
    """Multi-head attention from one sequence onto another, with residual + LayerNorm.

    Args:
        embed_dim: Size of the last dimension of both the query and the
            key/value tensors.
        num_heads: Number of attention heads (defaults to 4, the value that was
            previously hard-coded). ``nn.MultiheadAttention`` requires
            ``embed_dim`` to be divisible by it.
    """

    def __init__(self, embed_dim, num_heads=4):
        super().__init__()
        # batch_first=True: inputs and outputs are laid out as (batch, seq, embed_dim).
        self.attn = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, query, key_value):
        """Attend ``query`` over ``key_value`` and return a tensor shaped like ``query``.

        ``key_value`` serves as both the keys and the values. The attention
        output is added back to ``query`` (residual connection) before layer
        normalization; the attention weights are discarded.
        """
        attn_output, _ = self.attn(query, key_value, key_value)
        return self.norm(query + attn_output)


class HumorRegressor(nn.Module):
    """Regress a humor score from a BERT-encoded transcript plus audio features.

    The hidden state of the first ([CLS]) token is concatenated with the audio
    feature vector, passed through a fusion layer with a ReLU non-linearity,
    and mapped to a single scalar per sample.

    Args:
        text_hidden_size: Hidden size of the text encoder output; must match
            the loaded BERT model.
        audio_feat_size: Number of audio features per sample.
        fusion_dim: Width of the fused representation.
    """

    def __init__(self, text_hidden_size=768, audio_feat_size=2, fusion_dim=128):
        super().__init__()
        # Pre-trained BERT text encoder.
        self.text_model = BertModel.from_pretrained("bert-base-uncased")

        # NOTE: audio_proj and cross_attn are constructed here but forward()
        # does not call them. They are kept so the module's attributes and
        # state_dict keys stay unchanged.
        self.audio_proj = nn.Linear(audio_feat_size, text_hidden_size)
        self.cross_attn = CrossModalAttention(text_hidden_size)

        # Fusion layer over the concatenated [CLS] vector and audio features.
        self.fusion = nn.Linear(text_hidden_size + audio_feat_size, fusion_dim)

        # Final regression head: one score per sample.
        self.regressor = nn.Linear(fusion_dim, 1)

    def forward(self, input_ids, attention_mask, audio_feats):
        """Return predicted humor scores as a 1-D tensor of shape (batch_size,).

        Args:
            input_ids: Token ids, shape (batch_size, seq_len).
            attention_mask: Attention mask matching ``input_ids``.
            audio_feats: Audio features shaped (batch_size, 1, audio_feat_size)
                or (batch_size, audio_feat_size).
        """
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_hidden = text_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Use the first ([CLS]) token as the pooled text representation.
        pooled = text_hidden[:, 0, :]  # (batch_size, hidden_size)

        # Flatten everything after the batch dimension. Unlike squeeze(1), this
        # also handles already-flat input and audio_feat_size == 1.
        audio_feats = audio_feats.reshape(audio_feats.size(0), -1)

        merged_features = torch.cat((pooled, audio_feats), dim=1)

        # Without a non-linearity here, fusion + regressor would collapse into
        # a single affine map and fusion_dim would have no effect.
        fused_hidden = torch.relu(self.fusion(merged_features))  # (batch_size, fusion_dim)

        score = self.regressor(fused_hidden)  # (batch_size, 1)
        return score.squeeze(1)


In [None]:
# ----------- Step 4: Training Loop ----------------

def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean loss per sample.

    Args:
        model: Module called as ``model(input_ids, attention_mask, audio_feats)``.
        dataloader: Iterable of batches; each batch is a dict with the keys
            ``"input_ids"``, ``"attention_mask"``, ``"audio_feats"`` and ``"score"``.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss called as ``criterion(outputs, labels)``. The epoch
            average assumes it returns the mean over the batch (the default
            reduction of ``nn.MSELoss``).
        device: Device the batch tensors are moved to.

    Returns:
        The loss averaged over all samples seen, as a float, or NaN when the
        dataloader yields no batches.
    """
    model.train()
    weighted_loss_sum = 0.0
    num_samples = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        audio_feats = batch["audio_feats"].to(device)
        labels = batch["score"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, audio_feats)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a shorter final batch does not
        # skew the epoch average.
        batch_size = labels.size(0)
        weighted_loss_sum += loss.item() * batch_size
        num_samples += batch_size

    if num_samples == 0:
        # Nothing was trained on; avoid a ZeroDivisionError.
        return float("nan")
    return weighted_loss_sum / num_samples

In [None]:
# ----------- Step 5: Setup and Run ----------------

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer loaded from the same "bert-base-uncased" checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Wrap the merged training frame and serve it in shuffled batches of two.
dataset = HumorDataset(df, tokenizer)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

In [None]:
# Re-evaluate the device choice and echo it as the cell output.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [None]:

# Seed PyTorch's global RNG before the model is built so the randomly
# initialised layers start from the same weights on every run.
SEED = 42
torch.manual_seed(SEED)

model = HumorRegressor().to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.MSELoss()

# Display the number of trainable parameters (a bare model.parameters() only
# echoes a generator object).
sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
# Three passes over the training data; print the loss that train() returns for each.
for epoch in range(3):
    loss = train(
        model=model,
        dataloader=dataloader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")


Epoch 1, Loss: 4.8145
Epoch 2, Loss: 2.6573
Epoch 3, Loss: 1.3137


In [None]:
# Toy record-level validation data, written one row per record.
record_data_val = pd.DataFrame(
    [
        [0, "I like dogs", 3, 2.0],
        [1, "He is funny", 1, 3.0],
    ],
    columns=["record_id", "transcript", "humor_type", "humor_score"],
)

# Matching audio features for the validation records, keyed by record_id.
audio_features_val = pd.DataFrame(
    [
        [0, 0.08, 0.51],
        [1, 0.51, 0.19],
    ],
    columns=["record_id", "audio_feature1", "audio_feature2"],
)


In [None]:
# Model evaluation
from sklearn.metrics import mean_squared_error, r2_score
import torch

# Build the validation frame and dataset the same way as for training.
df_val = record_data_val.merge(audio_features_val, on="record_id")
val_dataset = HumorDataset(df_val, tokenizer)

# No shuffling: validation batches follow the row order of df_val.
val_dataloader = DataLoader(dataset=val_dataset, batch_size=2, shuffle=False)

# Switch the model to evaluation mode before predicting.
model.eval()

true_labels, predicted_labels = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        audio_feats = batch["audio_feats"].to(device)
        labels = batch["score"].to(device)  # ground-truth humor scores

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            audio_feats=audio_feats,
        )

        # Collect predictions and targets on the CPU as NumPy scalars.
        predicted_labels.extend(outputs.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Regression metrics over the whole validation set.
mse = mean_squared_error(true_labels, predicted_labels)
r2 = r2_score(true_labels, predicted_labels)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared: {r2:.4f}")

Mean Squared Error (MSE): 1.0536
R-squared: -3.2142
