In [2]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: Define the Model Architecture
class SentenceTransformerMTL(nn.Module):
    def __init__(self, model_name, num_classes):
        super(SentenceTransformerMTL, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.transformer.config.hidden_size, num_classes)  # Classification head

    def forward(self, input_ids, attention_mask):
        # Get sentence embeddings
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = transformer_outputs.last_hidden_state.mean(dim=1)  # Mean pooling for sentence embeddings

        # Classification output
        class_logits = self.classifier(embeddings)
        return embeddings, class_logits

# Step 2: Prepare Synthetic Dataset for Classification
class SyntheticDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len=128):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return input_ids, attention_mask, label

# Sample sentences and labels
sentences = ["I love this product!", "It's okay, nothing special.", "This is disappointing.",
             "Amazing experience!", "Not worth the price.", "Average at best."]
labels = [2, 1, 0, 2, 0, 1]  # 2: positive, 1: neutral, 0: negative

# Initialize tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare dataset and dataloaders
dataset = SyntheticDataset(sentences, labels, tokenizer)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Step 3: Initialize Model, Loss Function, Optimizer
num_classes = 3
model = SentenceTransformerMTL(model_name, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Step 4: Training Loop
def train_model(model, data_loader, criterion, optimizer, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for input_ids, attention_mask, labels in data_loader:
            optimizer.zero_grad()
            _, class_logits = model(input_ids, attention_mask)
            loss = criterion(class_logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer)

# Step 5: Testing/Evaluation
def evaluate_model(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            _, class_logits = model(input_ids, attention_mask)
            preds = torch.argmax(class_logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model
evaluate_model(model, train_loader)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1/3, Loss: 1.0618
Epoch 2/3, Loss: 0.9088
Epoch 3/3, Loss: 0.6538
Test Accuracy: 1.0000


In [3]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Step 1: Define the Model Architecture with Dual Heads
class SentenceTransformerMTL(nn.Module):
    def __init__(self, model_name, num_classes_task_a, num_classes_task_b):
        super(SentenceTransformerMTL, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.classifier_task_a = nn.Linear(self.transformer.config.hidden_size, num_classes_task_a)  # Task A head
        self.classifier_task_b = nn.Linear(self.transformer.config.hidden_size, num_classes_task_b)  # Task B head

    def forward(self, input_ids, attention_mask):
        # Get sentence embeddings
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = transformer_outputs.last_hidden_state.mean(dim=1)  # Mean pooling for sentence embeddings

        # Task-specific outputs
        class_logits_task_a = self.classifier_task_a(embeddings)
        class_logits_task_b = self.classifier_task_b(embeddings)
        return embeddings, class_logits_task_a, class_logits_task_b

# Step 2: Prepare Synthetic Dataset for Both Tasks
class MultiTaskDataset(Dataset):
    def __init__(self, sentences, labels_task_a, labels_task_b, tokenizer, max_len=128):
        self.sentences = sentences
        self.labels_task_a = labels_task_a
        self.labels_task_b = labels_task_b
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label_task_a = self.labels_task_a[idx]
        label_task_b = self.labels_task_b[idx]
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return input_ids, attention_mask, label_task_a, label_task_b

# Sample sentences and labels
sentences = ["I love this product!", "It's okay, nothing special.", "This is disappointing.",
             "Amazing experience!", "Not worth the price.", "Average at best."]
labels_task_a = [2, 1, 0, 2, 0, 1]  # Task A labels (e.g., product categories)
labels_task_b = [2, 1, 0, 2, 0, 1]  # Task B labels (e.g., 2: positive, 1: neutral, 0: negative)

# Initialize tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare dataset and dataloaders
dataset = MultiTaskDataset(sentences, labels_task_a, labels_task_b, tokenizer)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Step 3: Initialize Model, Loss Functions, Optimizer
num_classes_task_a = 3  # Number of classes for Task A
num_classes_task_b = 3  # Number of classes for Task B (Sentiment Analysis)
model = SentenceTransformerMTL(model_name, num_classes_task_a, num_classes_task_b)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Step 4: Training Loop for Multi-Task Learning
def train_model(model, data_loader, criterion, optimizer, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for input_ids, attention_mask, labels_task_a, labels_task_b in data_loader:
            optimizer.zero_grad()
            _, class_logits_task_a, class_logits_task_b = model(input_ids, attention_mask)

            # Compute loss for each task
            loss_task_a = criterion(class_logits_task_a, labels_task_a)
            loss_task_b = criterion(class_logits_task_b, labels_task_b)
            loss = loss_task_a + loss_task_b  # Combined loss

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer)

# Step 5: Testing/Evaluation
def evaluate_model(model, data_loader):
    model.eval()
    all_preds_task_a, all_preds_task_b, all_labels_task_a, all_labels_task_b = [], [], [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels_task_a, labels_task_b in data_loader:
            _, class_logits_task_a, class_logits_task_b = model(input_ids, attention_mask)
            preds_task_a = torch.argmax(class_logits_task_a, dim=1)
            preds_task_b = torch.argmax(class_logits_task_b, dim=1)

            all_preds_task_a.extend(preds_task_a.cpu().numpy())
            all_preds_task_b.extend(preds_task_b.cpu().numpy())
            all_labels_task_a.extend(labels_task_a.cpu().numpy())
            all_labels_task_b.extend(labels_task_b.cpu().numpy())

    accuracy_task_a = accuracy_score(all_labels_task_a, all_preds_task_a)
    accuracy_task_b = accuracy_score(all_labels_task_b, all_preds_task_b)
    print(f"Task A Accuracy: {accuracy_task_a:.4f}")
    print(f"Task B Accuracy (Sentiment): {accuracy_task_b:.4f}")

# Evaluate the model
evaluate_model(model, train_loader)




Epoch 1/3, Loss: 2.2197
Epoch 2/3, Loss: 1.9466
Epoch 3/3, Loss: 1.6478
Task A Accuracy: 1.0000
Task B Accuracy (Sentiment): 0.8333


In [5]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Define the Model Architecture with Dual Heads
class SentenceTransformerMTL(nn.Module):
    def __init__(self, model_name, num_classes_task_a, num_classes_task_b):
        super(SentenceTransformerMTL, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.classifier_task_a = nn.Linear(self.transformer.config.hidden_size, num_classes_task_a)  # Task A head
        self.classifier_task_b = nn.Linear(self.transformer.config.hidden_size, num_classes_task_b)  # Task B head

    def forward(self, input_ids, attention_mask):
        # Get sentence embeddings
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = transformer_outputs.last_hidden_state.mean(dim=1)  # Mean pooling for sentence embeddings

        # Task-specific outputs
        class_logits_task_a = self.classifier_task_a(embeddings)
        class_logits_task_b = self.classifier_task_b(embeddings)
        return embeddings, class_logits_task_a, class_logits_task_b

# Prepare Synthetic Dataset for Both Tasks
class MultiTaskDataset(Dataset):
    def __init__(self, sentences, labels_task_a, labels_task_b, tokenizer, max_len=128):
        self.sentences = sentences
        self.labels_task_a = labels_task_a
        self.labels_task_b = labels_task_b
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label_task_a = self.labels_task_a[idx]
        label_task_b = self.labels_task_b[idx]
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return input_ids, attention_mask, label_task_a, label_task_b

# Sample sentences and labels
sentences = ["I love this product!", "It's okay, nothing special.", "This is disappointing.",
             "Amazing experience!", "Not worth the price.", "Average at best."]
labels_task_a = [2, 1, 0, 2, 0, 1]  # Task A labels (e.g., product categories)
labels_task_b = [2, 1, 0, 2, 0, 1]  # Task B labels (e.g., 2: positive, 1: neutral, 0: negative)

# Initialize tokenizer and dataset
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = MultiTaskDataset(sentences, labels_task_a, labels_task_b, tokenizer)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Initialize Model and Layer-Wise Learning Rates
num_classes_task_a = 3
num_classes_task_b = 3
model = SentenceTransformerMTL(model_name, num_classes_task_a, num_classes_task_b)

# Define learning rates
base_lr = 1e-5  # Lowest learning rate for earlier layers
head_lr = 1e-4  # Higher learning rate for task-specific heads

# Define optimizer with layer-wise learning rates using DistilBERT's layer structure
optimizer = optim.Adam([
    {"params": model.transformer.transformer.layer[:2].parameters(), "lr": base_lr * 0.5},  # Lower layers
    {"params": model.transformer.transformer.layer[2:4].parameters(), "lr": base_lr},       # Middle layers
    {"params": model.transformer.transformer.layer[4:].parameters(), "lr": base_lr * 1.5},  # Higher layers
    {"params": model.classifier_task_a.parameters(), "lr": head_lr},                    # Task A head
    {"params": model.classifier_task_b.parameters(), "lr": head_lr},                    # Task B head
])

# Training Loop for Multi-Task Learning with Layer-Wise Learning Rates
def train_model(model, data_loader, criterion, optimizer, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for input_ids, attention_mask, labels_task_a, labels_task_b in data_loader:
            optimizer.zero_grad()
            _, class_logits_task_a, class_logits_task_b = model(input_ids, attention_mask)

            # Compute loss for each task
            loss_task_a = criterion(class_logits_task_a, labels_task_a)
            loss_task_b = criterion(class_logits_task_b, labels_task_b)
            loss = loss_task_a + loss_task_b  # Combined loss

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train the model
train_model(model, train_loader, criterion, optimizer)

# Testing/Evaluation Function
def evaluate_model(model, data_loader):
    model.eval()
    all_preds_task_a, all_preds_task_b, all_labels_task_a, all_labels_task_b = [], [], [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels_task_a, labels_task_b in data_loader:
            _, class_logits_task_a, class_logits_task_b = model(input_ids, attention_mask)
            preds_task_a = torch.argmax(class_logits_task_a, dim=1)
            preds_task_b = torch.argmax(class_logits_task_b, dim=1)

            all_preds_task_a.extend(preds_task_a.cpu().numpy())
            all_preds_task_b.extend(preds_task_b.cpu().numpy())
            all_labels_task_a.extend(labels_task_a.cpu().numpy())
            all_labels_task_b.extend(labels_task_b.cpu().numpy())

    accuracy_task_a = accuracy_score(all_labels_task_a, all_preds_task_a)
    accuracy_task_b = accuracy_score(all_labels_task_b, all_preds_task_b)
    print(f"Task A Accuracy: {accuracy_task_a:.4f}")
    print(f"Task B Accuracy (Sentiment): {accuracy_task_b:.4f}")

# Evaluate the model
evaluate_model(model, train_loader)


Epoch 1/3, Loss: 2.2720
Epoch 2/3, Loss: 2.0889
Epoch 3/3, Loss: 1.9149
Task A Accuracy: 0.6667
Task B Accuracy (Sentiment): 1.0000
