<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/_Full_training_pipeline_with_multitask_learning%2C_pruning%2C_and_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install optuna

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader, DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import wordnet
from torch.optim.lr_scheduler import LambdaLR
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
import torch.nn.utils.prune as prune
import optuna

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes text records lazily, one per access.

    Every record in ``data`` must provide a ``"text"`` entry and, when
    ``for_classification`` is true, a ``"label"`` entry as well.

    Args:
        data: Indexable collection of records (must support ``len`` and ``[idx]``).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``; its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
        for_classification: When true, items also carry the record's label.
    """

    def __init__(self, data, tokenizer, max_length=4096, for_classification=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.for_classification = for_classification

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            ``(input_ids, attention_mask)`` or, when ``for_classification`` is
            set, ``(input_ids, attention_mask, label)``.
        """
        record = self.data[idx]
        encoding = self.tokenizer(record["text"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # Remove only the leading axis (presumably the size-1 batch axis the
        # tokenizer adds for a single text). A bare squeeze() would also drop
        # the sequence axis whenever max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.for_classification:
            return input_ids, attention_mask, record["label"]
        return input_ids, attention_mask

# Define LongformerFoundationModel class
class LongformerFoundationModel(nn.Module):
    """Bundle a pretrained Longformer encoder with its matching tokenizer.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for
            both the encoder and the tokenizer.
    """

    def __init__(self, model_name="allenai/longformer-base-4096"):
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder on a batch.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Whatever the wrapped encoder returns.
        """
        # Use sliding window attention for long sequences: only the first
        # token of each sequence gets global attention. Marking every real
        # token as global (the previous behaviour) makes all tokens attend to
        # all others and defeats the windowed attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        return self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

# MultiTask Adapter with adapters for task-specific tuning
class MultiTaskAdapterFoundationModel(LongformerFoundationModel):
    """Shared Longformer encoder with one linear classification head per task.

    Args:
        model_name: Checkpoint identifier forwarded to the base class.
        tasks: Mapping of task name to number of labels; ``None`` means no heads.
        adapter_dim: Accepted for interface compatibility. NOTE(review): no
            adapter layers are built from it in this class.
    """

    def __init__(self, model_name="allenai/longformer-base-4096", tasks=None, adapter_dim=64):
        super().__init__(model_name)
        self.tasks = tasks or {}
        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.model.config.hidden_size, num_labels) for task, num_labels in self.tasks.items()
        })

    def forward(self, input_ids, attention_mask, task, labels=None):
        """Classify a batch with the head registered for ``task``.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            task: Name of the classification head to apply.
            labels: Optional targets; when given, a cross-entropy loss is computed.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if no labels were passed.

        Raises:
            KeyError: If ``task`` has no registered classifier.
        """
        if task not in self.classifiers:
            raise KeyError(f"Unknown task {task!r}; available tasks: {list(self.classifiers.keys())}")
        head = self.classifiers[task]

        # Only the first (CLS) token, which feeds the classifier, gets global
        # attention; marking every real token as global would turn the
        # encoder's windowed attention into full attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = head(hidden_states[:, 0, :])  # CLS token

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, head.out_features), labels.view(-1))
        return loss, logits

# Distributed DataParallel initialization
def init_ddp(rank, world_size, model):
    """Initialise the default process group (if needed) and wrap ``model`` in DDP.

    Args:
        rank: Rank of the calling process.
        world_size: Total number of participating processes.
        model: Module to wrap.

    Returns:
        The ``DistributedDataParallel`` wrapper around ``model``.
    """
    # Only fill in rendezvous defaults; an address/port already exported by a
    # launcher or the user must not be overwritten.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    if not dist.is_initialized():
        dist.init_process_group("gloo", rank=rank, world_size=world_size)
    return DDP(model, find_unused_parameters=True)

# Train model with adapters, logging, and scheduler
def train_with_scheduler(model, train_data, epochs, batch_size, learning_rate, log_dir=None,
                         num_warmup_steps=0, num_training_steps=1000, rank=0):
    """Train a multi-task model with mixed precision and a warmup/decay LR schedule.

    Args:
        model: Model exposing ``tokenizer`` and callable as
            ``model(input_ids, attention_mask, task, labels=...)`` returning
            ``(loss, logits)``.
        train_data: Mapping of task name to a list of ``{"text", "label"}`` records.
        epochs: Number of passes over every task's data.
        batch_size: Batch size of each task's loader.
        learning_rate: Base learning rate for AdamW.
        log_dir: TensorBoard directory; logging is skipped when falsy.
        num_warmup_steps: Warmup steps passed to ``get_scheduler``.
        num_training_steps: Total steps passed to ``get_scheduler``.
        rank: Rank of this process; only rank 0 writes logs.
    """
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)
    writer = SummaryWriter(log_dir) if rank == 0 and log_dir else None
    scaler = GradScaler()

    # Datasets and loaders do not change between epochs, so build them once.
    loaders = {}
    for task, task_data in train_data.items():
        train_dataset = TextDataset(task_data, model.tokenizer, for_classification=True)
        sampler = DistributedSampler(train_dataset, num_replicas=1, rank=rank)
        loaders[task] = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, sampler=sampler)

    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for task, train_dataloader in loaders.items():
                # Without set_epoch the sampler yields the same order every epoch.
                train_dataloader.sampler.set_epoch(epoch)

                for input_ids, attention_mask, labels in train_dataloader:
                    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                    optimizer.zero_grad()
                    with autocast():
                        loss, logits = model(input_ids, attention_mask, task, labels=labels)

                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()

                    total_loss += loss.item()
                    num_batches += 1

            if writer is not None:
                # Average over every batch of every task; dividing by the last
                # task's loader length alone misreports the mean.
                writer.add_scalar("Loss/epoch", total_loss / max(1, num_batches), epoch)
    finally:
        if writer is not None:
            writer.close()

# Learning rate scheduler
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Create a ``LambdaLR`` that ramps up linearly, then decays linearly to zero.

    Args:
        optimizer: Optimizer whose learning rate is scaled.
        num_warmup_steps: Steps over which the factor rises from 0 to 1.
        num_training_steps: Step at which the factor reaches 0.

    Returns:
        The configured ``LambdaLR`` scheduler.
    """
    def lr_lambda(current_step):
        warming_up = current_step < num_warmup_steps
        if not warming_up:
            steps_left = float(num_training_steps - current_step)
            decay_span = float(max(1, num_training_steps - num_warmup_steps))
            # Clamp so steps past the planned total keep a zero factor.
            return max(0.0, steps_left / decay_span)
        return float(current_step) / float(max(1, num_warmup_steps))

    return LambdaLR(optimizer, lr_lambda)

# Evaluation with metrics
def evaluate_with_metrics(model, test_data, task, batch_size=32):
    """Evaluate ``model`` on one task and report weighted precision/recall/F1.

    Args:
        model: Model callable as ``model(input_ids, attention_mask, task)``
            returning ``(loss, logits)``.
        test_data: Dataset yielding ``(input_ids, attention_mask, label)`` items.
        task: Name of the task head to evaluate.
        batch_size: Evaluation batch size.

    Returns:
        Tuple ``(precision, recall, f1)``. The model is left in eval mode.
    """
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    model.eval()
    all_labels, all_preds = [], []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            with autocast():
                _, logits = model(input_ids, attention_mask, task)

            predictions = torch.argmax(logits, dim=-1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predictions.cpu().numpy())

    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    # y_true must come first: the weighted average weights classes by their
    # support in the first argument, so swapping the two changes the score.
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return precision, recall, f1

# Example of usage
# This cell only imports `nltk.corpus.wordnet`, so the bare name `nltk` must be
# imported here before calling nltk.download (otherwise NameError).
import nltk

nltk.download('wordnet')

# One tiny labelled example per task, keyed by task name.
train_data = {
    "task1": [{"text": "example sentence for task 1", "label": 0}],
    "task2": [{"text": "example sentence for task 2", "label": 1}]
}
# Task name -> number of labels for that task's classification head.
tasks = {"task1": 2, "task2": 2}
model = MultiTaskAdapterFoundationModel("allenai/longformer-base-4096", tasks).to(device)

train_with_scheduler(model, train_data, epochs=3, batch_size=16, learning_rate=5e-5, num_warmup_steps=100, num_training_steps=1000)

test_data = [{"text": "example test sentence for task 1", "label": 0}]
test_dataset = TextDataset(test_data, model.tokenizer, for_classification=True)

evaluate_with_metrics(model, test_dataset, "task1")

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import wordnet
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.utils.prune as prune
import optuna

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes text records lazily, one per access.

    Every record in ``data`` must provide a ``"text"`` entry and, when
    ``for_classification`` is true, a ``"label"`` entry as well.

    Args:
        data: Indexable collection of records (must support ``len`` and ``[idx]``).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``; its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
        for_classification: When true, items also carry the record's label.
    """

    def __init__(self, data, tokenizer, max_length=2048, for_classification=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.for_classification = for_classification

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            ``(input_ids, attention_mask)`` or, when ``for_classification`` is
            set, ``(input_ids, attention_mask, label)``.
        """
        record = self.data[idx]
        encoding = self.tokenizer(record["text"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # Remove only the leading axis (presumably the size-1 batch axis the
        # tokenizer adds for a single text). A bare squeeze() would also drop
        # the sequence axis whenever max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.for_classification:
            return input_ids, attention_mask, record["label"]
        return input_ids, attention_mask

# Define LongformerFoundationModel class
class LongformerFoundationModel(nn.Module):
    """Bundle a pretrained Longformer encoder with its matching tokenizer.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for
            both the encoder and the tokenizer.
    """

    def __init__(self, model_name="allenai/longformer-base-4096"):
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder on a batch.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Whatever the wrapped encoder returns.
        """
        # Use sliding window attention for long sequences: only the first
        # token of each sequence gets global attention. Marking every real
        # token as global (the previous behaviour) makes all tokens attend to
        # all others and defeats the windowed attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        return self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

# Define Adapter class
class Adapter(nn.Module):
    """Residual bottleneck: ``x + dropout(up_proj(relu(down_proj(x))))``.

    Args:
        input_dim: Size of the last dimension of the input (and output).
        adapter_dim: Width of the bottleneck between the two projections.
    """

    def __init__(self, input_dim, adapter_dim=64):
        super().__init__()
        # Sub-modules are created in the same order as before so parameter
        # ordering and seeded initialisation are unchanged.
        self.down_proj = nn.Linear(input_dim, adapter_dim)
        self.up_proj = nn.Linear(adapter_dim, input_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return ``x`` plus the dropout-regularised bottleneck transform of ``x``."""
        bottleneck = self.activation(self.down_proj(x))
        update = self.dropout(self.up_proj(bottleneck))
        return x + update

# Define MultiTaskAdapterFoundationModel class
class MultiTaskAdapterFoundationModel(LongformerFoundationModel):
    """Shared Longformer encoder with one linear classification head per task.

    Args:
        model_name: Checkpoint identifier forwarded to the base class.
        tasks: Mapping of task name to number of labels; ``None`` means no heads.
        adapter_dim: Accepted for interface compatibility. NOTE(review): no
            adapter layers are built from it in this class.
    """

    def __init__(self, model_name="allenai/longformer-base-4096", tasks=None, adapter_dim=64):
        super().__init__(model_name)
        self.tasks = tasks or {}
        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.model.config.hidden_size, num_labels) for task, num_labels in self.tasks.items()
        })

    def forward(self, input_ids, attention_mask, task, labels=None):
        """Classify a batch with the head registered for ``task``.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            task: Name of the classification head to apply.
            labels: Optional targets; when given, a cross-entropy loss is computed.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if no labels were passed.

        Raises:
            KeyError: If ``task`` has no registered classifier.
        """
        if task not in self.classifiers:
            raise KeyError(f"Unknown task {task!r}; available tasks: {list(self.classifiers.keys())}")
        head = self.classifiers[task]

        # Only the first (CLS) token, which feeds the classifier, gets global
        # attention; marking every real token as global would turn the
        # encoder's windowed attention into full attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = head(hidden_states[:, 0, :])  # CLS token

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, head.out_features), labels.view(-1))
        return loss, logits

# Scheduler for learning rate
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Return a ``LambdaLR`` with linear warmup followed by linear decay.

    The multiplier climbs from 0 to 1 across ``num_warmup_steps`` and then
    falls linearly, reaching 0 at ``num_training_steps`` (never going negative).
    """
    def _warmup_factor(step):
        return float(step) / float(max(1, num_warmup_steps))

    def _decay_factor(step):
        span = float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, float(num_training_steps - step) / span)

    def lr_lambda(current_step):
        in_warmup = current_step < num_warmup_steps
        return _warmup_factor(current_step) if in_warmup else _decay_factor(current_step)

    return LambdaLR(optimizer, lr_lambda)

# Train the multitask model with gradient accumulation
def train_with_scheduler(model, train_data, epochs=5, batch_size=4, accumulation_steps=8, learning_rate=5e-5, log_dir="./logs", num_warmup_steps=500, num_training_steps=10000):
    """Train a multi-task model with gradient accumulation and an LR schedule.

    Args:
        model: Model exposing ``tokenizer`` and callable as
            ``model(input_ids, attention_mask, task, labels=...)`` returning
            ``(loss, logits)``.
        train_data: Mapping of task name to a list of ``{"text", "label"}`` records.
        epochs: Number of passes over every task's data.
        batch_size: Batch size of each task's loader.
        accumulation_steps: Batches whose gradients are summed per optimizer step.
        learning_rate: Base learning rate for AdamW.
        log_dir: TensorBoard log directory.
        num_warmup_steps: Warmup steps passed to ``get_scheduler``.
        num_training_steps: Total steps passed to ``get_scheduler``.
    """
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)
    writer = SummaryWriter(log_dir=log_dir)

    # Datasets and loaders do not change between epochs, so build them once.
    loaders = {
        task: DataLoader(TextDataset(task_data, model.tokenizer, for_classification=True), batch_size=batch_size, shuffle=True)
        for task, task_data in train_data.items()
    }

    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for task, train_dataloader in loaders.items():
                # Start each task clean so gradients never leak between tasks.
                optimizer.zero_grad()
                last_idx = len(train_dataloader) - 1
                for batch_idx, batch in enumerate(train_dataloader):
                    input_ids, attention_mask, labels = batch
                    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                    loss, _ = model(input_ids, attention_mask, task, labels=labels)
                    (loss / accumulation_steps).backward()

                    # Also step on the final batch: otherwise a trailing
                    # partial window (or a loader shorter than
                    # accumulation_steps) is never applied to the weights.
                    if (batch_idx + 1) % accumulation_steps == 0 or batch_idx == last_idx:
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()

                    total_loss += loss.item()
                    num_batches += 1

            # Average over every batch of every task, not just the last loader.
            mean_loss = total_loss / max(1, num_batches)
            writer.add_scalar("Loss/train", mean_loss, epoch)
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss}")
    finally:
        writer.close()

# Evaluation function
def evaluate_with_metrics(model, test_data, task, batch_size=32):
    """Evaluate ``model`` on one task and report weighted precision/recall/F1.

    Args:
        model: Model callable as ``model(input_ids, attention_mask, task)``
            returning ``(loss, logits)``.
        test_data: Dataset yielding ``(input_ids, attention_mask, label)`` items.
        task: Name of the task head to evaluate.
        batch_size: Evaluation batch size.

    Returns:
        Tuple ``(precision, recall, f1)``. The model is left in eval mode.
    """
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            _, logits = model(input_ids, attention_mask, task)
            predictions = torch.argmax(logits, dim=-1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predictions.cpu().numpy())
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    # y_true must come first: the weighted average weights classes by their
    # support in the first argument, so swapping the two changes the score.
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return precision, recall, f1

# Synonym replacement for data augmentation
def synonym_replacement(text, n=2):
    """Return ``text`` with up to ``n`` distinct words replaced by a WordNet synonym.

    Words are visited in random order. Every occurrence of a chosen word is
    replaced, and the result is re-joined with single spaces.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of distinct words to replace.

    Returns:
        The augmented text; unchanged (apart from whitespace normalisation)
        when no word has a usable synonym or ``n`` is not positive.
    """
    words = text.split()
    new_words = words.copy()
    # De-duplicate so a repeated word cannot use up several replacement slots.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    num_replaced = 0
    for word in candidates:
        if num_replaced >= n:
            break
        lemma_names = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        # The first lemma is frequently the word itself, which would count as
        # a replacement without changing anything; take the first lemma that
        # differs. Underscores (used by WordNet in multi-word lemmas) become spaces.
        synonym = next(
            (name.replace("_", " ") for name in lemma_names if name.lower() != word.lower()),
            None,
        )
        if synonym is None:
            continue
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1

    return " ".join(new_words)

# Ensure wordnet is downloaded
import nltk
nltk.download('wordnet')

# Demo: train on one sample per task, then evaluate each task head.
if __name__ == "__main__":
    # Seed Python and PyTorch RNGs for reproducibility
    random.seed(42)
    torch.manual_seed(42)

    # Task name -> number of labels, plus one training record per task
    tasks = {"classification": 3, "sentiment": 2}
    train_data = {
        "classification": [{"text": "Sample input for classification", "label": 0}],
        "sentiment": [{"text": "Sample input for sentiment analysis", "label": 1}],
    }

    # Build the model and move it to the selected device in one step
    model = MultiTaskAdapterFoundationModel(model_name="allenai/longformer-base-4096", tasks=tasks).to(device)

    # Fit on the toy data
    train_with_scheduler(
        model,
        train_data,
        epochs=3,
        batch_size=4,
        accumulation_steps=8,
        learning_rate=5e-5,
        num_warmup_steps=100,
        num_training_steps=1000,
    )

    # One held-out record per task, wrapped in a dataset
    eval_records = {
        "classification": [{"text": "Test input", "label": 0}],
        "sentiment": [{"text": "Another test input", "label": 1}],
    }
    test_data = {
        name: TextDataset(records, model.tokenizer, for_classification=True)
        for name, records in eval_records.items()
    }

    # Report metrics for every task
    for task, dataset in test_data.items():
        print(f"Evaluating task: {task}")
        evaluate_with_metrics(model, dataset, task)

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.utils.prune as prune
import optuna

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes text records lazily, one per access.

    Every record in ``data`` must provide a ``"text"`` entry and, when
    ``for_classification`` is true, a ``"label"`` entry as well.

    Args:
        data: Indexable collection of records (must support ``len`` and ``[idx]``).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``; its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
        for_classification: When true, items also carry the record's label.
    """

    def __init__(self, data, tokenizer, max_length=2048, for_classification=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.for_classification = for_classification

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            ``(input_ids, attention_mask)`` or, when ``for_classification`` is
            set, ``(input_ids, attention_mask, label)``.
        """
        item = self.data[idx]
        encoding = self.tokenizer(item["text"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # Remove only the leading axis (presumably the size-1 batch axis the
        # tokenizer adds for a single text). A bare squeeze() would also drop
        # the sequence axis whenever max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.for_classification:
            return input_ids, attention_mask, item["label"]
        return input_ids, attention_mask

# Define LongformerFoundationModel class
class LongformerFoundationModel(nn.Module):
    """Bundle a pretrained Longformer encoder with its matching tokenizer.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for
            both the encoder and the tokenizer.
    """

    def __init__(self, model_name="allenai/longformer-base-4096"):
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder on a batch.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Whatever the wrapped encoder returns.
        """
        # Use sliding window attention for long sequences: only the first
        # token of each sequence gets global attention. Marking every real
        # token as global (the previous behaviour) makes all tokens attend to
        # all others and defeats the windowed attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        return self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

# Define Adapter class
class Adapter(nn.Module):
    """Bottleneck layer with a skip connection around it.

    The input is projected down to ``adapter_dim``, passed through ReLU,
    projected back up to ``input_dim``, dropped out, and added to the input.
    """

    def __init__(self, input_dim, adapter_dim=64):
        super().__init__()
        # Creation order (down, up, activation, dropout) is kept so parameter
        # ordering and seeded initialisation stay the same.
        self.down_proj = nn.Linear(input_dim, adapter_dim)
        self.up_proj = nn.Linear(adapter_dim, input_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply the bottleneck transform and add the result to ``x``."""
        hidden = self.down_proj(x)
        hidden = self.activation(hidden)
        hidden = self.up_proj(hidden)
        return x + self.dropout(hidden)

# Define MultiTaskAdapterFoundationModel class
class MultiTaskAdapterFoundationModel(LongformerFoundationModel):
    """Shared Longformer encoder with one linear classification head per task.

    Args:
        model_name: Checkpoint identifier forwarded to the base class.
        tasks: Mapping of task name to number of labels; ``None`` means no heads.
        adapter_dim: Accepted for interface compatibility. NOTE(review): no
            adapter layers are built from it in this class.
    """

    def __init__(self, model_name="allenai/longformer-base-4096", tasks=None, adapter_dim=64):
        super().__init__(model_name)
        self.tasks = tasks or {}
        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.model.config.hidden_size, num_labels) for task, num_labels in self.tasks.items()
        })

    def forward(self, input_ids, attention_mask, task, labels=None):
        """Classify a batch with the head registered for ``task``.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            task: Name of the classification head to apply.
            labels: Optional targets; when given, a cross-entropy loss is computed.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if no labels were passed.

        Raises:
            KeyError: If ``task`` has no registered classifier.
        """
        if task not in self.classifiers:
            raise KeyError(f"Unknown task {task!r}; available tasks: {list(self.classifiers.keys())}")
        head = self.classifiers[task]

        # Only the first (CLS) token, which feeds the classifier, gets global
        # attention; marking every real token as global would turn the
        # encoder's windowed attention into full attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = head(hidden_states[:, 0, :])  # CLS token

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, head.out_features), labels.view(-1))
        return loss, logits

# Scheduler for learning rate
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Wrap ``optimizer`` in a ``LambdaLR`` using a warmup-then-decay multiplier.

    During the first ``num_warmup_steps`` steps the multiplier is
    ``step / max(1, num_warmup_steps)``; afterwards it shrinks linearly to 0 at
    ``num_training_steps`` and stays at 0 beyond that point.
    """
    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            numerator = float(num_training_steps - current_step)
            denominator = float(max(1, num_training_steps - num_warmup_steps))
            scale = numerator / denominator
            return max(0.0, scale)
        numerator = float(current_step)
        denominator = float(max(1, num_warmup_steps))
        return numerator / denominator

    return LambdaLR(optimizer, lr_lambda)

# Train the multitask model with gradient accumulation
def train_with_scheduler(model, train_data, epochs=5, batch_size=4, accumulation_steps=8, learning_rate=5e-5, log_dir="./logs", num_warmup_steps=500, num_training_steps=10000):
    """Train a multi-task model with gradient accumulation and an LR schedule.

    Args:
        model: Model callable as ``model(input_ids, attention_mask, task,
            labels=...)`` returning ``(loss, logits)``.
        train_data: Mapping of task name to a ready-made dataset yielding
            ``(input_ids, attention_mask, label)`` items.
        epochs: Number of passes over every task's data.
        batch_size: Batch size of each task's loader.
        accumulation_steps: Batches whose gradients are summed per optimizer step.
        learning_rate: Base learning rate for AdamW.
        log_dir: TensorBoard log directory.
        num_warmup_steps: Warmup steps passed to ``get_scheduler``.
        num_training_steps: Total steps passed to ``get_scheduler``.
    """
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)
    writer = SummaryWriter(log_dir=log_dir)

    # Loaders do not change between epochs, so build them once.
    loaders = {
        task: DataLoader(task_dataset, batch_size=batch_size, shuffle=True)
        for task, task_dataset in train_data.items()
    }

    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for task, train_dataloader in loaders.items():
                # Start each task clean so gradients never leak between tasks.
                optimizer.zero_grad()
                last_idx = len(train_dataloader) - 1
                for batch_idx, batch in enumerate(train_dataloader):
                    input_ids, attention_mask, labels = batch
                    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                    loss, _ = model(input_ids, attention_mask, task, labels=labels)
                    (loss / accumulation_steps).backward()

                    # Also step on the final batch: otherwise a trailing
                    # partial window (or a loader shorter than
                    # accumulation_steps) is never applied to the weights.
                    if (batch_idx + 1) % accumulation_steps == 0 or batch_idx == last_idx:
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()

                    total_loss += loss.item()
                    num_batches += 1

            # Average over every batch of every task, not just the last loader.
            mean_loss = total_loss / max(1, num_batches)
            writer.add_scalar("Loss/train", mean_loss, epoch)
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss}")
    finally:
        writer.close()

# Evaluation function
def evaluate_with_metrics(model, test_data, task, batch_size=32):
    """Evaluate ``model`` on one task and report weighted precision/recall/F1.

    Args:
        model: Model callable as ``model(input_ids, attention_mask, task)``
            returning ``(loss, logits)``.
        test_data: Dataset yielding ``(input_ids, attention_mask, label)`` items.
        task: Name of the task head to evaluate.
        batch_size: Evaluation batch size.

    Returns:
        Tuple ``(precision, recall, f1)``. The model is left in eval mode.
    """
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            _, logits = model(input_ids, attention_mask, task)
            predictions = torch.argmax(logits, dim=-1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predictions.cpu().numpy())
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    # y_true must come first: the weighted average weights classes by their
    # support in the first argument, so swapping the two changes the score.
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return precision, recall, f1

# Synonym replacement for data augmentation
def synonym_replacement(text, n=2):
    """Return ``text`` with up to ``n`` distinct words replaced by a WordNet synonym.

    Words are visited in random order. Every occurrence of a chosen word is
    replaced, and the result is re-joined with single spaces.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of distinct words to replace.

    Returns:
        The augmented text; unchanged (apart from whitespace normalisation)
        when no word has a usable synonym or ``n`` is not positive.
    """
    words = text.split()
    new_words = words.copy()
    # De-duplicate so a repeated word cannot use up several replacement slots.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    num_replaced = 0
    for word in candidates:
        if num_replaced >= n:
            break
        lemma_names = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        # The first lemma is frequently the word itself, which would count as
        # a replacement without changing anything; take the first lemma that
        # differs. Underscores (used by WordNet in multi-word lemmas) become spaces.
        synonym = next(
            (name.replace("_", " ") for name in lemma_names if name.lower() != word.lower()),
            None,
        )
        if synonym is None:
            continue
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1

    return " ".join(new_words)

# Ensure wordnet is downloaded
import nltk
nltk.download('wordnet')

# Initialize tokenizer
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

# Base examples (label 0 / label 1)
texts = [
    {"text": "The quick brown fox jumps over the lazy dog.", "label": 0},
    {"text": "A journey of a thousand miles begins with a single step.", "label": 0},
    {"text": "To be or not to be, that is the question.", "label": 0},
    {"text": "All that glitters is not gold.", "label": 0},
    {"text": "The early bird catches the worm.", "label": 1},
    {"text": "A picture is worth a thousand words.", "label": 1},
    {"text": "Better late than never.", "label": 1},
    {"text": "Actions speak louder than words.", "label": 1}
]

# Split BEFORE augmenting: if augmented copies are created first, paraphrases
# of a training sentence can land in the validation set and inflate its
# scores. Stratifying keeps both labels present in the small validation split.
train_originals, val_data = train_test_split(
    texts,
    test_size=0.2,
    random_state=42,
    stratify=[item["label"] for item in texts],
)

# Augment only the training portion: 3 synonym-replaced versions per sentence
augmented_texts = [
    {"text": synonym_replacement(item["text"]), "label": item["label"]}
    for item in train_originals
    for _ in range(3)
]
train_data = train_originals + augmented_texts

# Shuffle the training data so originals and augmented copies are interleaved
random.shuffle(train_data)

# Create datasets and dataloaders
train_dataset = TextDataset(train_data, tokenizer, for_classification=True)
val_dataset = TextDataset(val_data, tokenizer, for_classification=True)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Train model again with the augmented dataset (both task heads share the same data)
model = MultiTaskAdapterFoundationModel(model_name="allenai/longformer-base-4096", tasks={"classification": 3, "sentiment": 2}).to(device)
train_with_scheduler(model, {"classification": train_dataset, "sentiment": train_dataset}, epochs=5, batch_size=4, accumulation_steps=8, learning_rate=5e-5, num_warmup_steps=100, num_training_steps=1000)

# Evaluate model
for task, dataset in {"classification": val_dataset, "sentiment": val_dataset}.items():
    print(f"Evaluating task: {task}")
    evaluate_with_metrics(model, dataset, task)

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import wordnet
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.utils.prune as prune
import optuna

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes text records lazily, one per access.

    Every record in ``data`` must provide a ``"text"`` entry and, when
    ``for_classification`` is true, a ``"label"`` entry as well.

    Args:
        data: Indexable collection of records (must support ``len`` and ``[idx]``).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``; its result
            must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
        for_classification: When true, items also carry the record's label.
    """

    def __init__(self, data, tokenizer, max_length=4096, for_classification=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.for_classification = for_classification

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            ``(input_ids, attention_mask)`` or, when ``for_classification`` is
            set, ``(input_ids, attention_mask, label)``.
        """
        record = self.data[idx]
        encoding = self.tokenizer(record["text"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # Remove only the leading axis (presumably the size-1 batch axis the
        # tokenizer adds for a single text). A bare squeeze() would also drop
        # the sequence axis whenever max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.for_classification:
            return input_ids, attention_mask, record["label"]
        return input_ids, attention_mask

# Define LongformerFoundationModel class
class LongformerFoundationModel(nn.Module):
    """Bundle a pretrained Longformer encoder with its matching tokenizer.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for
            both the encoder and the tokenizer.
    """

    def __init__(self, model_name="allenai/longformer-base-4096"):
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder on a batch.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Whatever the wrapped encoder returns.
        """
        # Use sliding window attention for long sequences: only the first
        # token of each sequence gets global attention. Marking every real
        # token as global (the previous behaviour) makes all tokens attend to
        # all others and defeats the windowed attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        return self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

# Define Adapter class
class Adapter(nn.Module):
    """Down-project, ReLU, up-project, dropout, then add back to the input.

    Args:
        input_dim: Feature size of the input and of the output.
        adapter_dim: Feature size inside the bottleneck.
    """

    def __init__(self, input_dim, adapter_dim=64):
        super().__init__()
        # Same construction order as before: down_proj, up_proj, activation,
        # dropout (keeps parameter ordering and seeded init unchanged).
        self.down_proj = nn.Linear(input_dim, adapter_dim)
        self.up_proj = nn.Linear(adapter_dim, input_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return the input plus its bottleneck transformation."""
        projected_down = self.down_proj(x)
        projected_up = self.up_proj(self.activation(projected_down))
        delta = self.dropout(projected_up)
        return x + delta

# Define MultiTaskAdapterFoundationModel class
class MultiTaskAdapterFoundationModel(LongformerFoundationModel):
    """Shared Longformer encoder with one linear classification head per task.

    Args:
        model_name: Checkpoint identifier forwarded to the base class.
        tasks: Mapping of task name to number of labels; ``None`` means no heads.
        adapter_dim: Accepted for interface compatibility. NOTE(review): no
            adapter layers are built from it in this class.
    """

    def __init__(self, model_name="allenai/longformer-base-4096", tasks=None, adapter_dim=64):
        super().__init__(model_name)
        self.tasks = tasks or {}
        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.model.config.hidden_size, num_labels) for task, num_labels in self.tasks.items()
        })

    def forward(self, input_ids, attention_mask, task, labels=None):
        """Classify a batch with the head registered for ``task``.

        Args:
            input_ids: Token ids, indexed as (batch, position).
            attention_mask: Mask with the same layout as ``input_ids``.
            task: Name of the classification head to apply.
            labels: Optional targets; when given, a cross-entropy loss is computed.

        Returns:
            ``(loss, logits)`` where ``loss`` is ``None`` if no labels were passed.

        Raises:
            KeyError: If ``task`` has no registered classifier.
        """
        if task not in self.classifiers:
            raise KeyError(f"Unknown task {task!r}; available tasks: {list(self.classifiers.keys())}")
        head = self.classifiers[task]

        # Only the first (CLS) token, which feeds the classifier, gets global
        # attention; marking every real token as global would turn the
        # encoder's windowed attention into full attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = head(hidden_states[:, 0, :])  # CLS token

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, head.out_features), labels.view(-1))
        return loss, logits

# Scheduler for learning rate
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Linear-warmup / linear-decay learning-rate schedule as a ``LambdaLR``.

    Args:
        optimizer: Optimizer to schedule.
        num_warmup_steps: Number of steps spent ramping the multiplier 0 -> 1.
        num_training_steps: Step count at which the multiplier hits 0.
    """
    def lr_lambda(current_step):
        still_warming = current_step < num_warmup_steps
        return (
            float(current_step) / float(max(1, num_warmup_steps))
            if still_warming
            else max(
                0.0,
                float(num_training_steps - current_step)
                / float(max(1, num_training_steps - num_warmup_steps)),
            )
        )

    scheduler = LambdaLR(optimizer, lr_lambda)
    return scheduler

# Train the multitask model with gradient accumulation
def train_with_scheduler(model, train_data, epochs=5, batch_size=8, accumulation_steps=4, learning_rate=5e-5, log_dir="./logs", num_warmup_steps=500, num_training_steps=10000):
    """Train a multi-task model with gradient accumulation and an LR schedule.

    Args:
        model: Model exposing ``tokenizer`` and callable as
            ``model(input_ids, attention_mask, task, labels=...)`` returning
            ``(loss, logits)``.
        train_data: Mapping of task name to a list of ``{"text", "label"}`` records.
        epochs: Number of passes over every task's data.
        batch_size: Batch size of each task's loader.
        accumulation_steps: Batches whose gradients are summed per optimizer step.
        learning_rate: Base learning rate for AdamW.
        log_dir: TensorBoard log directory.
        num_warmup_steps: Warmup steps passed to ``get_scheduler``.
        num_training_steps: Total steps passed to ``get_scheduler``.
    """
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)
    writer = SummaryWriter(log_dir=log_dir)

    # Datasets and loaders do not change between epochs, so build them once.
    loaders = {
        task: DataLoader(TextDataset(task_data, model.tokenizer, for_classification=True), batch_size=batch_size, shuffle=True)
        for task, task_data in train_data.items()
    }

    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for task, train_dataloader in loaders.items():
                # Start each task clean so gradients never leak between tasks.
                optimizer.zero_grad()
                last_idx = len(train_dataloader) - 1
                for batch_idx, batch in enumerate(train_dataloader):
                    input_ids, attention_mask, labels = batch
                    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                    loss, _ = model(input_ids, attention_mask, task, labels=labels)
                    (loss / accumulation_steps).backward()
                    # Also step on the final batch: otherwise a trailing
                    # partial window (or a loader shorter than
                    # accumulation_steps) is never applied to the weights.
                    if (batch_idx + 1) % accumulation_steps == 0 or batch_idx == last_idx:
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()
                    total_loss += loss.item()
                    num_batches += 1
            # Average over every batch of every task, not just the last loader.
            mean_loss = total_loss / max(1, num_batches)
            writer.add_scalar("Loss/train", mean_loss, epoch)
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss}")
    finally:
        writer.close()

# Evaluation function
def evaluate_with_metrics(model, test_data, task, batch_size=32):
    """Evaluate ``model`` on one task and report weighted precision/recall/F1.

    Args:
        model: Model callable as ``model(input_ids, attention_mask, task)``
            returning ``(loss, logits)``.
        test_data: Dataset yielding ``(input_ids, attention_mask, label)`` items.
        task: Name of the task head to evaluate.
        batch_size: Evaluation batch size.

    Returns:
        Tuple ``(precision, recall, f1)``. The model is left in eval mode.
    """
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            _, logits = model(input_ids, attention_mask, task)
            predictions = torch.argmax(logits, dim=-1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predictions.cpu().numpy())
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    # y_true must come first: the weighted average weights classes by their
    # support in the first argument, so swapping the two changes the score.
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return precision, recall, f1

# Synonym replacement for data augmentation
def synonym_replacement(text, n=2):
    """Return ``text`` with up to ``n`` distinct words replaced by a WordNet synonym.

    Words are visited in random order. Every occurrence of a chosen word is
    replaced, and the result is re-joined with single spaces.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of distinct words to replace.

    Returns:
        The augmented text; unchanged (apart from whitespace normalisation)
        when no word has a usable synonym or ``n`` is not positive.
    """
    words = text.split()
    new_words = words.copy()
    # De-duplicate so a repeated word cannot use up several replacement slots.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    num_replaced = 0
    for word in candidates:
        if num_replaced >= n:
            break
        lemma_names = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        # The first lemma is frequently the word itself, which would count as
        # a replacement without changing anything; take the first lemma that
        # differs. Underscores (used by WordNet in multi-word lemmas) become spaces.
        synonym = next(
            (name.replace("_", " ") for name in lemma_names if name.lower() != word.lower()),
            None,
        )
        if synonym is None:
            continue
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1

    return " ".join(new_words)

# Ensure the WordNet corpus queried by synonym_replacement (wordnet.synsets)
# is downloaded; this runs every time the cell/module is executed.
import nltk
nltk.download('wordnet')

# Example usage
if __name__ == "__main__":
    # Demo run: train on one toy example per task, then evaluate each task.
    # Set random seed for reproducibility
    random.seed(42)
    torch.manual_seed(42)

    # Example tasks and training data.
    # `tasks` maps task name -> an integer handed to the model constructor
    # (presumably the number of classes per task — confirm against the model).
    tasks = {"classification": 3, "sentiment": 2}
    # `train_data` maps task name -> list of {"text", "label"} records.
    train_data = {
        "classification": [{"text": "Sample input for classification", "label": 0}],
        "sentiment": [{"text": "Sample input for sentiment analysis", "label": 1}]
    }

    # Initialize model and move it to the configured device
    model = MultiTaskAdapterFoundationModel(model_name="allenai/longformer-base-4096", tasks=tasks)
    model = model.to(device)

    # Train model (gradient accumulation over 4 batches requested)
    train_with_scheduler(model, train_data, epochs=3, batch_size=8, accumulation_steps=4, learning_rate=5e-5, num_warmup_steps=100, num_training_steps=1000)

    # Test data: one labelled dataset per task, built with the model's tokenizer
    test_data = {
        "classification": TextDataset([{"text": "Test input", "label": 0}], model.tokenizer, for_classification=True),
        "sentiment": TextDataset([{"text": "Another test input", "label": 1}], model.tokenizer, for_classification=True)
    }

    # Evaluate model on each task in turn
    for task, dataset in test_data.items():
        print(f"Evaluating task: {task}")
        evaluate_with_metrics(model, dataset, task)

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import wordnet
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.utils.prune as prune
import optuna

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a custom dataset
class TextDataset(Dataset):
    """Dataset that tokenizes ``{"text": ..., "label": ...}`` records on access.

    Args:
        data: Indexable collection of dicts with a ``"text"`` key (and a
            ``"label"`` key when ``for_classification`` is true).
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every example is padded/truncated to.
        for_classification: When true, items also carry the record's label.
    """

    def __init__(self, data, tokenizer, max_length=4096, for_classification=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.for_classification = for_classification

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask[, label])`` for record ``idx``."""
        record = self.data[idx]
        encoding = self.tokenizer(record["text"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.for_classification:
            return input_ids, attention_mask, record["label"]
        return input_ids, attention_mask

# Define LongformerFoundationModel class
class LongformerFoundationModel(nn.Module):
    """Wrapper bundling a pretrained Longformer encoder with its tokenizer.

    Attributes:
        model: ``LongformerModel`` loaded from ``model_name``.
        tokenizer: ``LongformerTokenizer`` loaded from the same checkpoint.
    """

    def __init__(self, model_name="allenai/longformer-base-4096"):
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and return its raw output object.

        Args:
            input_ids: Token ids; indexed as ``(batch, seq)`` here.
            attention_mask: Padding mask with the same shape as ``input_ids``.
            labels: Unused; kept so the signature stays call-compatible.
        """
        # Use sliding window attention for long sequences: only the first
        # token gets global attention. Marking every non-padding token as
        # global (the previous behaviour) turns local attention into full
        # quadratic attention over the whole sequence.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        return outputs

# Define Adapter class
class Adapter(nn.Module):
    """Residual bottleneck block: down-project, ReLU, up-project, dropout, add."""

    def __init__(self, input_dim, adapter_dim=64):
        super().__init__()
        # Layers are created down -> up so parameter initialisation order
        # (and therefore seeded weights) is unchanged.
        self.down_proj = nn.Linear(input_dim, adapter_dim)
        self.up_proj = nn.Linear(adapter_dim, input_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return ``x`` plus the dropout-regularised bottleneck update."""
        bottleneck = self.activation(self.down_proj(x))
        update = self.dropout(self.up_proj(bottleneck))
        return x + update

# Define MultiTaskAdapterFoundationModel class
class MultiTaskAdapterFoundationModel(LongformerFoundationModel):
    """Shared Longformer encoder with one linear classification head per task.

    Args:
        model_name: Pretrained checkpoint passed to the base class.
        tasks: Mapping of task name -> number of labels for that task.
        adapter_dim: Accepted for API compatibility; this class does not
            build any adapter layers, so the value currently has no effect.
    """

    def __init__(self, model_name="allenai/longformer-base-4096", tasks=None, adapter_dim=64):
        super().__init__(model_name)
        self.tasks = tasks or {}
        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.model.config.hidden_size, num_labels) for task, num_labels in self.tasks.items()
        })

    def forward(self, input_ids, attention_mask, task, labels=None):
        """Classify a batch with the head registered for ``task``.

        Returns:
            ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is not
            given, otherwise the cross-entropy loss.

        Raises:
            KeyError: If ``task`` has no classifier head.
        """
        # Global attention on the first (CLS) token only; every other token
        # keeps local sliding-window attention. Flagging all non-padding
        # tokens as global would make attention quadratic in sequence length.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        hidden_states = outputs.last_hidden_state
        head = self.classifiers[task]
        logits = head(hidden_states[:, 0, :])  # CLS token
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, head.out_features), labels.view(-1))
        return loss, logits

# Scheduler for learning rate
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Build a LambdaLR with linear warmup followed by linear decay to zero.

    The multiplier rises from 0 to 1 over ``num_warmup_steps`` scheduler
    steps, then falls linearly and is clamped at 0 from ``num_training_steps``.
    """
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            steps_left = float(num_training_steps - current_step)
            return max(0.0, steps_left / decay_span)
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda)

# Train the multitask model with gradient accumulation
def train_with_scheduler(model, train_data, epochs=5, batch_size=16, accumulation_steps=2, learning_rate=5e-5, log_dir="./logs", num_warmup_steps=500, num_training_steps=10000):
    """Train the multitask model with gradient accumulation and LR scheduling.

    Tasks are visited sequentially within each epoch. The optimizer and the
    scheduler advance once per ``accumulation_steps`` batches, and also at the
    end of each task's data so leftover gradients are applied, not dropped.

    Args:
        model: Model exposing ``tokenizer`` and callable as
            ``model(input_ids, attention_mask, task, labels=...)``.
        train_data: Mapping of task name -> list of ``{"text", "label"}``.
        epochs: Number of passes over all tasks.
        batch_size: Batch size per forward pass.
        accumulation_steps: Batches accumulated per optimizer step.
        learning_rate: Peak learning rate.
        log_dir: TensorBoard log directory.
        num_warmup_steps: Optimizer steps of linear warmup.
        num_training_steps: Optimizer step at which the LR reaches zero.
    """
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)
    writer = SummaryWriter(log_dir=log_dir)
    tokenizer = model.tokenizer
    # Datasets tokenize lazily, so the loaders can be built once and reused;
    # shuffle=True still reshuffles on every new iteration.
    dataloaders = {
        task: DataLoader(TextDataset(task_data, tokenizer, for_classification=True), batch_size=batch_size, shuffle=True)
        for task, task_data in train_data.items()
    }
    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            optimizer.zero_grad()
            for task, train_dataloader in dataloaders.items():
                last_idx = len(train_dataloader) - 1
                for batch_idx, batch in enumerate(train_dataloader):
                    input_ids, attention_mask, labels = batch
                    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                    loss, _ = model(input_ids, attention_mask, task, labels=labels)
                    # Scale so the accumulated gradient is a mean over batches.
                    (loss / accumulation_steps).backward()
                    # Step on a full accumulation window, and flush on the
                    # task's final batch so no gradient is thrown away.
                    if (batch_idx + 1) % accumulation_steps == 0 or batch_idx == last_idx:
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()
                    total_loss += loss.item()
                    num_batches += 1
            # Average over every batch seen this epoch, across all tasks.
            mean_loss = total_loss / max(1, num_batches)
            writer.add_scalar("Loss/train", mean_loss, epoch)
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss}")
    finally:
        writer.close()

# Evaluation function
def evaluate_with_metrics(model, test_data, task, batch_size=32):
    """Evaluate ``model`` on one task and report weighted metrics.

    Args:
        model: Callable as ``model(input_ids, attention_mask, task)`` and
            returning ``(loss, logits)``.
        test_data: Dataset yielding ``(input_ids, attention_mask, label)``.
        task: Name of the task head to evaluate.
        batch_size: Evaluation batch size.

    Returns:
        ``(precision, recall, f1)``, each computed with ``average='weighted'``.
    """
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            _, logits = model(input_ids.to(device), attention_mask.to(device), task)
            all_labels.extend(labels.cpu().tolist())
            all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    # Ground truth first: weighted averaging uses the support of y_true, so
    # swapping the arguments silently changes the score.
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return precision, recall, f1

# Synonym replacement for data augmentation
def synonym_replacement(text, n=2):
    """Augment ``text`` by swapping up to ``n`` distinct words for synonyms.

    Candidate words are shuffled, then each is looked up in WordNet. Only a
    lemma that differs from the word counts as a replacement, and it is
    applied to every occurrence of that word.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of distinct words to replace.

    Returns:
        The augmented text, tokens joined by single spaces.
    """
    words = text.split()
    new_words = words.copy()
    # Unique words only: a duplicate must not be counted as a second swap.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    num_replaced = 0
    for word in candidates:
        if num_replaced >= n:
            break
        synonym = None
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                # Skip lemmas equal to the word itself (a no-op replacement).
                if lemma.name().lower() != word.lower():
                    # WordNet joins multi-word lemmas with underscores.
                    synonym = lemma.name().replace("_", " ")
                    break
            if synonym is not None:
                break
        if synonym is None:
            continue
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1

    return " ".join(new_words)

# Ensure the WordNet corpus used by synonym_replacement is downloaded.
# NOTE: this executes on every run of the cell, not only under __main__.
import nltk
nltk.download('wordnet')

# Example usage
if __name__ == "__main__":
    # Demo run: train a two-task model on toy data, then evaluate each task.
    # Set random seed for reproducibility
    random.seed(42)
    torch.manual_seed(42)

    # Example tasks and training data.
    # `tasks` maps task name -> number of labels for that task's classifier.
    tasks = {"classification": 3, "sentiment": 2}
    # `train_data` maps task name -> list of {"text", "label"} records.
    train_data = {
        "classification": [{"text": "Sample input for classification", "label": 0}],
        "sentiment": [{"text": "Sample input for sentiment analysis", "label": 1}]
    }

    # Initialize model and move it to the configured device
    model = MultiTaskAdapterFoundationModel(model_name="allenai/longformer-base-4096", tasks=tasks)
    model = model.to(device)

    # Train model (accumulation_steps is left at the function's default)
    train_with_scheduler(model, train_data, epochs=3, batch_size=16, learning_rate=5e-5, num_warmup_steps=100, num_training_steps=1000)

    # Test data: one labelled dataset per task, tokenized with the model's tokenizer
    test_data = {
        "classification": TextDataset([{"text": "Test input", "label": 0}], model.tokenizer, for_classification=True),
        "sentiment": TextDataset([{"text": "Another test input", "label": 1}], model.tokenizer, for_classification=True)
    }

    # Evaluate model on each task in turn
    for task, dataset in test_data.items():
        print(f"Evaluating task: {task}")
        evaluate_with_metrics(model, dataset, task)

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import wordnet
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.utils.prune as prune
import optuna

# Device configuration: prefer CUDA when torch reports it available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes text records lazily, one per access.

    Args:
        data: Indexable collection of dicts with a ``"text"`` key (plus
            ``"label"`` when ``for_classification`` is true).
        tokenizer: Callable invoked with the text and the keyword arguments
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``;
            must return ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Fixed length each example is padded/truncated to.
        for_classification: Whether items include the record's label.
    """

    def __init__(self, data, tokenizer, max_length=4096, for_classification=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.for_classification = for_classification

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` and, if enabled, the label."""
        item = self.data[idx]
        encoding = self.tokenizer(item["text"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # squeeze(0) removes just the batch axis added by return_tensors='pt';
        # an unqualified squeeze() would also drop a length-1 sequence axis.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if not self.for_classification:
            return input_ids, attention_mask
        return input_ids, attention_mask, item["label"]

# Define LongformerFoundationModel class
class LongformerFoundationModel(nn.Module):
    """Pretrained Longformer encoder packaged together with its tokenizer.

    Attributes:
        model: ``LongformerModel`` loaded from ``model_name``.
        tokenizer: ``LongformerTokenizer`` loaded from the same checkpoint.
    """

    def __init__(self, model_name="allenai/longformer-base-4096"):
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode a batch and return the encoder's output object unchanged.

        Args:
            input_ids: Token ids; indexed as ``(batch, seq)`` here.
            attention_mask: Padding mask shaped like ``input_ids``.
            labels: Ignored; present only for signature compatibility.
        """
        # Use sliding window attention for long sequences. Global attention
        # is granted to the first token alone; the earlier mask
        # (attention_mask == 1) made every real token global, which disables
        # the windowed attention this model exists for.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        return self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

# Define Adapter class
class Adapter(nn.Module):
    """Bottleneck adapter whose output is added back onto its input."""

    def __init__(self, input_dim, adapter_dim=64):
        super().__init__()
        # Keep the down -> up construction order: it fixes both the
        # state_dict key order and the seeded initial weights.
        self.down_proj = nn.Linear(input_dim, adapter_dim)
        self.up_proj = nn.Linear(adapter_dim, input_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply down-projection, ReLU, up-projection and dropout, then add ``x``."""
        hidden = self.down_proj(x)
        hidden = self.activation(hidden)
        hidden = self.up_proj(hidden)
        return x + self.dropout(hidden)

# Define MultiTaskAdapterFoundationModel class
class MultiTaskAdapterFoundationModel(LongformerFoundationModel):
    """Longformer encoder shared across tasks, with a linear head per task.

    Args:
        model_name: Pretrained checkpoint passed to the base class.
        tasks: Mapping of task name -> number of labels for that task.
        adapter_dim: Kept for API compatibility; no adapter modules are
            created here, so it does not influence the model.
    """

    def __init__(self, model_name="allenai/longformer-base-4096", tasks=None, adapter_dim=64):
        super().__init__(model_name)
        self.tasks = tasks or {}
        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.model.config.hidden_size, num_labels) for task, num_labels in self.tasks.items()
        })

    def forward(self, input_ids, attention_mask, task, labels=None):
        """Run the encoder and the ``task`` head on the first-token state.

        Returns:
            ``(loss, logits)`` where ``loss`` is the cross-entropy against
            ``labels``, or ``None`` when no labels are supplied.

        Raises:
            KeyError: If no head was registered for ``task``.
        """
        # Only the first (CLS) token attends globally. Using
        # (attention_mask == 1) here made all real tokens global and turned
        # the sliding-window attention into full attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        hidden_states = outputs.last_hidden_state
        classifier = self.classifiers[task]
        logits = classifier(hidden_states[:, 0, :])  # CLS token
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, classifier.out_features), labels.view(-1))
        return loss, logits

# Scheduler for learning rate
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Return a LambdaLR implementing linear warmup, then linear decay.

    The LR multiplier is ``step / warmup`` during warmup and
    ``(total - step) / (total - warmup)`` afterwards, never below 0.
    """
    def lr_lambda(current_step):
        in_warmup = current_step < num_warmup_steps
        if in_warmup:
            numerator = float(current_step)
            denominator = float(max(1, num_warmup_steps))
            return numerator / denominator
        numerator = float(num_training_steps - current_step)
        denominator = float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, numerator / denominator)

    return LambdaLR(optimizer, lr_lambda)

# Train the multitask model
def train_with_scheduler(model, train_data, epochs=5, batch_size=32, learning_rate=5e-5, log_dir="./logs", num_warmup_steps=500, num_training_steps=10000):
    """Train the multitask model, stepping the LR schedule once per batch.

    Args:
        model: Model exposing ``tokenizer`` and callable as
            ``model(input_ids, attention_mask, task, labels=...)``.
        train_data: Mapping of task name -> list of ``{"text", "label"}``.
        epochs: Number of passes over all tasks.
        batch_size: Batch size.
        learning_rate: Peak learning rate.
        log_dir: TensorBoard log directory.
        num_warmup_steps: Optimizer steps of linear warmup.
        num_training_steps: Optimizer step at which the LR reaches zero.
    """
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)
    writer = SummaryWriter(log_dir=log_dir)
    tokenizer = model.tokenizer
    # Tokenization is lazy, so loaders are built once; shuffle=True still
    # produces a new order each time a loader is iterated.
    dataloaders = {
        task: DataLoader(TextDataset(task_data, tokenizer, for_classification=True), batch_size=batch_size, shuffle=True)
        for task, task_data in train_data.items()
    }
    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for task, train_dataloader in dataloaders.items():
                for batch in train_dataloader:
                    optimizer.zero_grad()
                    input_ids, attention_mask, labels = batch
                    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                    loss, _ = model(input_ids, attention_mask, task, labels=labels)
                    loss.backward()
                    optimizer.step()
                    # The schedule is defined in optimizer steps. Stepping it
                    # per epoch left the LR at the warmup start value (0) for
                    # the entire first epoch.
                    scheduler.step()
                    total_loss += loss.item()
                    num_batches += 1
            # Mean over all batches of all tasks, not just the last loader.
            mean_loss = total_loss / max(1, num_batches)
            writer.add_scalar("Loss/train", mean_loss, epoch)
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss}")
    finally:
        writer.close()

# Evaluation function
def evaluate_with_metrics(model, test_data, task, batch_size=32):
    """Score ``model`` on ``test_data`` for one task.

    Args:
        model: Callable as ``model(input_ids, attention_mask, task)`` and
            returning ``(loss, logits)``.
        test_data: Dataset yielding ``(input_ids, attention_mask, label)``.
        task: Task name forwarded to the model.
        batch_size: Evaluation batch size.

    Returns:
        Weighted ``(precision, recall, f1)``.
    """
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            _, logits = model(input_ids, attention_mask, task)
            predictions = torch.argmax(logits, dim=-1)
            all_labels.extend(labels.cpu().tolist())
            all_preds.extend(predictions.cpu().tolist())
    # All three metrics take (y_true, y_pred). F1 previously received them
    # reversed, which weights classes by predicted rather than true support.
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return precision, recall, f1

# Main function
if __name__ == "__main__":
    # Demo run: train a two-task model on toy data, then evaluate each task.
    # Set random seed for reproducibility
    random.seed(42)
    torch.manual_seed(42)

    # Example tasks and training data.
    # `tasks` maps task name -> number of labels for that task's classifier.
    tasks = {"classification": 3, "sentiment": 2}
    # `train_data` maps task name -> list of {"text", "label"} records.
    train_data = {
        "classification": [{"text": "Sample input for classification", "label": 0}],
        "sentiment": [{"text": "Sample input for sentiment analysis", "label": 1}]
    }

    # Initialize model and move it to the configured device
    model = MultiTaskAdapterFoundationModel(model_name="allenai/longformer-base-4096", tasks=tasks)
    model = model.to(device)

    # Train model
    train_with_scheduler(model, train_data, epochs=3, batch_size=16, learning_rate=5e-5, num_warmup_steps=100, num_training_steps=1000)

    # Test data: one labelled dataset per task, tokenized with the model's tokenizer
    test_data = {
        "classification": TextDataset([{"text": "Test input", "label": 0}], model.tokenizer, for_classification=True),
        "sentiment": TextDataset([{"text": "Another test input", "label": 1}], model.tokenizer, for_classification=True)
    }

    # Evaluate model on each task in turn
    for task, dataset in test_data.items():
        print(f"Evaluating task: {task}")
        evaluate_with_metrics(model, dataset, task)

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel, LongformerTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader, DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import wordnet
from torch.optim.lr_scheduler import LambdaLR
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
import torch.nn.utils.prune as prune
import optuna

# Device configuration: this cell is pinned to the CPU (unlike the earlier
# cells, which auto-detect CUDA).
device = torch.device("cpu")  # Change to "cuda" if a GPU is available

# Define a custom dataset
class TextDataset(Dataset):
    """Dataset of text records, tokenized to fixed length when indexed.

    Args:
        data: Indexable collection of dicts with a ``"text"`` key (and
            ``"label"`` when ``for_classification`` is true).
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')``; must
            return ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every example is padded/truncated to.
        for_classification: When true, each item also includes its label.
    """

    def __init__(self, data, tokenizer, max_length=4096, for_classification=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.for_classification = for_classification

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors (and label)."""
        sample = self.data[idx]
        encoding = self.tokenizer(sample["text"], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # Remove the batch axis explicitly; squeeze() with no argument would
        # also remove the sequence axis if max_length were 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.for_classification:
            return input_ids, attention_mask, sample["label"]
        return input_ids, attention_mask

# Define LongformerFoundationModel class
class LongformerFoundationModel(nn.Module):
    """Base module holding a pretrained Longformer encoder and its tokenizer.

    Attributes:
        model: ``LongformerModel`` loaded from ``model_name``.
        tokenizer: ``LongformerTokenizer`` loaded from the same checkpoint.
    """

    def __init__(self, model_name="allenai/longformer-base-4096"):
        super().__init__()
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the encoder output for a batch.

        Args:
            input_ids: Token ids; indexed as ``(batch, seq)`` here.
            attention_mask: Padding mask shaped like ``input_ids``.
            labels: Not used; accepted for signature compatibility.
        """
        # Use sliding window attention for long sequences: give global
        # attention to the first token only. Passing (attention_mask == 1)
        # marked every non-padding token global, i.e. full attention.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        return outputs

# Define Adapter class
class Adapter(nn.Module):
    """Small residual MLP: ``x + dropout(up(relu(down(x))))``."""

    def __init__(self, input_dim, adapter_dim=64):
        super().__init__()
        # Construction order (down, then up) is preserved so seeded
        # initialisation and state_dict key order stay the same.
        self.down_proj = nn.Linear(input_dim, adapter_dim)
        self.up_proj = nn.Linear(adapter_dim, input_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Compute the bottleneck update and add it to the input."""
        projected = self.up_proj(self.activation(self.down_proj(x)))
        residual = self.dropout(projected)
        return x + residual

# Define MultiTaskAdapterFoundationModel class
class MultiTaskAdapterFoundationModel(LongformerFoundationModel):
    """Multitask classifier: one shared Longformer, one linear head per task.

    Args:
        model_name: Pretrained checkpoint passed to the base class.
        tasks: Mapping of task name -> number of labels for that task.
        adapter_dim: Accepted for API compatibility only; this class builds
            no adapter layers, so tuning the value has no effect.
    """

    def __init__(self, model_name="allenai/longformer-base-4096", tasks=None, adapter_dim=64):
        super().__init__(model_name)
        self.tasks = tasks or {}
        self.classifiers = nn.ModuleDict({
            task: nn.Linear(self.model.config.hidden_size, num_labels) for task, num_labels in self.tasks.items()
        })

    def forward(self, input_ids, attention_mask, task, labels=None):
        """Return ``(loss, logits)`` for ``task``.

        ``loss`` is the cross-entropy against ``labels`` and is ``None`` when
        ``labels`` is omitted. Only the selected task's head takes part in
        the forward pass.

        Raises:
            KeyError: If ``task`` has no classifier head.
        """
        # Global attention on the first (CLS) token only, leaving all other
        # tokens on local sliding-window attention. The former mask
        # (attention_mask == 1) requested global attention everywhere.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1
        outputs = self.model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
        hidden_states = outputs.last_hidden_state
        task_head = self.classifiers[task]
        logits = task_head(hidden_states[:, 0, :])  # CLS token
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, task_head.out_features), labels.view(-1))
        return loss, logits

# Initialize DDP
def init_ddp(rank, world_size, model):
    """Initialise the default process group (gloo) and wrap ``model`` in DDP.

    Args:
        rank: Rank of the calling process.
        world_size: Total number of processes.
        model: Module to wrap.

    Returns:
        The ``DistributedDataParallel`` wrapper around ``model``.
    """
    # Respect a rendezvous address/port already exported by a launcher and
    # fall back to the local defaults only when nothing is set.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    if not dist.is_initialized():
        dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # Each forward pass uses a single task head, so the other heads receive
    # no gradient; DDP must be told to tolerate unused parameters.
    ddp_model = DDP(model, find_unused_parameters=True)
    return ddp_model

# Scheduler for learning rate
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    """Create a LambdaLR: linear ramp-up over warmup, then linear ramp-down.

    The multiplier reaches 1 at ``num_warmup_steps`` and is clamped to 0 from
    ``num_training_steps`` onwards.
    """
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            scale = float(current_step) / float(max(1, num_warmup_steps))
        else:
            decay_steps = float(max(1, num_training_steps - num_warmup_steps))
            scale = max(0.0, float(num_training_steps - current_step) / decay_steps)
        return scale

    return LambdaLR(optimizer, lr_lambda)

# Train the multitask model
def train_with_scheduler(model, train_data, epochs=5, batch_size=32, learning_rate=5e-5, log_dir="./logs", num_warmup_steps=500, num_training_steps=10000, rank=None):
    """Train the multitask model with optional DDP sharding and CUDA AMP.

    Args:
        model: Plain or DDP-wrapped model, callable as
            ``model(input_ids, attention_mask, task, labels=...)``.
        train_data: Mapping of task name -> list of ``{"text", "label"}``.
        epochs: Number of passes over all tasks.
        batch_size: Batch size.
        learning_rate: Peak learning rate.
        log_dir: TensorBoard log directory (rank 0 only).
        num_warmup_steps: Optimizer steps of linear warmup.
        num_training_steps: Optimizer step at which the LR reaches zero.
        rank: Process rank. Only ``rank == 0`` logs, prints and writes
            checkpoints; ``None`` trains silently.
    """
    is_main = rank == 0
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)
    writer = SummaryWriter(log_dir=log_dir) if is_main else None
    # Mixed precision only makes sense on CUDA; on CPU both helpers are
    # disabled explicitly instead of warning and disabling themselves.
    use_amp = device.type == "cuda"
    scaler = GradScaler(enabled=use_amp)
    checkpoint_dir = "./checkpoints"
    if is_main:
        os.makedirs(checkpoint_dir, exist_ok=True)
    # Shard data only when a process group exists. Building a
    # DistributedSampler with rank=None outside one raises.
    distributed = dist.is_available() and dist.is_initialized()
    # DDP hides the wrapped model's attributes behind ``.module``.
    tokenizer = model.module.tokenizer if hasattr(model, "module") else model.tokenizer
    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for task, task_data in train_data.items():
                train_dataset = TextDataset(task_data, tokenizer, for_classification=True)
                if distributed:
                    sampler = DistributedSampler(
                        train_dataset,
                        num_replicas=dist.get_world_size(),
                        rank=dist.get_rank() if rank is None else rank,
                    )
                    # Without this every epoch reuses the same shuffle order.
                    sampler.set_epoch(epoch)
                    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, sampler=sampler)
                else:
                    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                for batch_idx, batch in enumerate(train_dataloader):
                    optimizer.zero_grad()
                    input_ids, attention_mask, labels = batch
                    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                    with autocast(enabled=use_amp):
                        loss, logits = model(input_ids, attention_mask, task, labels=labels)
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                    # The scheduler was created but never stepped, which
                    # pinned the LR at the warmup start value of 0.
                    scheduler.step()
                    total_loss += loss.item()
                    num_batches += 1
                    if is_main:
                        writer.add_scalar(f"Loss/train_{task}", loss.item(), epoch * len(train_dataloader) + batch_idx)
            if is_main:
                # The loss is averaged over every batch of every task, so no
                # single task name is reported with it.
                mean_loss = total_loss / max(1, num_batches)
                print(f"Epoch [{epoch + 1}/{epochs}], Loss: {mean_loss}")
                torch.save(model.state_dict(), os.path.join(checkpoint_dir, f"model_epoch_{epoch+1}.pt"))
    finally:
        if writer is not None:
            writer.close()

# Evaluation function
def evaluate_with_metrics(model, test_data, task, batch_size=32):
    """Evaluate one task and return weighted precision, recall and F1.

    Args:
        model: Callable as ``model(input_ids, attention_mask, task)`` and
            returning ``(loss, logits)``.
        test_data: Dataset yielding ``(input_ids, attention_mask, label)``.
        task: Task name forwarded to the model.
        batch_size: Evaluation batch size.

    Returns:
        ``(precision, recall, f1)`` computed with ``average='weighted'``.
    """
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    model.eval()
    # CUDA autocast is only enabled when actually running on a CUDA device.
    use_amp = device.type == "cuda"
    all_labels, all_preds = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            with autocast(enabled=use_amp):
                _, logits = model(input_ids, attention_mask, task)
            predictions = torch.argmax(logits, dim=-1)
            all_labels.extend(labels.cpu().tolist())
            all_preds.extend(predictions.cpu().tolist())
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    # (y_true, y_pred) order matters for the weighted average, whose class
    # weights come from y_true.
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return precision, recall, f1

# Synonym replacement
def synonym_replacement(text, n=2):
    """Return ``text`` with up to ``n`` distinct words replaced by synonyms.

    Words are tried in random order. A replacement is counted only when
    WordNet yields a lemma different from the word; all occurrences of the
    word are replaced.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of distinct words to replace.

    Returns:
        The augmented text, tokens joined by single spaces.
    """
    words = text.split()
    new_words = words.copy()
    # Try each distinct word once so duplicates cannot exhaust the budget.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)
    num_replaced = 0
    for word in candidates:
        if num_replaced >= n:
            break
        lowered = word.lower()
        # Ignore lemmas identical to the word; they would be no-op swaps.
        alternatives = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
            if lemma.name().lower() != lowered
        )
        choice = next(alternatives, None)
        if choice is None:
            continue
        synonym = choice.replace("_", " ")  # multi-word lemmas use "_"
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1
    return " ".join(new_words)

# Download the WordNet corpus that synonym_replacement queries via
# wordnet.synsets; executed on every run of this cell.
import nltk
nltk.download('wordnet')

# Objective function for Optuna
def objective(trial):
    """Optuna objective: train briefly with sampled settings, return F1.

    Samples ``learning_rate`` (log scale), ``batch_size`` and ``adapter_dim``,
    trains a single-task model for three epochs on placeholder data and
    evaluates it on placeholder test data.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Weighted F1 score, to be maximised by the study.
    """
    # suggest_float(log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64])
    # NOTE(review): adapter_dim is passed on, but the model class in this
    # cell stores no adapter layers, so it may not affect results — confirm.
    adapter_dim = trial.suggest_int('adapter_dim', 16, 128, step=16)

    model = MultiTaskAdapterFoundationModel(model_name="allenai/longformer-base-4096",
                                            tasks={"task1": 2}, adapter_dim=adapter_dim)
    model = model.to(device)

    train_data = {"task1": [{"text": "Example sentence", "label": 1}]}  # Replace with actual training data
    # rank=0 marks this as the single, main process; with rank left as None
    # the data sampler has no rank and no process group to ask for one.
    train_with_scheduler(model, train_data, epochs=3, batch_size=batch_size,
                         learning_rate=learning_rate, num_warmup_steps=100, num_training_steps=500,
                         rank=0)

    test_data = TextDataset([{"text": "Example sentence", "label": 1}],
                            model.tokenizer, for_classification=True)  # Replace with actual test data
    precision, recall, f1 = evaluate_with_metrics(model, test_data, task="task1", batch_size=batch_size)

    return f1  # Maximize F1 score

# Run Optuna study
def run_optuna():
    """Run a 10-trial Optuna search that maximises ``objective`` and print the best parameters."""
    search = optuna.create_study(direction="maximize")
    search.optimize(objective, n_trials=10)
    best_params = search.best_trial.params
    print(f"Best trial: {best_params}")

# Apply pruning
def apply_pruning(model, amount=0.3):
    """Permanently L1-prune the weights of every ``nn.Linear`` in ``model``.

    Args:
        model: Module whose linear layers are pruned in place.
        amount: Forwarded to ``prune.l1_unstructured`` as the amount to prune.
    """
    linear_layers = [layer for layer in model.modules() if isinstance(layer, nn.Linear)]
    for layer in linear_layers:
        prune.l1_unstructured(layer, name="weight", amount=amount)
        # Fold the mask into the weight tensor and drop the reparameterization.
        prune.remove(layer, "weight")
    print("Pruning applied successfully!")

# Check sparsity
def check_sparsity(model):
    """Print and return the percentage of zero entries among weight tensors.

    Every parameter whose name contains ``"weight"`` is counted.

    Args:
        model: Module to inspect.

    Returns:
        Sparsity in percent; ``0.0`` when the model has no weight parameters.
    """
    total_params = 0
    zero_params = 0
    for name, param in model.named_parameters():
        if "weight" in name:
            total_params += param.numel()
            zero_params += (param == 0).sum().item()
    # Guard against models without weight parameters (division by zero).
    sparsity = 100.0 * zero_params / total_params if total_params else 0.0
    print(f"Model sparsity: {sparsity:.2f}%")
    return sparsity

if __name__ == "__main__":
    # End-to-end demo: train, prune, evaluate, then run the Optuna search.
    # Set random seed for reproducibility
    random.seed(42)
    torch.manual_seed(42)

    # Example tasks and training data.
    # `tasks` maps task name -> number of labels for that task's classifier.
    tasks = {"classification": 3, "sentiment": 2}
    train_data = {
        "classification": [{"text": "Sample input for classification", "label": 0}],
        "sentiment": [{"text": "Sample input for sentiment analysis", "label": 1}]
    }

    # Initialize model
    model = MultiTaskAdapterFoundationModel(model_name="allenai/longformer-base-4096", tasks=tasks)
    model = model.to(device)

    # Train model. rank=0 identifies this single process as the main one:
    # it enables logging and checkpointing and gives the data sampler an
    # explicit rank, which it cannot look up without a process group.
    train_with_scheduler(model, train_data, epochs=3, batch_size=16, learning_rate=5e-5,
                         num_warmup_steps=100, num_training_steps=1000, rank=0)

    # Apply pruning and report the resulting sparsity
    apply_pruning(model, amount=0.3)
    check_sparsity(model)

    # Evaluate the pruned model on each task
    test_data = {
        "classification": TextDataset([{"text": "Test input", "label": 0}], model.tokenizer, for_classification=True),
        "sentiment": TextDataset([{"text": "Another test input", "label": 1}], model.tokenizer, for_classification=True)
    }

    for task, dataset in test_data.items():
        print(f"Evaluating task: {task}")
        evaluate_with_metrics(model, dataset, task)

    # Run Optuna
    run_optuna()