In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Restrict CUDA to device index 0 via the CUDA_VISIBLE_DEVICES environment variable.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


In [None]:

!pip install peft==0.8.2
!pip install accelerate==0.28.0
!pip install transformers==4.35.2




In [None]:
import torch
#from transformers import BloomForSequenceClassification, BloomTokenizerFast, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import BitsAndBytesConfig
from datasets import load_dataset
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score
from transformers import DataCollatorWithPadding
from copy import deepcopy

In [None]:
# Choose the compute device: CUDA device 0 when CUDA is usable, otherwise the CPU.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("CUDA is not available. Using CPU.")
else:
    torch.cuda.set_device(0)  # make device 0 the current CUDA device
    device = torch.device('cuda')
    print(f"Using CUDA device {torch.cuda.current_device()}: {torch.cuda.get_device_name(0)}")

Using CUDA device 0: Tesla T4


In [None]:
def preprocess_data(example):
    """Tokenize one SST-2 example for sequence classification.

    Args:
        example: Mapping with a ``'sentence'`` text field and a ``'label'`` field.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    """
    # Truncate to at most 512 tokens but do NOT pad here: padding every sentence to
    # 512 tokens makes each batch far longer than its content. Padding is left to the
    # batch collator (DataCollatorWithPadding), which pads to the longest item per batch.
    encoded = tokenizer(example['sentence'], truncation=True, max_length=512)
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': example['label']  # SST2 labels are already binary (0 or 1)
    }


In [None]:
def partition_data(dataset, num_clients, alpha=0.3, seed=0):
    """Split ``dataset`` into disjoint, label-skewed client shards.

    For every label, the examples carrying that label are shuffled and divided
    among the clients according to proportions drawn from a symmetric
    Dirichlet(alpha) distribution over clients. Each example is assigned to
    exactly one client, so the shards form a true partition of the dataset.
    Smaller ``alpha`` gives more skewed shards; a client may receive no examples.

    Args:
        dataset: Iterable of examples exposing ``example['labels']`` and a
            ``select(indices)`` method.
        num_clients: Number of shards to create (must be >= 1).
        alpha: Dirichlet concentration parameter.
        seed: Seed for NumPy's global random state.

    Returns:
        List of ``num_clients`` objects, each produced by ``dataset.select``.

    Raises:
        ValueError: If ``num_clients`` is smaller than 1.
    """
    if num_clients < 1:
        raise ValueError(f"num_clients must be >= 1, got {num_clients}")

    # Seeds the global NumPy RNG (as before), so later np.random calls stay reproducible.
    np.random.seed(seed)

    labels = np.asarray([example['labels'] for example in dataset])
    client_indices = [[] for _ in range(num_clients)]

    for label in np.unique(labels):
        label_positions = np.flatnonzero(labels == label)
        np.random.shuffle(label_positions)

        # One Dirichlet draw over clients per label: the proportions sum to 1, so the
        # cut points below hand out every example of this label exactly once.
        proportions = np.random.dirichlet([alpha] * num_clients)
        cut_points = (np.cumsum(proportions)[:-1] * len(label_positions)).astype(int)

        for client_idx, shard in enumerate(np.split(label_positions, cut_points)):
            client_indices[client_idx].extend(shard.tolist())

    return [dataset.select(indices) for indices in client_indices]


In [None]:
def federated_average(global_model, client_models, client_sizes):
    """FedAvg: overwrite the global model's LoRA tensors with a weighted client mean.

    Only state-dict entries whose key contains ``'lora'`` are averaged; every other
    entry of the global model is left as it is. Each client is weighted by
    ``client_sizes[i] / sum(client_sizes)``.

    Args:
        global_model: Module that receives the averaged tensors (updated in place).
        client_models: Modules whose state dicts contain the same LoRA keys.
        client_sizes: Per-client weights (e.g. local sample counts), same length
            as ``client_models``.

    Returns:
        ``global_model``, after loading the averaged state.

    Raises:
        ValueError: If there are no clients, the lengths differ, or the sizes do
            not sum to a positive number.
    """
    if len(client_models) != len(client_sizes):
        raise ValueError("client_models and client_sizes must have the same length")
    total_size = sum(client_sizes)
    if not client_models or total_size <= 0:
        raise ValueError("federated_average needs at least one client with a positive size")

    weights = [size / total_size for size in client_sizes]
    # Build each client's state dict once instead of once per parameter key.
    client_states = [client.state_dict() for client in client_models]
    global_dict = global_model.state_dict()

    for k, global_tensor in list(global_dict.items()):
        if 'lora' not in k:  # Only update LoRA parameters
            continue
        # Accumulate in float32 on the global tensor's device so half-precision
        # client weights do not lose precision (or mismatch devices) while summing.
        averaged = torch.zeros_like(global_tensor, dtype=torch.float32)
        for weight, state in zip(weights, client_states):
            averaged += state[k].detach().to(device=averaged.device, dtype=torch.float32) * weight
        global_dict[k] = averaged.to(global_tensor.dtype)

    global_model.load_state_dict(global_dict)
    return global_model

In [None]:
# Load the tokenizer and the 2-class sequence-classification model.
# (An earlier variant of this cell used the 'bigscience/bloom-560m' checkpoint.)
model_name = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Keep the weights in float32: the notebook trains with fp16 mixed precision
# (TrainingArguments(fp16=...)), and the AMP gradient scaler refuses to unscale
# gradients of parameters that are themselves stored in float16
# ("Attempting to unscale FP16 gradients").
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    device_map="auto",
    torch_dtype=torch.float32,
)

PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
# LoRA adapter settings, applied to the modules named "query_key_value".
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Leave only the LoRA B matrices trainable: LoRA A and every non-LoRA
# parameter are frozen.
for param_name, weight in model.named_parameters():
    weight.requires_grad = 'lora_A' not in param_name and 'lora_B' in param_name
model.print_trainable_parameters()



The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/lib/python3.11/dist-packages/cv2/../../lib64')}
The following directories listed in your path were found to be non-existent: {PosixPath('/sys/fs/cgroup/memory.events /var/colab/cgroup/jupyter-children/memory.events')}
The following directories listed in your path were found to be non-existent: {PosixPath('https'), PosixPath('//mp.kaggle.net')}
The following directories listed in your path were found to be non-existent: {PosixPath('http'), PosixPath('//172.28.0.1'), PosixPath('8013')}
The following directories listed in your path were found to be non-existent: {PosixPath('--logtostderr --listen_host=172.28.0.12 --target_host=172.28.0.12 --tunnel_background_save_url=https'), PosixPath('//colab.research.google.com/tun/m/cc48301118ce562b961b3c22d803539adc1e0c19/gpu-t4-s-3q3idjl8pt1kx --tunnel_background_save_delay=10s --tunnel_periodic_background_save_frequency=30m0s --enable_output_coalesc

RuntimeError: 
        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [None]:
# Remove the local `data` directory (and everything in it) before re-downloading.
!rm -rf data


In [None]:
# Fetch the GLUE SST-2 archive into ./data and extract it there.
!mkdir -p data
# -O saves the download under a fixed file name.
!wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip -O data/SST-2.zip
# -o overwrites existing files without prompting; -d sets the extraction directory.
!unzip -o data/SST-2.zip -d data/


--2025-07-13 19:56:11--  https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.33.183.29, 13.33.183.33, 13.33.183.121, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.33.183.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7439277 (7.1M) [application/zip]
Saving to: ‘data/SST-2.zip’


2025-07-13 19:56:12 (9.89 MB/s) - ‘data/SST-2.zip’ saved [7439277/7439277]

Archive:  data/SST-2.zip
   creating: data/SST-2/
  inflating: data/SST-2/dev.tsv      
   creating: data/SST-2/original/
  inflating: data/SST-2/original/README.txt  
  inflating: data/SST-2/original/SOStr.txt  
  inflating: data/SST-2/original/STree.txt  
  inflating: data/SST-2/original/datasetSentences.txt  
  inflating: data/SST-2/original/datasetSplit.txt  
  inflating: data/SST-2/original/dictionary.txt  
  inflating: data/SST-2/original/original_rt_snippets.txt  
  inflating: data/SST-2/original/sentiment_l

In [None]:

# Read the tab-separated SST-2 training file and keep only its first 10,000 rows.
df = pd.read_csv("data/SST-2/train.tsv", sep="\t").head(10000)

# Wrap the DataFrame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)


In [None]:
# Tokenize every example and drop the original columns, keeping only what
# preprocess_data returns.
preprocessed_dataset = dataset.map(
    function=preprocess_data,
    remove_columns=dataset.column_names,
)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Split the tokenized examples across the simulated clients (Dirichlet-based split).
num_clients = 20
client_datasets = partition_data(preprocessed_dataset, num_clients=num_clients)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    The predicted class is the arg-max over axis 1 of ``eval_pred.predictions``;
    it is scored against ``eval_pred.label_ids``.

    Returns:
        dict with a single ``"accuracy"`` entry.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}

In [None]:
# Federated-learning schedule.
num_rounds = 2          # number of communication rounds
clients_per_round = 2   # clients sampled in each round
local_epochs = 1        # training epochs each sampled client runs per round

In [None]:
# Trainer settings shared by every client's local training run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=local_epochs,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="adamw_torch",
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    no_cuda=not torch.cuda.is_available(),  # run on CPU when CUDA is unavailable
    dataloader_pin_memory=False,
    save_strategy="no",  # no intermediate checkpoints
    remove_unused_columns=False,  # hand the dataset columns to the collator unchanged
)

# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors="pt")



In [None]:
# Initialize federated learning rounds
# global_model = model
# for round_num in range(num_rounds):
#     print(f"Round {round_num + 1}/{num_rounds}")

#     # Sample clients
#     sampled_clients = np.random.choice(range(num_clients), size=clients_per_round, replace=False)

#     client_models = []
#     client_sizes = []
#     for client_id in sampled_clients:
#         print(f"Training client {client_id}")

#         # Create a new base model
#         #base_model = BloomForSequenceClassification.from_pretrained(model_name, num_labels=2)
#         base_model = model.clone()  # If using PEFT's cloning


#         # Create a new PEFT model
#         client_model = get_peft_model(base_model, peft_config)

#         # Load the state dict from the global model
#         client_model.load_state_dict(global_model.state_dict())
#         client_model.to(device)

#         client_dataset = client_datasets[client_id]
#         client_sizes.append(len(client_dataset))

#         trainer = Trainer(
#             model=client_model,
#             args=training_args,
#             train_dataset=client_dataset,
#             data_collator=data_collator,
#             #compute_metrics=compute_metrics
#         )
#         trainer.train()

#         client_models.append(client_model)

#     # Update global model using FedAvg
#     global_model = federated_average(global_model, client_models, client_sizes)

# # Save global model
# global_model.save_pretrained("./bloom_peft_fedavg_model_sst2")

# print("Federated Learning completed successfully!")



Round 1/50
Training client 17


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mstutiii24[0m ([33mstutiii24-prisma-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.3324
1000,0.0093
1500,0.0134
2000,0.007
2500,0.0075


Training client 10


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.2927
1000,0.2607
1500,0.2255
2000,0.2833


Training client 4


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.9637
1000,0.7082
1500,0.5958
2000,0.623
2500,0.7318


Training client 2


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.7786
1000,0.6337
1500,0.5829
2000,0.5632


Round 2/50
Training client 0


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.6963
1000,0.628
1500,0.6074
2000,0.5626
2500,0.5123


Training client 17


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.0117
1000,0.0096
1500,0.0135
2000,0.0069
2500,0.0074


Training client 15


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,0.2996
1000,0.2332
1500,0.2308
2000,0.2711


Training client 1


Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Step,Training Loss
500,0.7196


In [None]:
# Federated training loop: each round samples clients, trains a private copy of the
# global model on every sampled client's shard, then merges the LoRA weights with FedAvg.
global_model = model
global_model.to("cpu")  # keep the reference copy on CPU; client copies are moved to `device`

for round_num in range(num_rounds):
    print(f"\n=== Round {round_num + 1}/{num_rounds} ===")

    sampled_clients = np.random.choice(range(num_clients), size=clients_per_round, replace=False)

    client_models = []
    client_sizes = []

    for client_id in sampled_clients:
        client_dataset = client_datasets[client_id]
        if len(client_dataset) == 0:
            # A skewed split can leave a client without data; there is nothing to train on.
            print(f"\nSkipping client {client_id}: no local data")
            continue

        print(f"\nTraining client {client_id}")

        # `model` was already wrapped by get_peft_model, so a deep copy is a complete
        # PEFT model carrying the current global LoRA weights. Wrapping the copy again
        # would nest the PEFT wrapper (state-dict keys would no longer match the global
        # model's in federated_average) and re-initialise the adapter, discarding the
        # weights aggregated in earlier rounds.
        client_model = deepcopy(global_model)
        client_model.to(device)

        # Ensure only LoRA B is trainable
        for name, param in client_model.named_parameters():
            param.requires_grad = ('lora_B' in name)

        # Trainer.evaluate() needs an eval_dataset. No held-out split is loaded in this
        # notebook, so the client's own shard is used: the reported accuracy is a
        # local training-set accuracy.
        trainer = Trainer(
            model=client_model,
            args=training_args,
            train_dataset=client_dataset,
            eval_dataset=client_dataset,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        # Train
        trainer.train()

        # Evaluate
        metrics = trainer.evaluate()
        print(f"Client {client_id} Accuracy: {metrics['eval_accuracy']:.4f}")

        # Move the trained copy off the GPU so finished clients do not accumulate there.
        client_model.to("cpu")
        client_models.append(client_model)
        client_sizes.append(len(client_dataset))

    # FedAvg across clients (skipped if every sampled client was empty)
    if client_models:
        global_model = federated_average(global_model, client_models, client_sizes)

    # Save checkpoint after each round
    global_model.save_pretrained(f"./bloom_fedavg_round_{round_num + 1}")

print("\n Federated Learning completed!")


=== Round 1/2 ===

Training client 17


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mstutiii24[0m ([33mstutiii24-prisma-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
