## **Task 0:** Set up a pre-trained LLM


### Import pre-trained GPT-2 from the hugging-face


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"use device: {device}")


model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

print(model)

use device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)




### Generate text by GPT-2

In [2]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = tokenizer.pad_token_id

prompt = "GPT2 is a model developed by OpenAI."

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)

gen_text = tokenizer.batch_decode(gen_tokens)[0]
print("*"*20)
print(f"Prompt: {prompt}")
print(f"Tokens: {input_ids}")
print(f"Answer: {gen_text}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


********************
Prompt: GPT2 is a model developed by OpenAI.
Tokens: tensor([[   38, 11571,    17,   318,   257,  2746,  4166,   416,  4946, 20185,
            13]], device='cuda:0')
Answer: GPT2 is a model developed by OpenAI. It is very efficient, but not exactly stable with modern hardware acceleration systems. The OpenAI Team's engineers designed the GPT2 to work on an internal state machine, a state machine that represents the current state in a distributed network, rather than the external state model, which is the virtual machine that is running the game. This means that if the GPT2 is running an external state machine, in which case both the environment and the world


## **Task 1:** Prepare local training data

**Install huggingface datasets library**

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

### Load datasets

In [4]:
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [13]:
def tokenize_function(examples):
    inputs = [
        f"Are Sentence1 and Sentence2 equivalent?\nSentence 1: {s1}\nSentence 2: {s2}\nOptions: -- equivalent\n             -- not equivalent\nOutput:"
        for s1, s2 in zip(examples["sentence1"], examples["sentence2"])
    ]

    model_inputs = tokenizer(inputs, padding="max_length", truncation=True)

    # Ensure 'labels' is aligned with 'inputs' and 'model_inputs'
    labels = [
        "equivalent" if label == 1 else "not equivalent"
        for label in examples["label"]
    ]
    tokenized_labels = tokenizer(labels, padding="max_length", truncation=True)

    model_inputs["labels"] = [
        [-100 if token != tokenizer.pad_token_id else token for token in label]
        for label in tokenized_labels['input_ids']
    ]


    return model_inputs


In [14]:
import numpy as np
import json

raw_train_dataset = raw_datasets["train"]
example = np.random.choice(raw_train_dataset)
formatted_json = json.dumps(example, indent=13)
print(formatted_json)

{
             "sentence1": "Negotiators talked with the boy for more than an hour , and SWAT officers surrounded the classroom , Bragdon said .",
             "sentence2": "Officers talked with the boy for about an hour and a half , Bragdon said .",
             "label": 0,
             "idx": 3149
}


In [10]:
# Applying tokenize_function to convert dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=["sentence1", "sentence2", "label", "idx"])

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

**Install evaluation tools**

In [None]:
!pip install evaluate
!pip install rouge_score



**Evaluate the performance of GPT-2 on the dataset**

In [None]:
import evaluate
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)

metric = evaluate.load('rouge')
eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=True, batch_size=1, collate_fn=data_collator)
model.eval()
for batch in eval_dataloader:
    input_len = torch.nonzero(batch['labels'][0] != -100).squeeze()[0]
    input_ids = batch['input_ids'][0][:input_len.item()].unsqueeze(0).to(device)  # input_ids moved to GPU

    # Correctly define label from the input batch
    label = tokenizer.batch_decode(batch['input_ids'])[0]  # Decode the input ids to get the label text
    label = label.split('<|endoftext|>')[0].split('Output:')[-1]

    with torch.no_grad():
        gen_tokens = model.generate(
            input_ids,  # input_ids are now on GPU
            do_sample=True,
            temperature=0.9,
            max_length=200,
        )
        gen_text = tokenizer.batch_decode(gen_tokens.to('cpu'))[0].split('Output:')[-1]  # Decoding on CPU

    gen_text = gen_text.split('<|endoftext|>')[0]

    # Adding predictions and labels for evaluation
    metric.add_batch(predictions=[gen_text], references=[label])

# Compute and display metric results
metric.compute()


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}

## **Task 2:** Fine-tuning GPT-2 with LoRA

**Install parameter-efficient fine-tuning library**


In [None]:
!pip install peft

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2


In [None]:
# Adding LoRA layers into the pre-trained GPT-2
from peft import get_peft_model, LoraConfig, TaskType

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)

lora_model = get_peft_model(model, config)

In [None]:
# Fine-tuning GPT-2 with LoRA on the dataset
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
accumulation_steps = 2  # Accumulate gradients over 2 steps

global_step = 0 # Initialize a global step counter

for epoch in range(num_epochs):
    for batch in train_dataloader:
        model.train()
        # Move batch tensors to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss / accumulation_steps  # Scale loss by accumulation steps
        loss.backward()

        global_step += 1 # Increment the global step counter
        if (global_step + 1) % accumulation_steps == 0:  # Update weights every accumulation_steps
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

**Evaluation of the fine-tuned model**


In [None]:
import evaluate
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)


metric = evaluate.load('rouge')
eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=True, batch_size=1, collate_fn=data_collator)
model.eval()
for batch in eval_dataloader:
    input_len=torch.nonzero(batch['labels'][0]!=-100).squeeze()[0]
    input_ids = batch['input_ids'][0][:input_len.item()].unsqueeze(0)

    label = tokenizer.batch_decode(batch['input_ids'])[0]
    label = label.split('<|endoftext|>')[0].split('Output:')[-1]
    # break
    with torch.no_grad():
        gen_tokens = model.generate(
            input_ids.to(device),
            do_sample=True,
            temperature=0.9,
            max_length=200,
        )
        gen_text = tokenizer.batch_decode(gen_tokens.to('cpu'))[0].split('Output:')[-1]

    gen_text = gen_text.split('<|endoftext|>')[0]
    metric.add_batch(predictions=[gen_text], references=[label])

metric.compute()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}

## **Task 3:** Fine-tuning LLM with federated learning


In [None]:
!pip install transformers datasets peft



In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch.optim as optim

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load GPT-2 and tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set padding token for GPT-2 tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use end-of-sequence token as padding token

# LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)

# Load and split GLUE MRPC dataset into three parts (for three clients)
raw_datasets = load_dataset("glue", "mrpc")
print(raw_datasets['train'].column_names)

# Make sure raw_datasets is used, not already tokenized data
client1_data = raw_datasets['train'].shard(index=0, num_shards=3)
client2_data = raw_datasets['train'].shard(index=1, num_shards=3)
client3_data = raw_datasets['train'].shard(index=2, num_shards=3)

# Update the preprocess_data function to rename 'label' to 'labels'
def preprocess_data(examples):
    # Tokenize the inputs
    tokenized_inputs = tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)

    # Align the labels with the inputs
    tokenized_inputs['labels'] = tokenized_inputs['input_ids'].copy()  # Make sure labels match input size

    return tokenized_inputs

# Apply tokenization to each client's data
client1_data = client1_data.map(preprocess_data, batched=True)
client2_data = client2_data.map(preprocess_data, batched=True)
client3_data = client3_data.map(preprocess_data, batched=True)

# Make sure 'labels' column is present in dataloader
def create_dataloader(client_data):
    client_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return DataLoader(client_data, batch_size=4, shuffle=True)


client1_dataloader = create_dataloader(client1_data)
client2_dataloader = create_dataloader(client2_data)
client3_dataloader = create_dataloader(client3_data)

# Fine-tune the model for each client
def fine_tune_for_client(client_dataloader, model, lora_config, device):
    # Apply LoRA to the GPT-2 model
    lora_model = get_peft_model(model, lora_config).to(device)
    optimizer = optim.AdamW(lora_model.parameters(), lr=3e-5)

    # Fine-tune model
    for epoch in range(2):  # Just 2 epochs for example purposes
        for batch in client_dataloader:  # Corrected line
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = lora_model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    return lora_model.state_dict()  # Return fine-tuned model's parameters

# Fine-tune for each client
client1_model_params = fine_tune_for_client(client1_dataloader, model, lora_config, device)
client2_model_params = fine_tune_for_client(client2_dataloader, model, lora_config, device)
client3_model_params = fine_tune_for_client(client3_dataloader, model, lora_config, device)

# Federated Averaging (FedAvg) - Aggregating the models
def fed_avg(models):
    avg_model = {}
    for key in models[0].keys():
        avg_model[key] = sum([model[key] for model in models]) / len(models)
    return avg_model

# Aggregate the models from all clients
aggregated_model_params = fed_avg([client1_model_params, client2_model_params, client3_model_params])




['sentence1', 'sentence2', 'label', 'idx']


Map:   0%|          | 0/1223 [00:00<?, ? examples/s]

Map:   0%|          | 0/1223 [00:00<?, ? examples/s]

Map:   0%|          | 0/1222 [00:00<?, ? examples/s]

**Evaluation of the aggregated model**

In [None]:
lora_model = get_peft_model(model, lora_config).to(device)   # Load aggregated parameters into the global model
lora_model.load_state_dict(aggregated_model_params)   # Load aggregated parameters into the LoRA-enabled model

# Evaluate the aggregated model
def evaluate_model(model, validation_data):
    model.eval()
    total_loss = 0
    for batch in validation_data:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            total_loss += outputs.loss.item()
    return total_loss / len(validation_data)

# Using GLUE MRPC validation set for evaluation
validation_data = raw_datasets['validation']
validation_data = validation_data.map(preprocess_data, batched=True)
validation_dataloader = create_dataloader(validation_data)

# Evaluate the aggregated model
aggregated_model_loss = evaluate_model(model, validation_dataloader)
print(f"Aggregated Model Loss: {aggregated_model_loss}")


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Aggregated Model Loss: 5.0465040510776
