#  Federated GPT-2 Tuning with Parameter Efficient methods in FATE-LLM

In [8]:
%%save_to_fate_llm model sigmoid.py

import torch as t

class Sigmoid(t.nn.Module):
    """Head module that applies an element-wise sigmoid to ``x.logits``."""

    def __init__(self):
        super().__init__()
        # Registered as a sub-module under the attribute name ``sigmoid``.
        self.sigmoid = t.nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(x.logits)``; ``x`` must expose a ``logits`` attribute."""
        logits = x.logits
        activated = self.sigmoid(logits)
        return activated

In [9]:
# Build the model in two named steps: a PELLM GPT-2 wrapped in CustModel,
# followed by the classifier head, then chain them with Sequential.
from transformers import GPT2Config

checkpoint_path = gpt2_dir

# GPT-2 backbone; the LoRA settings are handed over as a plain dict.
gpt2_backbone = t.nn.CustModel(
    module_name='pellm.gpt2',
    class_name='GPT2',
    pretrained_path=checkpoint_path,
    peft_config=lora_config.to_dict(),
    peft_type="LoraConfig",
    num_labels=1,
    pad_token_id=50256,
)

# Classifier head: the `Sigmoid` class from the `sigmoid` module.
sigmoid_head = t.nn.CustModel(module_name='sigmoid', class_name='Sigmoid')

model = t.nn.Sequential(gpt2_backbone, sigmoid_head)


Please note that during the training process, only trainable parameters will participate in the federated learning process.

## Submit Federated Task
Once you have successfully completed local testing, you can submit a task to FATE. **Please note that this tutorial runs on a standalone version; if you are using a cluster version, you need to bind the data with the corresponding name and namespace on each machine.**

In this example, we load pretrained weights for the GPT-2 model.

In [1]:
from fate_llm.dataset.detr_tokenizer import DetrTokenizer

# Local sanity check: load the coco_smol/fedA sample folder into the dataset
# and display its first item as the cell output.
coco_sample_dir = '/data/projects/fate/examples/data/coco_smol/fedA/'

test = DetrTokenizer()
test.load(coco_sample_dir)
test[0]

  from .autonotebook import tqdm as notebook_tqdm
2023-12-05 17:48:59.479882: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
pre create processor
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
post create processor


loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


{'image': tensor([[[0.2627, 0.2039, 0.2157,  ..., 0.8314, 0.8314, 0.8314],
          [0.3020, 0.1333, 0.2078,  ..., 0.8353, 0.8353, 0.8353],
          [0.4078, 0.2510, 0.1529,  ..., 0.8353, 0.8353, 0.8353],
          ...,
          [0.2706, 0.2392, 0.2314,  ..., 0.0745, 0.0784, 0.0706],
          [0.2784, 0.2863, 0.2549,  ..., 0.0549, 0.0784, 0.0745],
          [0.2549, 0.3098, 0.2824,  ..., 0.0431, 0.0863, 0.0784]],
 
         [[0.2314, 0.1804, 0.1961,  ..., 0.9059, 0.9059, 0.9059],
          [0.2706, 0.1098, 0.1843,  ..., 0.9098, 0.9098, 0.9098],
          [0.3765, 0.2275, 0.1294,  ..., 0.9098, 0.9098, 0.9098],
          ...,
          [0.2314, 0.2039, 0.2039,  ..., 0.0745, 0.0784, 0.0706],
          [0.2392, 0.2510, 0.2196,  ..., 0.0549, 0.0784, 0.0745],
          [0.2196, 0.2745, 0.2431,  ..., 0.0431, 0.0863, 0.0784]],
 
         [[0.1412, 0.0784, 0.0824,  ..., 0.9333, 0.9333, 0.9333],
          [0.1882, 0.0157, 0.0824,  ..., 0.9373, 0.9373, 0.9373],
          [0.3020, 0.1412, 0.04

In [1]:
import torch as t
import os
from pipeline import fate_torch_hook
from pipeline.component import HomoNN
from pipeline.component.homo_nn import DatasetParam, TrainerParam
from pipeline.backend.pipeline import PipeLine
from pipeline.component import Reader
from pipeline.interface import Data


# Apply FATE's torch hook to the torch module (imported as `t`) before any
# model or optimizer is described below.
fate_torch_hook(t)


fate_project_path = "/data/projects/fate/"
guest_0 = 9999
host_1 = 10000

# guest_0 is the initiator, the guest and the arbiter; host_1 is the second client.
pipeline = PipeLine().set_initiator(role='guest', party_id=guest_0).set_roles(guest=guest_0, host=host_1,
                                                                              arbiter=guest_0)
# Table name/namespace under which the data folder is registered.
# NOTE(review): the table is still called "imdb" although the path points at
# coco_smol data; the name is only a label, but consider renaming it.
data_0 = {"name": "imdb", "namespace": "experiment"}
data_path = fate_project_path + 'examples/data/coco_smol/fedA/'

# Bind the folder to the table once; a second identical bind_table call for the
# same name/namespace/path is redundant.
pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path)

# reader_0: both parties read the same bound table.
reader_0 = Reader(name="reader_0")
reader_0.get_party_instance(role='guest', party_id=guest_0).component_param(table=data_0)
reader_0.get_party_instance(role='host', party_id=host_1).component_param(table=data_0)

# reader_1 is configured identically, but this cell never adds it to the pipeline.
reader_1 = Reader(name="reader_1")
reader_1.get_party_instance(role='guest', party_id=guest_0).component_param(table=data_0)
reader_1.get_party_instance(role='host', party_id=host_1).component_param(table=data_0)


## LoraConfig
from peft import LoraConfig, TaskType
lora_config = LoraConfig(
    # inference_mode must be False for fine-tuning: PEFT freezes the adapter
    # weights when inference_mode=True, which would leave LoRA with nothing to train.
    inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,
    # LoRA is injected into the modules with these names.
    target_modules=['v_proj', 'q_proj']
)


## Add your pretrained model path here; it is passed to CustModel as pretrained_path
model_path = '/data/projects/fate/fate/python/fate_llm/detr-resnet-50/'


# Single-module Sequential: the `detr` class from `pellm.detr`, configured with
# the LoRA settings above (handed over as a plain dict).
model = t.nn.Sequential(
    t.nn.CustModel(module_name='pellm.detr', class_name='detr', pretrained_path=model_path,
                   peft_config=lora_config.to_dict(), peft_type="LoraConfig")
)

# Dataset settings: use the dataset named 'detr_tokenizer'.
dataset_param = DatasetParam(dataset_name='detr_tokenizer')

# Trainer settings, listed one option per line and handed over as keywords.
trainer_settings = dict(
    trainer_name='fedavg_detr_trainer',
    epochs=1,
    batch_size=1,
    checkpoint_save_freqs=1,
    pin_memory=False,
    data_loader_worker=1,
    save_to_local_dir=True,
)
trainer_param = TrainerParam(**trainer_settings)

nn_component = HomoNN(name='nn_0', model=model)

# Guest (client 1) and host (client 2) receive identical training settings;
# a fresh optimizer description is created for each party. No loss is passed.
for client_role, client_id in (('guest', guest_0), ('host', host_1)):
    client = nn_component.get_party_instance(role=client_role, party_id=client_id)
    client.component_param(
        optimizer=t.optim.Adam(lr=0.0001, eps=1e-8),
        dataset=dataset_param,
        trainer=trainer_param,
        torch_seed=100,
    )

# The arbiter (server) is given only the trainer settings.
arbiter = nn_component.get_party_instance(role='arbiter', party_id=guest_0)
arbiter.component_param(trainer=trainer_param)

# Register the reader first, then feed its output data to the HomoNN
# component as training data.
pipeline.add_component(reader_0)
train_input = Data(train_data=reader_0.output.data)
pipeline.add_component(nn_component, data=train_input)

# Compile the assembled pipeline, then start training.
pipeline.compile()
pipeline.fit()

  from .autonotebook import tqdm as notebook_tqdm
[32m2023-12-06 20:58:11.422[0m | [1mINFO    [0m | [36mpipeline.utils.invoker.job_submitter[0m:[36mmonitor_job_status[0m:[36m83[0m - [1mJob id is 202312062058108534040
[0m
[32m2023-12-06 20:58:11.465[0m | [1mINFO    [0m | [36mpipeline.utils.invoker.job_submitter[0m:[36mmonitor_job_status[0m:[36m98[0m - [1m[80D[1A[KJob is still waiting, time elapse: 0:00:00[0m
[0mm2023-12-06 20:58:12.489[0m | [1mINFO    [0m | [36mpipeline.utils.invoker.job_submitter[0m:[36mmonitor_job_status[0m:[36m125[0m - [1m
[32m2023-12-06 20:58:12.490[0m | [1mINFO    [0m | [36mpipeline.utils.invoker.job_submitter[0m:[36mmonitor_job_status[0m:[36m127[0m - [1m[80D[1A[KRunning component reader_0, time elapse: 0:00:01[0m
[32m2023-12-06 20:58:13.520[0m | [1mINFO    [0m | [36mpipeline.utils.invoker.job_submitter[0m:[36mmonitor_job_status[0m:[36m127[0m - [1m[80D[1A[KRunning component reader_0, time elapse: 0:

You can use this script to submit the model, but submitting the model will take a long time to train and generate a long log, so we won't do it here.

## Training with CUDA

You can use GPU by setting the cuda parameter of the FedAVGTrainer:

In [None]:
# FedAVG trainer settings with the `cuda` option set to GPU index 0.
trainer_param = TrainerParam(
    trainer_name='fedavg_trainer',
    epochs=1,
    batch_size=8,
    data_loader_worker=8,
    cuda=0,
)

The cuda parameter here accepts an integer value that corresponds to the index of the GPU you want to use for training. 
In the example above, the value is set to 0, which means that on every client the first available GPU in the system will be used. 
If you have multiple GPUs and would like to use a specific one, simply change the value of the cuda parameter to the appropriate index.