In [1]:
import time
import secretflow as sf
import spu
import os

# Wall-clock reference point; the last cell subtracts it to report total runtime.
start = time.perf_counter()

# Cross-party addresses handed to sf.init: each party listens on port 8000
# of a host named after the party.
network_conf = {
    "parties": {name: {"address": f"{name}:8000"} for name in ("alice", "bob")},
}

# SELF_PARTY selects which party this process acts as (falls back to "alice").
party = os.getenv("SELF_PARTY", "alice")

# Shut down first -- presumably so that re-running this cell starts from a
# clean SecretFlow session (TODO confirm) -- then attach to the Ray cluster.
sf.shutdown()
sf.init(
    address="127.0.0.1:6379",
    cluster_config=dict(network_conf, self_party=party),
    log_to_driver=True,
)

2024-06-21 13:59:12,943	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 172.20.0.3:6379...
2024-06-21 13:59:12,950	INFO worker.py:1724 -- Connected to Ray cluster.
2024-06-21 13:59:12.966 INFO api.py:233 [bob] -- [Anonymous_job] Started rayfed with {'CLUSTER_ADDRESSES': {'alice': 'alice:8000', 'bob': 'bob:8000'}, 'CURRENT_PARTY_NAME': 'bob', 'TLS_CONFIG': {}}
2024-06-21 13:59:13.587 INFO barriers.py:284 [bob] -- [Anonymous_job] Succeeded to create receiver proxy actor.
[36m(ReceiverProxyActor pid=8928)[0m 2024-06-21 13:59:13.583 INFO grpc_proxy.py:359 [bob] -- [Anonymous_job] ReceiverProxy binding port 8000, options: (('grpc.enable_retries', 1), ('grpc.so_reuseport', 0), ('grpc.max_send_message_length', 524288000), ('grpc.max_receive_message_length', 524288000), ('grpc.service_config', '{"methodConfig": [{"name": [{"service": "GrpcService"}], "retryPolicy": {"maxAttempts": 5, "initialBackoff": "5s", "maxBackoff": "30s", "backoffMultiplier": 2, "retryableStatusCo

2024-06-21 13:59:12,947	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 172.20.0.2:6379...
2024-06-21 13:59:12,956	INFO worker.py:1724 -- Connected to Ray cluster.
2024-06-21 13:59:12.973 INFO api.py:233 [alice] -- [Anonymous_job] Started rayfed with {'CLUSTER_ADDRESSES': {'alice': 'alice:8000', 'bob': 'bob:8000'}, 'CURRENT_PARTY_NAME': 'alice', 'TLS_CONFIG': {}}
2024-06-21 13:59:13.597 INFO barriers.py:284 [alice] -- [Anonymous_job] Succeeded to create receiver proxy actor.
[36m(ReceiverProxyActor pid=28339)[0m 2024-06-21 13:59:13.594 INFO grpc_proxy.py:359 [alice] -- [Anonymous_job] ReceiverProxy binding port 8000, options: (('grpc.enable_retries', 1), ('grpc.so_reuseport', 0), ('grpc.max_send_message_length', 524288000), ('grpc.max_receive_message_length', 524288000), ('grpc.service_config', '{"methodConfig": [{"name": [{"service": "GrpcService"}], "retryPolicy": {"maxAttempts": 5, "initialBackoff": "5s", "maxBackoff": "30s", "backoffMultiplier": 2, "retryabl

In [2]:
# One PYU device handle per party.
alice = sf.PYU("alice")
bob = sf.PYU("bob")

# SPU cluster definition: both parties expose (and listen on) port 8001 of
# their own host; the runtime uses SEMI2K over FM128 with the "real" sigmoid.
spu_conf = {
    "nodes": [
        {"party": name, "address": f"{name}:8001", "listen_addr": f"{name}:8001"}
        for name in ("alice", "bob")
    ],
    "runtime_config": {
        "protocol": spu.spu_pb2.SEMI2K,
        "field": spu.spu_pb2.FM128,
        "sigmoid_mode": spu.spu_pb2.RuntimeConfig.SIGMOID_REAL,
    },
}

# HEU definition: alice keeps the secret key, bob acts as evaluator.
heu_config = {
    "sk_keeper": {"party": "alice"},
    "evaluators": [{"party": "bob"}],
    "mode": "PHEU",
    "he_parameters": {
        # Per the original author: "ou" is a fast encryption schema that is as
        # secure as paillier, and 2048 bits are needed for sufficient security.
        "schema": "ou",
        "key_pair": {"generate": {"bit_size": 2048}},
    },
    "encoding": {
        "cleartext_type": "DT_I32",
        "encoder": "IntegerEncoder",
        "encoder_args": {"scale": 1},
    },
}

# NOTE(review): `heu` is not referenced by any other visible cell (training
# below only receives `spu_device`) -- confirm whether it is still needed.
heu = sf.HEU(heu_config, spu_conf["runtime_config"]["field"])
spu_device = sf.SPU(cluster_def=spu_conf)

In [3]:
import pandas as pd
import os
from secretflow.data.vertical import read_csv as v_read_csv, VDataFrame
from secretflow.data.core import partition

# Every CSV is looked up in the kernel's current working directory.
current_dir = os.getcwd()

# Feature files, one per party; together they are loaded as a single
# vertical dataframe. "id" is passed as both the key and the column to drop.
feature_paths = {
    alice: f"{current_dir}/my_bank_0_15.csv",
    bob: f"{current_dir}/my_bank_16.csv",
}
data = v_read_csv(feature_paths, keys="id", drop_keys="id")

# The label file is read through alice's PYU (first CSV column becomes the
# index) and wrapped as a VDataFrame whose only partition belongs to alice.
label_path = f"{current_dir}/bank_y.csv"
alice_y_pyu_object = alice(lambda path: pd.read_csv(path, index_col=0))(label_path)
label = VDataFrame(partitions={alice: partition(alice_y_pyu_object)})

2024-06-21 13:59:17.031 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-06-21 13:59:17.098 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.
2024-06-21 13:59:18.724 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.


2024-06-21 13:59:17.546 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-06-21 13:59:17.548 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.
2024-06-21 13:59:18.717 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.


In [4]:
# Show the feature column names of the combined vertical dataframe.
data.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [5]:
# Show the column names of the label dataframe.
label.columns

['y']

['y']

In [6]:
# From the data description we know these columns need to be label-encoded
# before training.
from secretflow.preprocessing import LabelEncoder

encoder = LabelEncoder()

# The same encoder instance is re-fitted on each column in turn, so after this
# cell it only holds the fit of the last input (the label).
for column in (
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
    'month',
):
    data[column] = encoder.fit_transform(data[column])

# The label dataframe is encoded as a whole and rebound to the same name.
label = encoder.fit_transform(label)


In [7]:
from secretflow.data.split import train_test_split as train_test_split_fed


In [8]:
# Hold out 20% of the feature rows for testing. The label split in the next
# cell uses the same test_size and random_state -- presumably so that feature
# and label rows stay aligned (TODO confirm).
X_train_fed, X_test_fed = train_test_split_fed(data, test_size=0.2, random_state=94)


In [9]:
# Split the labels with the same test_size and random_state as the feature
# split -- presumably so that both splits select the same rows (TODO confirm).
y_train_fed, y_test_fed = train_test_split_fed(label, test_size=0.2, random_state=94)

In [10]:
# from secretflow.ml.boost.sgb_v import (
#     get_classic_XGB_params,
#     Sgb,
# )
from secretflow.ml.boost.ss_xgb_v import Xgb

In [11]:
# Training hyper-parameters; for more detail, see the Xgb API doc.
params = dict(
    num_boost_round=5,
    max_depth=5,
    learning_rate=0.1,
    sketch_eps=0.08,
    objective='logistic',
    reg_lambda=0.1,
    subsample=1,
    colsample_by_tree=1,
    base_score=0.5,
)

# The ss-XGB trainer is constructed on the SPU device created earlier.
xgb = Xgb(spu_device)
model = xgb.train(params, X_train_fed, y_train_fed)

2024-06-21 13:59:23.613 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-06-21 13:59:23.614 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-06-21 13:59:23.638 INFO model.py:239 [bob] -- [Anonymous_job] fragment_count 1
2024-06-21 13:59:23.754 INFO model.py:257 [bob] -- [Anonymous_job] prepare time 1.0911757946014404s
2024-06-21 13:59:24.689 INFO model.py:270 [bob] -- [Anonymous_job] global_setup time 0.934262752532959s
2024-06-21 13:59:25.238 INFO model.py:289 [bob] -- [Anonymous_job] build & infeed bucket_map fragments [0, 0]
2024-06-21 13:59:25.239 INFO model.py:292 [bob] -- [Anonymous_job] build & infeed bucket_map time 0.549368143081665s
2024-06-21 13:59:25.271 INFO model.py:305 [bob] -- [Anonymous_job] init_pred time 0.03157997131347656s
2024-06-21 13:59:25.400 INFO model.p

2024-06-21 13:59:23.611 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-06-21 13:59:23.637 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-06-21 13:59:23.638 INFO model.py:239 [alice] -- [Anonymous_job] fragment_count 1
2024-06-21 13:59:23.757 INFO model.py:257 [alice] -- [Anonymous_job] prepare time 1.0998330116271973s
2024-06-21 13:59:24.686 INFO model.py:270 [alice] -- [Anonymous_job] global_setup time 0.927753210067749s
2024-06-21 13:59:25.240 INFO model.py:289 [alice] -- [Anonymous_job] build & infeed bucket_map fragments [0, 0]
2024-06-21 13:59:25.241 INFO model.py:292 [alice] -- [Anonymous_job] build & infeed bucket_map time 0.554624080657959s
2024-06-21 13:59:25.274 INFO model.py:305 [alice] -- [Anonymous_job] init_pred time 0.03257489204406738s
2024-06-21 13:59:25.

In [12]:
from secretflow.device.driver import reveal
from sklearn.metrics import roc_auc_score

# Labels and predictions are revealed so the AUC can be computed in cleartext.
# This is convenient for a demo, but there are safer alternatives.
train_auc = roc_auc_score(
    reveal(y_train_fed.partitions[alice].data),
    reveal(model.predict(X_train_fed)),
)
test_auc = roc_auc_score(
    reveal(y_test_fed.partitions[alice].data),
    reveal(model.predict(X_test_fed)),
)

print("train set AUC score: ", train_auc, "test set AUC score: ", test_auc)

2024-06-21 14:00:22.755 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-06-21 14:00:22.755 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-06-21 14:00:26.690 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-06-21 14:00:26.690 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.


train set AUC score:  0.9027875524455824 test set AUC score:  0.892408765690599


2024-06-21 14:00:22.757 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-06-21 14:00:22.786 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-06-21 14:00:26.693 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-06-21 14:00:26.720 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.


train set AUC score:  0.9027875524455824 test set AUC score:  0.892408765690599


In [13]:
# Seconds elapsed since `start` was recorded with time.perf_counter() in the
# first cell; the bare expression displays the value as the cell output.
time_elapse = time.perf_counter() - start
time_elapse

76.11852605100012

76.11653968600058

In [14]:
# Questions:
# 1. Can this task be reproduced using ss-XGB?
# 2. How much gain does ss-XGB achieve?
# 3. What if Alice has 15 features while Bob has only 1 feature? Can SGB and ss-XGB outperform Alice's single-party XGB model?

# Answers:
# 1. Done.
# 2. train: 0.9027049742255001
#    test:  0.8924150097847268
#    (NOTE: these values differ slightly from the AUC scores printed by the run recorded in this notebook, which are listed under answer 3.)
# 3. Alice's single-party XGB model performs better.
#    ss-XGB: elapsed time: more than 76 s
#       train set AUC score:  0.9027875524455824 test set AUC score:  0.892408765690599
#    SGB: elapsed time: 55 (presumably seconds; as reported in the notebook "SGB 练习——联合建模有数据价值吗？.ipynb", i.e. "SGB exercise: does joint modelling add data value?")
#       train set AUC score:  0.918428687751083 test set AUC score:  0.9060391606457412
#    single-party XGB: elapsed time: very fast
#       train set AUC score:  0.9281303699212398 test set AUC score:  0.9116829545018299 num_trees:  20