In [1]:
import time
import secretflow as sf
import spu
import os

# Network endpoints of the two participating parties.
network_conf = {
    "parties": {
        "alice": {"address": "alice:8000"},
        "bob": {"address": "bob:8000"},
    },
}

# SELF_PARTY selects which party this process acts as; it falls back to
# "alice" when the environment variable is not set.
party = os.getenv("SELF_PARTY", "alice")

# Tear down any previous SecretFlow session before initialising a new one
# (presumably so that this cell can be re-run safely).
sf.shutdown()

# Full cluster description: the shared party map plus this process's own role.
cluster_config = dict(network_conf, self_party=party)
sf.init(
    address="127.0.0.1:6379",
    cluster_config=cluster_config,
    log_to_driver=True,
)

2024-08-08 05:31:23,782	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 172.19.0.2:6379...
2024-08-08 05:31:23,789	INFO worker.py:1724 -- Connected to Ray cluster.
2024-08-08 05:31:23.805 INFO api.py:233 [bob] -- [Anonymous_job] Started rayfed with {'CLUSTER_ADDRESSES': {'alice': 'alice:8000', 'bob': 'bob:8000'}, 'CURRENT_PARTY_NAME': 'bob', 'TLS_CONFIG': {}}
2024-08-08 05:31:24.424 INFO barriers.py:284 [bob] -- [Anonymous_job] Succeeded to create receiver proxy actor.
[36m(ReceiverProxyActor pid=4388)[0m 2024-08-08 05:31:24.421 INFO grpc_proxy.py:359 [bob] -- [Anonymous_job] ReceiverProxy binding port 8000, options: (('grpc.enable_retries', 1), ('grpc.so_reuseport', 0), ('grpc.max_send_message_length', 524288000), ('grpc.max_receive_message_length', 524288000), ('grpc.service_config', '{"methodConfig": [{"name": [{"service": "GrpcService"}], "retryPolicy": {"maxAttempts": 5, "initialBackoff": "5s", "maxBackoff": "30s", "backoffMultiplier": 2, "retryableStatusCo

2024-08-08 05:31:23,798	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 172.19.0.3:6379...
2024-08-08 05:31:23,805	INFO worker.py:1724 -- Connected to Ray cluster.
2024-08-08 05:31:23.822 INFO api.py:233 [alice] -- [Anonymous_job] Started rayfed with {'CLUSTER_ADDRESSES': {'alice': 'alice:8000', 'bob': 'bob:8000'}, 'CURRENT_PARTY_NAME': 'alice', 'TLS_CONFIG': {}}
2024-08-08 05:31:24.450 INFO barriers.py:284 [alice] -- [Anonymous_job] Succeeded to create receiver proxy actor.
[36m(ReceiverProxyActor pid=5614)[0m 2024-08-08 05:31:24.446 INFO grpc_proxy.py:359 [alice] -- [Anonymous_job] ReceiverProxy binding port 8000, options: (('grpc.enable_retries', 1), ('grpc.so_reuseport', 0), ('grpc.max_send_message_length', 524288000), ('grpc.max_receive_message_length', 524288000), ('grpc.service_config', '{"methodConfig": [{"name": [{"service": "GrpcService"}], "retryPolicy": {"maxAttempts": 5, "initialBackoff": "5s", "maxBackoff": "30s", "backoffMultiplier": 2, "retryable

In [2]:
# Per-party PYU device handles, used below to address each party's data.
alice = sf.PYU("alice")
bob = sf.PYU("bob")

# Runtime settings shared by the SPU cluster; the field is also passed to HEU.
spu_runtime_config = {
    "protocol": spu.spu_pb2.SEMI2K,
    "field": spu.spu_pb2.FM128,
    "sigmoid_mode": spu.spu_pb2.RuntimeConfig.SIGMOID_REAL,
}

# SPU cluster definition: one node per party, followed by the runtime settings.
spu_conf = {
    "nodes": [
        {"party": "alice", "address": "alice:8001", "listen_addr": "alice:8001"},
        {"party": "bob", "address": "bob:8001", "listen_addr": "bob:8001"},
    ],
    "runtime_config": spu_runtime_config,
}

# HEU configuration: alice keeps the secret key, bob acts as evaluator.
heu_config = {
    "sk_keeper": {"party": "alice"},
    "evaluators": [{"party": "bob"}],
    "mode": "PHEU",
    "he_parameters": {
        # ou is a fast encryption schema that is as secure as paillier.
        "schema": "ou",
        "key_pair": {
            # bit size should be 2048 to provide sufficient security.
            "generate": {"bit_size": 2048},
        },
    },
    "encoding": {
        "cleartext_type": "DT_I32",
        "encoder": "IntegerEncoder",
        "encoder_args": {"scale": 1},
    },
}

# NOTE(review): `heu` is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed for the ss-XGB workflow.
heu = sf.HEU(heu_config, spu_runtime_config["field"])
spu_device = sf.SPU(cluster_def=spu_conf)

In [3]:
import pandas as pd
import os
from secretflow.data.vertical import read_csv as v_read_csv, VDataFrame
from secretflow.data.core import partition

# Start the timer that the elapsed-time cell further down reads.
start = time.perf_counter()

current_dir = os.getcwd()

# Load alice's and bob's feature files as a single vertical dataframe,
# matching rows on "id" and dropping that key column afterwards.
feature_files = {
    alice: f"{current_dir}/bank_0_8.csv",
    bob: f"{current_dir}/bank_8_16.csv",
}
data = v_read_csv(feature_files, keys="id", drop_keys="id")

# The label file is read on alice's side only, using its first column as index.
label_file = f"{current_dir}/bank_y.csv"
alice_y_pyu_object = alice(lambda path: pd.read_csv(path, index_col=0))(label_file)
label = VDataFrame(partitions={alice: partition(alice_y_pyu_object)})

2024-08-08 05:31:26.896 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-08-08 05:31:26.965 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.
2024-08-08 05:31:28.627 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.


2024-08-08 05:31:27.432 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-08-08 05:31:27.433 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.
2024-08-08 05:31:28.622 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.


In [4]:
# Show the feature columns of the combined vertical frame (the "id" key was dropped on load).
data.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [5]:
# Show the columns of the label frame held by alice.
label.columns

['y']

['y']

In [6]:
# From the data description we know that these columns need to be encoded.
from secretflow.preprocessing import LabelEncoder

# A single encoder instance is re-fitted for every column, in this order.
encoder = LabelEncoder()
for column in (
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "poutcome",
    "month",
):
    data[column] = encoder.fit_transform(data[column])

# The label frame is encoded last, as a whole.
label = encoder.fit_transform(label)


In [7]:
from secretflow.data.split import train_test_split as train_test_split_fed


In [8]:
# 80/20 split of the features; the label split uses the same random_state,
# presumably so that feature and label rows stay aligned -- TODO confirm.
X_train_fed, X_test_fed = train_test_split_fed(data, test_size=0.2, random_state=94)


In [9]:
# 80/20 split of the labels with the same random_state as the feature split,
# presumably so that label and feature rows stay aligned -- TODO confirm.
y_train_fed, y_test_fed = train_test_split_fed(label, test_size=0.2, random_state=94)

In [10]:
# from secretflow.ml.boost.sgb_v import (
#     get_classic_XGB_params,
#     Sgb,
# )
from secretflow.ml.boost.ss_xgb_v import Xgb

In [11]:
# Training hyper-parameters; for more detail, see the Xgb API doc.
params = {
    "num_boost_round": 5,
    "max_depth": 5,
    "learning_rate": 0.1,
    "sketch_eps": 0.08,
    "objective": "logistic",
    "reg_lambda": 0.1,
    "subsample": 1,
    "colsample_by_tree": 1,
    "base_score": 0.5,
}

# Build the ss-XGB trainer on the SPU device and fit it on the training split.
xgb = Xgb(spu_device)
model = xgb.train(params, X_train_fed, y_train_fed)

2024-08-08 05:31:33.215 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:31:33.216 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:31:33.242 INFO booster.py:167 [bob] -- [Anonymous_job] fragment_count 1
2024-08-08 05:31:33.307 INFO booster.py:185 [bob] -- [Anonymous_job] prepare time 0.8302707672119141s
2024-08-08 05:31:34.283 INFO booster.py:198 [bob] -- [Anonymous_job] global_setup time 0.9754617214202881s
2024-08-08 05:31:34.721 INFO booster.py:217 [bob] -- [Anonymous_job] build & infeed bucket_map fragments [0, 0]
2024-08-08 05:31:34.722 INFO booster.py:220 [bob] -- [Anonymous_job] build & infeed bucket_map time 0.4381988048553467s
2024-08-08 05:31:34.756 INFO booster.py:233 [bob] -- [Anonymous_job] init_pred time 0.03279685974121094s
2024-08-08 05:31:34.85

2024-08-08 05:31:33.214 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:31:33.240 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:31:33.241 INFO booster.py:167 [alice] -- [Anonymous_job] fragment_count 1
2024-08-08 05:31:33.310 INFO booster.py:185 [alice] -- [Anonymous_job] prepare time 0.8378827571868896s
2024-08-08 05:31:34.283 INFO booster.py:198 [alice] -- [Anonymous_job] global_setup time 0.9708831310272217s
2024-08-08 05:31:34.724 INFO booster.py:217 [alice] -- [Anonymous_job] build & infeed bucket_map fragments [0, 0]
2024-08-08 05:31:34.724 INFO booster.py:220 [alice] -- [Anonymous_job] build & infeed bucket_map time 0.4411287307739258s
2024-08-08 05:31:34.758 INFO booster.py:233 [alice] -- [Anonymous_job] init_pred time 0.033274173736572266s
2024-

In [12]:
from secretflow.device.driver import reveal
from sklearn.metrics import roc_auc_score

# The labels and predictions are revealed so that the AUC can be computed in
# cleartext; there are safer alternatives.
train_auc = roc_auc_score(
    reveal(y_train_fed.partitions[alice].data),
    reveal(model.predict(X_train_fed)),
)
test_auc = roc_auc_score(
    reveal(y_test_fed.partitions[alice].data),
    reveal(model.predict(X_test_fed)),
)

print("train set AUC score: ", train_auc, "test set AUC score: ", test_auc)

2024-08-08 05:32:32.615 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:32:32.656 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:32:37.381 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:32:37.412 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.


train set AUC score:  0.9026768922703838 test set AUC score:  0.892436806298487


2024-08-08 05:32:32.611 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:32:32.613 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:32:37.379 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:32:37.380 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.


train set AUC score:  0.9026768922703838 test set AUC score:  0.892436806298487


In [13]:
# Seconds elapsed since `start` was set, i.e. since just before the data was loaded.
time_elapse = time.perf_counter() - start
time_elapse

71.79309329799435

72.33220342900313

# Change the feature split: Alice holds 15 features, Bob holds only 1

In [14]:
# Restart the timer for the second experiment.
start = time.perf_counter()
current_dir = os.getcwd()

# Load alice's and bob's feature files as a single vertical dataframe,
# matching rows on "id" and dropping that key column afterwards.
feature_files = {
    alice: f"{current_dir}/my_bank_0_15.csv",
    bob: f"{current_dir}/my_bank_16.csv",
}
data = v_read_csv(feature_files, keys="id", drop_keys="id")

# The label file is read on alice's side only, using its first column as index.
label_file = f"{current_dir}/bank_y.csv"
alice_y_pyu_object = alice(lambda path: pd.read_csv(path, index_col=0))(label_file)
label = VDataFrame(partitions={alice: partition(alice_y_pyu_object)})

# A single encoder instance is re-fitted for every column, in this order.
encoder = LabelEncoder()
for column in (
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "poutcome",
    "month",
):
    data[column] = encoder.fit_transform(data[column])
label = encoder.fit_transform(label)

# Same 80/20 split and random_state for features and labels.
X_train_fed, X_test_fed = train_test_split_fed(data, test_size=0.2, random_state=94)
y_train_fed, y_test_fed = train_test_split_fed(label, test_size=0.2, random_state=94)

# Training hyper-parameters; for more detail, see the Xgb API doc.
params = {
    "num_boost_round": 5,
    "max_depth": 5,
    "learning_rate": 0.1,
    "sketch_eps": 0.08,
    "objective": "logistic",
    "reg_lambda": 0.1,
    "subsample": 1,
    "colsample_by_tree": 1,
    "base_score": 0.5,
}

# Build the ss-XGB trainer on the SPU device and fit it on the new training split.
xgb = Xgb(spu_device)
new_model = xgb.train(params, X_train_fed, y_train_fed)

2024-08-08 05:32:39.235 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-08-08 05:32:39.237 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.
2024-08-08 05:32:40.672 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-08-08 05:32:42.053 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:32:42.054 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:32:42.084 INFO booster.py:167 [bob] -- [Anonymous_job] fragment_count 1
2024-08-08 05:32:42.148 INFO booster.py:185 [bob] -- [Anonymous_job] prepare time 0.133591890335083s
2

2024-08-08 05:32:39.239 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-08-08 05:32:39.331 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party bob.
2024-08-08 05:32:40.678 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.data.core.agent.PartitionAgent'> with party alice.
2024-08-08 05:32:42.051 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:32:42.083 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:32:42.084 INFO booster.py:167 [alice] -- [Anonymous_job] fragment_count 1
2024-08-08 05:32:42.151 INFO booster.py:185 [alice] -- [Anonymous_job] prepare time 0.1418

In [15]:
# The labels and predictions are revealed so that the AUC of the second
# model can be computed in cleartext.
train_auc = roc_auc_score(
    reveal(y_train_fed.partitions[alice].data),
    reveal(new_model.predict(X_train_fed)),
)
test_auc = roc_auc_score(
    reveal(y_test_fed.partitions[alice].data),
    reveal(new_model.predict(X_test_fed)),
)
print("train set AUC score: ", train_auc, "test set AUC score: ", test_auc)

# Seconds elapsed since `start` was last set; shown as the cell's output.
time_elapse = time.perf_counter() - start
time_elapse

2024-08-08 05:33:44.871 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:33:44.872 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:33:47.875 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:33:47.876 INFO proxy.py:180 [bob] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.


train set AUC score:  0.9026830677692621 test set AUC score:  0.8923910162748845


70.34834145300556

2024-08-08 05:33:44.875 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:33:44.903 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.
2024-08-08 05:33:47.877 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party alice.
2024-08-08 05:33:47.903 INFO proxy.py:180 [alice] -- [Anonymous_job] Create proxy actor <class 'secretflow.ml.boost.ss_xgb_v.core.tree_worker.XgbTreeWorker'> with party bob.


train set AUC score:  0.9026830677692621 test set AUC score:  0.8923910162748845


70.34707892101142

Questions:

1\. Reproduce this task using ss-XGB.

2\. How much gain does ss-XGB achieve?

3\. What if Alice has 15 features while Bob has only 1? Can SGB and ss-XGB outperform Alice's single-party XGB model?

Ans:

1\. done

2. 

```
train:0.9027049742255001

test:0.8924150097847268
```

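3\. No: Alice's single-party XGB model performs better. Note that the single-party result below reports `num_trees: 20`, whereas the ss-XGB runs in this notebook use `num_boost_round: 5`, so the comparison may not be like-for-like.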

(1) ss-XGB elapsed time: more than 76 s (the runs recorded in this notebook took about 70–72 s)

train set AUC score:0.9026768922703838 test set AUC score: 0.892436806298487

(2) SGB elapsed time: 37.186125394000555 s (taken from the notebook "SGB 练习——联合建模有数据价值吗？.ipynb")

train set AUC score:  0.9015009566774648 test set AUC score:  0.8932670395547175

(3) Single-party XGB total time: 0.3238701200025389 s (taken from the notebook "SGB alice own game.ipynb")

train set AUC score: 0.9281303699212398 test set AUC score: 0.9116829545018299 num_trees: 20