In [1]:
from scipy.io import loadmat
from scipy.sparse import issparse
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from Data import VFLDataset
from torch.utils.data import DataLoader
import VFL
import torch
import os
# --- Notebook configuration -------------------------------------------------
DIR = "Data"                # directory that holds the .mat dataset files
file_name = 'BASEHOCK.mat'  # dataset analysed by this notebook

# Seed the global NumPy and PyTorch RNGs so that a "Restart & Run All"
# reproduces the same results.
# NOTE(review): presumably the train/test split, weight initialisation and
# stochastic-gate sampling draw from these global RNGs — confirm against
# the VFL / Data modules.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
# Load the feature matrix and label vector from the MATLAB file.
mat = loadmat(os.path.join(DIR, file_name))
X = mat["X"]
y = mat["Y"]
if issparse(X):
    # toarray() returns a plain ndarray. todense() would return np.matrix,
    # which recent scikit-learn versions refuse as estimator input.
    X = X.toarray()
y = y.flatten()
print(file_name, X.shape, y.shape)

# Re-encode the labels {1, 2} -> {0, 1}. Order matters: 1 -> 0 must happen
# before 2 -> 1, otherwise both classes would collapse to 0.
y[y == 1] = 0
y[y == 2] = 1

# Scale every feature to [0, 1].
# NOTE(review): the scaler is fitted on all rows before VFLDataset splits
# them, so held-out rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Wrap the data for vertical federated learning with two clients; the
# printed output shows the feature index ranges assigned to each party.
dataset = VFLDataset(
    data_source=(X, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.2,
)

# The training loader reshuffles every epoch, matching the loaders built for
# the SFFS-filtered dataset later in the notebook; evaluation loaders keep a
# fixed order.
# NOTE(review): the recorded FNN history shows val_acc == test_acc on every
# displayed epoch — confirm that dataset.valid() and dataset.test() are
# distinct splits.
train_loader = DataLoader(dataset.train(), batch_size=256, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)

input_dim_list = dataset.get_input_dim_list()
output_dim = np.unique(y).size  # number of distinct classes
criterion = torch.nn.CrossEntropyLoss()

BASEHOCK.mat (1993, 4862) (1993,)
Client 0: Feature Index 0-1620
Client 1: Feature Index 1621-3241
Server : Feature Index 3242-4861


FNN

In [29]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[8],
                            activation='relu')
fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [30]:
# Display the last five recorded epochs of the FNN run.
fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
25,0.932573,0.936148,0.942356,0.942356
26,0.893441,0.939644,0.957393,0.957393
27,0.842558,0.952906,0.957393,0.957393
28,0.790051,0.962576,0.967419,0.967419
29,0.736935,0.968436,0.969925,0.969925


STG

In [31]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[8],
                            activation='relu', lam=0.1)
stg_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [32]:
# Display the last five recorded epochs of the STG run.
stg_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
25,0.120966,1.0,1.0,1.0,3852
26,0.119726,1.0,1.0,1.0,3714
27,0.120107,1.0,0.997494,0.997494,3612
28,0.123952,1.0,1.0,1.0,3517
29,0.123778,0.998047,1.0,0.997494,3433


STG with GINI Initialization

In [33]:
%%capture
gini_labels = dataset.gini_filter(0.5)
feat_idx_list = dataset.get_feature_index_list()
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[8],
                            activation='relu', lam=0.1, mus=mus)
stg_gini_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [34]:
# Display the last five recorded epochs of the GINI-initialised STG run.
stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
25,0.138072,0.996503,0.997494,1.0,3042
26,0.121633,0.998047,1.0,1.0,3020
27,0.118509,1.0,0.997494,0.997494,2995
28,0.118073,1.0,1.0,0.997494,2970
29,0.127384,0.996094,1.0,1.0,2952


Dual STG with GINI Initialization

In [36]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=4,
    output_dim=output_dim,
    hidden_dims=[8],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=30,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [37]:
# Display the last five recorded epochs of the 30-epoch DualSTG run.
dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
25,0.293749,0.990234,0.997494,0.989975,3035,11
26,0.301293,0.992597,0.994987,0.994987,3030,11
27,0.260955,1.0,0.997494,0.984962,3018,10
28,0.305416,0.993007,0.997494,1.0,3013,10
29,0.252432,1.0,0.989975,1.0,3000,10


Dual STG with GINI Initialization (longer training, 60 epochs)

In [39]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=4,
    output_dim=output_dim,
    hidden_dims=[8],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
longer_dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=60,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [40]:
# Display the last five recorded epochs of the 60-epoch DualSTG run.
longer_dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
55,0.23965,0.992188,0.997494,1.0,2076,9
56,0.216584,1.0,1.0,1.0,2061,9
57,0.228709,0.998047,0.997494,0.997494,2040,9
58,0.216636,1.0,1.0,1.0,2018,9
59,0.213594,1.0,1.0,1.0,1998,9


SFFS Filtered (0.5)

In [41]:
# Compute a feature index with the project's SFFS helper.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible up front.
from SFFS import get_f_stat_index
# Slow step: the recorded run reports about 30 s for the pinv computation.
# NOTE(review): presumably `index` lists feature columns ordered by
# F-statistic — confirm against SFFS. X and y are the full scaled dataset,
# so held-out rows take part in the feature ranking.
index = get_f_stat_index(X, y)

total computation time for pinv is: 30.316142797470093


In [42]:
# Keep only the columns named by the first half of `index`.
n_keep = int(0.5 * len(index))
selected = index[:n_keep]
X_filtered = X[:, selected]
print(X_filtered.shape)

(1993, 2431)


In [43]:
# Rebuild the VFL dataset and loaders on the SFFS-filtered feature matrix.
# NOTE: this rebinds dataset, the three loaders and input_dim_list, so the
# earlier full-feature experiments cannot be re-run after this cell without
# first re-running the original data-loading cell.
dataset = VFLDataset(
    data_source=(X_filtered, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.2,
)

# Only the training loader reshuffles each epoch. Evaluation loaders keep a
# fixed order, as in the full-feature loader cell; shuffling them adds no
# information to the metrics.
train_loader = DataLoader(dataset.train(), batch_size=256, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)

input_dim_list = dataset.get_input_dim_list()
output_dim = np.unique(y).size  # number of distinct classes
criterion = torch.nn.CrossEntropyLoss()

Client 0: Feature Index 0-810
Client 1: Feature Index 811-1620
Server : Feature Index 1621-2430


In [44]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[8],
                            activation='relu')
sffs_fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [45]:
# Display the last five recorded epochs of the FNN run on SFFS-filtered features.
sffs_fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
25,1.129539,0.807119,0.827068,0.827068
26,1.10195,0.838683,0.85213,0.85213
27,1.069046,0.862434,0.879699,0.879699
28,1.039511,0.877336,0.907268,0.907268
29,0.998699,0.921247,0.927318,0.927318


---
# Summary

| Model                 | # Features | Test Acc | Ratio Embedding |
|-----------------------|------------|----------|-----------------|
| FNN                   | 4862       | 0.9699   | 1               |
| STG                   | 3433       | 0.9974   | 1               |
| STG+GINI              | 2952       | 1        | 1               |
| DualSTG+GINI          | 3000       | 1        | 0.833           |
| DualSTG+GINI (double) | 1998       | 1        | 0.75            |
| SFFS->FNN             | 2431       | 0.9273   | 1               |

In [46]:
import dill

# Persist the whole interpreter session so the experiment results can be
# reloaded later. The file name is derived from the dataset name
# ("BASEHOCK.mat" -> "BASEHOCKDataExperiments.db"); the previous hard-coded
# "RELATHEDataExperiments.db" belonged to a different dataset and would
# overwrite that notebook's saved session.
session_path = os.path.splitext(file_name)[0] + 'DataExperiments.db'
dill.dump_session(session_path)