In [12]:
import os

import dill
import numpy as np
import torch
from scipy.io import loadmat
from scipy.sparse import issparse
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader

import VFL
from Data import VFLDataset
from SFFS import get_f_stat_index
DIR = "Data"

In [13]:
# Load the Gisette .mat file from DIR, map labels to {0, 1}, min-max scale the
# features, and build the VFL dataset plus its train/valid/test loaders.
file_name = 'gisette.mat'
mat = loadmat(os.path.join(DIR, file_name))
X = mat["X"]
y = mat["Y"].flatten()
if issparse(X):
    # toarray() returns a plain ndarray. todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    X = X.toarray()
print(file_name, X.shape, y.shape)

# Remap the -1 label to 0 so the labels can serve as class indices.
y[y == -1] = 0

# NOTE(review): the scaler is fit on all rows before VFLDataset applies
# test_size=0.3, so held-out rows contribute to the min/max statistics.
# Consider fitting on the training split only - confirm how VFLDataset splits.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

dataset = VFLDataset(
    data_source=(X, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.3,
)
train_loader = DataLoader(dataset.train(), batch_size=512, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=True)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=True)

input_dim_list = dataset.get_input_dim_list()
output_dim = np.unique(y).size  # number of distinct labels
criterion = torch.nn.CrossEntropyLoss()

gisette.mat (7000, 5000) (7000,)
Client 0: Feature Index 0-1666
Client 1: Feature Index 1667-3333
Server : Feature Index 3334-4999


FNN

In [32]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[32, 16],
                            activation='relu')
fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40, optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [33]:
# Display the last five rows (final epochs) of the FNN training history.
fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
35,0.003317,1.0,1.0,1.0
36,0.003226,1.0,1.0,1.0
37,0.002888,1.0,1.0,1.0
38,0.002647,1.0,1.0,1.0
39,0.002615,1.0,1.0,1.0


STG

In [38]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[32, 16],
                            activation='relu', lam=0.1)
stg_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [39]:
# Display the last five rows (final epochs) of the STG training history.
stg_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
35,0.320679,0.998437,0.998667,0.998,4085
36,0.401586,0.992969,0.984333,0.984,4078
37,0.319778,0.997266,0.998333,0.99,4073
38,0.326492,0.997266,0.996,0.998333,4057
39,0.303505,0.998828,0.999,0.995333,4048


STG with GINI Initialization

In [41]:
%%capture
gini_labels = dataset.gini_filter(0.5)
feat_idx_list = dataset.get_feature_index_list()
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[32, 16],
                            activation='relu', lam=0.1, mus=mus)
stg_gini_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [42]:
# Display the last five rows (final epochs) of the STG+GINI training history.
stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
35,0.379172,0.989904,0.996667,0.994667,2990
36,0.370927,0.992248,0.996,0.990667,2987
37,0.438286,0.9875,0.999667,0.999,2971
38,0.343667,0.996484,0.994333,0.985,2958
39,0.33279,0.995703,0.999,0.998,2938


Dual STG with GINI Initialization

In [49]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=8,
    output_dim=output_dim,
    hidden_dims=[32, 16],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=40,
    lr=0.01,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [50]:
# Display the last five rows (final epochs) of the DualSTG+GINI training history.
dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
35,0.603382,0.998047,0.997,0.996667,2952,19
36,0.632965,0.994922,0.994667,0.999,2952,18
37,0.604519,0.998047,0.993,0.996333,2948,18
38,0.597055,0.999219,0.995,0.99,2939,18
39,0.650317,0.994531,0.995333,0.996,2930,17


Dual STG with GINI Initialization (longer training)

In [51]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=8,
    output_dim=output_dim,
    hidden_dims=[32, 16],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
longer_dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=80,

    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [52]:
# Display the last five rows (final epochs) of the 80-epoch DualSTG+GINI history.
longer_dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
75,0.566869,0.994531,0.987667,0.998,2673,12
76,0.596418,0.991016,0.990667,0.996667,2670,12
77,0.593962,0.992578,0.989667,0.986333,2664,12
78,0.681796,0.983654,0.999667,0.989,2667,12
79,0.566594,0.993359,0.991333,0.992333,2670,12


SFFS Filtered (0.5)

In [53]:
# Compute the feature index array from get_f_stat_index on the scaled features.
# (get_f_stat_index is imported in the notebook's top import cell.)
# The recorded output reports ~34 s for the pinv step, so this cell is slow.
index = get_f_stat_index(X, y)

total computation time for pinv is: 34.09199905395508


  f_statistics[j] = theta_param[j] ** 2 / diag_x[j]
  f_statistics[j] = theta_param[j] ** 2 / diag_x[j]


In [55]:
# Keep the columns listed in the first half of `index`
# (presumably the highest-ranked features - confirm the ordering returned
# by get_f_stat_index).
num_selected = int(0.5 * len(index))
selected_columns = index[:num_selected]
X_filtered = X[:, selected_columns]
print(X_filtered.shape)

(7000, 2500)


In [56]:
# Rebuild the VFL dataset and loaders on the SFFS-filtered features.
# NOTE: this rebinds dataset, the three loaders and input_dim_list, so cells
# above this point will see the filtered data if re-run afterwards.
dataset = VFLDataset(
    data_source=(X_filtered, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.3,
)

train_loader = DataLoader(dataset.train(), batch_size=512, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=True)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=True)

input_dim_list = dataset.get_input_dim_list()
output_dim = np.unique(y).size  # number of distinct labels
criterion = torch.nn.CrossEntropyLoss()

Client 0: Feature Index 0-833
Client 1: Feature Index 834-1666
Server : Feature Index 1667-2499


In [79]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[32, 16],
                            activation='relu')
sffs_fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [80]:
# Display the last five rows (final epochs) of the SFFS-filtered FNN history.
sffs_fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
35,0.007199,1.0,1.0,1.0
36,0.007477,1.0,1.0,1.0
37,0.006844,1.0,1.0,1.0
38,0.005121,1.0,1.0,1.0
39,0.005115,1.0,1.0,1.0


---
# Summary

| Model                 | # Features | Test Acc | Ratio Embedding |
|-----------------------|------------|----------|-----------------|
| FNN                   | 5000       | 1        | 1               |
| STG                   | 4048       | 0.9953   | 1               |
| STG+GINI              | 2938       | 0.9980   | 1               |
| DualSTG+GINI          | 2930       | 0.9960   | 0.71            |
| DualSTG+GINI (double) | 2670       | 0.9923   | 0.5             |
| SFFS->FNN             | 2500       | 1        | 1               |

In [82]:
# Save the interpreter session to disk so the experiment variables can be
# restored later. (dill is imported in the notebook's top import cell.)
dill.dump_session('GisetteDataExperiments.db')