In [110]:
from scipy.io import loadmat
from scipy.sparse import issparse
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from Data import VFLDataset
from torch.utils.data import DataLoader
import VFL
import torch
import os
# Directory that is joined with the dataset file name when the .mat file is loaded.
DIR = "Data"

In [111]:
# Load BASEHOCK, remap its labels {1, 2} -> {0, 1}, min-max scale the features
# and build the dataset/loaders shared by the experiment cells below.

# Seed NumPy and PyTorch so loader shuffling and weight initialisation repeat
# on Restart & Run All.
# NOTE(review): whether VFLDataset's split draws from these global generators
# is not visible here -- confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

file_name = 'BASEHOCK.mat'
mat = loadmat(os.path.join(DIR, file_name))
X = mat["X"]
y = mat["Y"]
if issparse(X):
    # toarray() returns a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn versions refuse as estimator input.
    X = X.toarray()
y = y.flatten()
print(file_name, X.shape, y.shape)

# Labels are stored as 1/2; shift them to 0/1. The order matters: 1 -> 0 must
# run before 2 -> 1, otherwise every sample would end up in class 0.
y[y == 1] = 0
y[y == 2] = 1

# NOTE(review): the scaler is fitted on all rows, including those that end up
# in the validation/test splits -- confirm this is acceptable.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

dataset = VFLDataset(
    data_source=(X, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.2,
)

# Only the training loader needs shuffling; evaluation metrics do not depend
# on sample order, and skipping the shuffle avoids consuming RNG state.
# NOTE(review): the FNN histories in this notebook show identical val_acc and
# test_acc at every displayed epoch -- confirm dataset.valid() and
# dataset.test() are disjoint splits.
train_loader = DataLoader(dataset.train(), batch_size=256, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)

input_dim_list = dataset.get_input_dim_list()
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()

BASEHOCK.mat (1993, 4862) (1993,)
Client 0: Feature Index 0-1620
Client 1: Feature Index 1621-3241
Server : Feature Index 3242-4861


FNN (baseline)

In [122]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu')
fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40, optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [123]:
# Display the last five rows of the FNN training history.
fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
35,0.264346,0.991054,0.994987,0.994987
36,0.223477,0.99455,0.997494,0.997494
37,0.195091,0.998047,0.997494,0.997494
38,0.166716,0.996503,0.997494,0.997494
39,0.142199,0.996503,0.997494,0.997494


STG

In [131]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu', lam=0.1)
stg_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [132]:
# Display the last five rows of the STG training history.
stg_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
35,0.127541,0.998047,1.0,0.994987,2551
36,0.105532,1.0,1.0,0.997494,2498
37,0.10439,1.0,0.997494,1.0,2440
38,0.104029,1.0,0.997494,0.994987,2383
39,0.168985,0.989101,0.997494,0.992481,2345


STG with GINI Initialization

In [133]:
%%capture
gini_labels = dataset.gini_filter(0.5)
feat_idx_list = dataset.get_feature_index_list()
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu', lam=0.1, mus=mus)
stg_gini_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [134]:
# Display the last five rows of the STG + GINI training history.
stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
35,0.115014,0.996503,0.997494,0.997494,2686
36,0.104383,1.0,0.997494,0.997494,2679
37,0.102884,1.0,0.994987,1.0,2675
38,0.106704,0.998047,0.994987,0.997494,2667
39,0.109761,0.998047,0.994987,1.0,2664


Dual STG with GINI Initialization

In [148]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=8,
    output_dim=output_dim,
    hidden_dims=[16, 8],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=40,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [149]:
# Display the last five rows of the DualSTG + GINI training history.
dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
35,0.232342,1.0,0.997494,1.0,2725,21
36,0.230228,1.0,0.992481,1.0,2718,21
37,0.231067,1.0,1.0,0.994987,2712,21
38,0.233216,0.998047,1.0,1.0,2707,21
39,0.224875,1.0,0.994987,1.0,2703,21


Dual STG with GINI Initialization (longer training: 80 epochs)

In [152]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=8,
    output_dim=output_dim,
    hidden_dims=[16, 8],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
longer_dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=80,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [153]:
# Display the last five rows of the 80-epoch DualSTG + GINI training history.
longer_dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
75,0.186306,1.0,1.0,0.997494,1524,14
76,0.300532,0.99455,0.997494,0.997494,1515,14
77,0.182978,1.0,1.0,1.0,1498,14
78,0.186815,1.0,0.997494,1.0,1491,14
79,0.228558,0.993007,1.0,1.0,1481,13


SFFS Filtered (0.5)

In [159]:
# NOTE(review): this import sits mid-notebook; notebook convention is to keep
# all imports in the first cell.
from SFFS import get_f_stat_index
# Feature index computed from the full, scaled X and the remapped y.
# NOTE(review): these arrays include the rows that VFLDataset later holds out
# for validation/test, so the selection may leak label information into the
# evaluation -- confirm this is intended. Presumably the index is ordered by
# F-statistic; verify against SFFS.get_f_stat_index.
index = get_f_stat_index(X, y)

total computation time for pinv is: 31.09385585784912


In [161]:
# Keep the columns named by the first half of `index` (the 0.5 portion).
selected_columns = index[: len(index) // 2]
X_filtered = X[:, selected_columns]
print(X_filtered.shape)

(1993, 2431)


In [167]:
# Rebuild the dataset and loaders on the SFFS-filtered feature matrix.
# NOTE: this overwrites dataset, train_loader, val_loader, test_loader and
# input_dim_list from the full-feature section; re-run the first data cell
# before re-running any of the earlier experiment cells.
dataset = VFLDataset(
    data_source=(X_filtered, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.2,
)

# Only the training loader needs shuffling; evaluation metrics do not depend
# on sample order, and skipping the shuffle avoids consuming RNG state.
train_loader = DataLoader(dataset.train(), batch_size=256, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)

input_dim_list = dataset.get_input_dim_list()
# y is unchanged by the feature filter, so these match the earlier values;
# they are recomputed to keep this cell self-contained.
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()

Client 0: Feature Index 0-810
Client 1: Feature Index 811-1620
Server : Feature Index 1621-2430


In [168]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu')
sffs_fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [169]:
# Display the last five rows of the SFFS-filtered FNN training history.
sffs_fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
35,0.576431,0.981288,0.984962,0.984962
36,0.514336,0.990644,0.992481,0.992481
37,0.45934,0.991054,0.992481,0.992481
38,0.407137,0.992597,0.994987,0.994987
39,0.350671,0.996094,0.994987,0.994987


---
# Summary

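| Model                    | # Features | Test Acc | Ratio Embedding |
|--------------------------|------------|----------|-----------------|
| FNN                      | 4862       | 0.9974   | 1               |
| STG                      | 2345       | 0.9924   | 1               |
| STG+GINI                 | 2664       | __1__    | 1               |
| DualSTG+GINI             | 2703       | __1__    | __0.875__       |
| DualSTG+GINI (80 epochs) | 1481       | __1__    | __0.5417__      |
| SFFS->FNN                | 2431       | 0.9949   | 1               |

All values are taken from the final epoch of each run; test accuracies are truncated to four decimals. Ratio Embedding is `num_emb / 24` (e.g. 21/24 = 0.875 and 13/24 ≈ 0.5417), where 24 is presumably 3 parties × `emb_dim` = 8. Every reported accuracy is a multiple of 1/399, so the test accuracies in this table differ by at most three test samples.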

In [None]:
# Save the current interpreter session to 'BaseHockDataExperiments.db' with dill.
# NOTE(review): the file is a pickle; only reload it from a trusted source.
import dill
dill.dump_session('BaseHockDataExperiments.db')