In [181]:
import os

import dill
import numpy as np
import torch
from scipy.io import loadmat
from scipy.sparse import issparse
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader

import VFL
from Data import VFLDataset
from SFFS import get_f_stat_index
# Folder that holds the .mat dataset files; joined with the file name when loading.
DIR = "Data"

In [322]:
# Load the ISOLET .mat file, then build the VFL dataset, the three loaders and
# the shared model hyper-parameters (input dims, number of classes, loss).

# Seed the global NumPy / PyTorch generators so a Restart & Run All is repeatable.
# This only covers randomness that VFLDataset / VFL draw from these generators.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

file_name = 'isolet.mat'
mat = loadmat(os.path.join(DIR, file_name))
X = mat["X"]
y = mat["Y"]
if issparse(X):
    # toarray() returns a plain ndarray; todense() would return the legacy
    # np.matrix type, which stays 2-D under indexing.
    X = X.toarray()
y = y.flatten()
print(file_name, X.shape, y.shape)
# Shift the labels down by one (presumably 1-based in the file -> 0-based class
# indices for CrossEntropyLoss; confirm against the data).
y = y-1
dataset = VFLDataset(data_source=(X, y), 
                    num_clients=2,
                    gini_portion=None,
                    insert_noise=False,
                    test_size=0.3)
# NOTE(review): the training loader is not shuffled here, whereas the loaders
# built for the SFFS-filtered data further down use shuffle=True.
train_loader = DataLoader(dataset.train(), batch_size=512, shuffle=False)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)
input_dim_list = dataset.get_input_dim_list()
# One output unit per distinct label value.
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()

isolet.mat (1560, 617) (1560,)
Client 0: Feature Index 0-205
Client 1: Feature Index 206-411
Server : Feature Index 412-616


FNN

In [327]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=16,
                            output_dim=output_dim, hidden_dims=[64, 32],
                            activation='relu')
fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=100,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [328]:
# Last five epochs of the FNN run.
fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
95,0.095146,0.99359,0.99359,0.99359
96,0.089868,0.99359,0.99359,0.99359
97,0.084906,0.99359,0.99359,0.99359
98,0.080228,0.99359,0.995726,0.995726
99,0.075823,0.995726,0.995726,0.995726


STG

In [323]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=16,
                            output_dim=output_dim, hidden_dims=[64, 32],
                            activation='relu', lam=0.1)
stg_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=100, optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion, freeze_btm_till=0)

In [324]:
# Last five epochs of the STG run.
stg_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
95,0.30849,0.905983,0.905983,0.848291,617
96,0.361167,0.888889,0.923077,0.867521,617
97,0.505015,0.82906,0.895299,0.92735,617
98,0.401509,0.886752,0.856838,0.871795,617
99,0.456985,0.854701,0.905983,0.893162,617


STG with GINI Initialization

In [348]:
%%capture
gini_labels = dataset.gini_filter(0.5)
feat_idx_list = dataset.get_feature_index_list()
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=16,
                            output_dim=output_dim, hidden_dims=[64, 32],
                            activation='relu', lam=0.1, mus=mus)
stg_gini_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=100,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [349]:
# Last five epochs of the STG + GINI-initialisation run.
stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
95,0.290459,0.91453,0.897436,0.83547,529
96,0.366227,0.876068,0.878205,0.882479,530
97,0.239084,0.92735,0.92735,0.931624,531
98,0.268014,0.923077,0.916667,0.931624,531
99,0.257775,0.929487,0.848291,0.929487,532


Dual STG with GINI Initialization

In [295]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=16,
    output_dim=output_dim,
    hidden_dims=[64, 32],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=100,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000)

In [296]:
# Last five epochs of the 100-epoch DualSTG + GINI run.
dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
95,0.984781,0.651709,0.641026,0.722222,482,48
96,0.896572,0.694444,0.683761,0.720085,482,48
97,0.757438,0.737179,0.619658,0.754274,483,48
98,0.794914,0.760684,0.726496,0.769231,480,48
99,0.928594,0.685897,0.784188,0.754274,481,48


Dual STG with GINI Initialization, longer training (300 epochs)

In [350]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=16,
    output_dim=output_dim,
    hidden_dims=[64, 32],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
longer_dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=300,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [351]:
# Last five epochs of the 300-epoch DualSTG + GINI run.
longer_dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
295,0.256805,0.963675,0.916667,0.931624,491,48
296,0.29173,0.942308,0.925214,0.878205,492,48
297,0.307084,0.940171,0.91453,0.92094,492,48
298,0.425683,0.897436,0.957265,0.903846,492,48
299,0.383909,0.916667,0.965812,0.965812,492,48


SFFS Filtered (0.5)

In [303]:
# Compute the SFFS feature index (get_f_stat_index is imported in the top import cell).
# NOTE(review): this uses the full X / y, i.e. it runs before VFLDataset makes the
# train / validation / test split below; confirm that selecting features on all
# rows is intended.
index = get_f_stat_index(X, y)

total computation time for pinv is: 0.055849552154541016


In [304]:
# Keep the columns named by the first half of `index`, in the order it was returned.
n_keep = int(0.5*len(index))
kept_columns = index[:n_keep]
X_filtered = X[:, kept_columns]
print(X_filtered.shape)

(1560, 308)


In [305]:
# Rebuild the dataset and loaders on the SFFS-filtered feature matrix.
# NOTE: this rebinds dataset / *_loader / input_dim_list, so the cells above must
# not be re-run after this one without first re-running the original data cell.
dataset = VFLDataset(data_source=(X_filtered, y), 
                    num_clients=2,
                    gini_portion=None,
                    insert_noise=False,
                    test_size=0.3)
train_loader = DataLoader(dataset.train(), batch_size=512, shuffle=True)
# Evaluation loaders gain nothing from shuffling; keep them in a fixed order,
# as the validation / test loaders for the full feature set are.
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)
input_dim_list = dataset.get_input_dim_list()
# One output unit per distinct label value.
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()

Client 0: Feature Index 0-102
Client 1: Feature Index 103-205
Server : Feature Index 206-307


In [306]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=16,
                            output_dim=output_dim, hidden_dims=[64, 32],
                            activation='relu')
sffs_fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=100,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [307]:
# Last five epochs of the FNN run on the SFFS-filtered features.
sffs_fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
95,0.340682,0.918803,0.923077,0.923077
96,0.328053,0.923077,0.92735,0.92735
97,0.315901,0.92735,0.929487,0.929487
98,0.304248,0.929487,0.931624,0.931624
99,0.293004,0.931624,0.933761,0.933761


---
# Summary

| Model                     | # Features | Test Acc | Ratio Embedding |
|---------------------------|------------|----------|-----------------|
| FNN                       | 617        | 0.9957   | 1               |
| STG                       | 617        | 0.8932   | 1               |
| STG+GINI                  | 532        | 0.9295   | 1               |
| DualSTG+GINI              | 481        | 0.7543   | 1               |
| DualSTG+GINI (300 epochs) | 492        | 0.9658   | 1               |
| SFFS->FNN                 | 308        | 0.9338   | 1               |

Values are taken from the final-epoch row of each history table shown above.

In [353]:
# Save the current interpreter session to disk so the results can be reloaded later
# (dill is imported in the top import cell).
dill.dump_session('IsoletDataExperiments.db')