In [1]:
import os

import dill
import numpy as np
import torch
from scipy.io import loadmat
from scipy.sparse import issparse
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader

import VFL
from Data import VFLDataset
from SFFS import get_f_stat_index
# Directory (relative to the working directory) that holds the .mat dataset
# files; it is joined with the file name when the data is loaded.
DIR = "Data"

In [249]:
# Load the COIL20 .mat file, min-max scale the features, and build the
# VFLDataset plus the train/validation/test loaders used by the experiments.

# Seed the NumPy and PyTorch global RNGs so repeated runs start from the same
# random state.
# NOTE(review): whether VFLDataset's split and the VFL model initialisation
# draw from these global generators is not visible here — confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

file_name = 'COIL20.mat'
mat = loadmat(os.path.join(DIR, file_name))
X = mat["X"]
y = mat["Y"]
if issparse(X):
    # toarray() returns a plain ndarray. todense() returns np.matrix, which
    # recent scikit-learn input validation refuses to accept.
    X = X.toarray()
y = y.flatten()
print(file_name, X.shape, y.shape)

# Shift the labels down by one so they start at 0 (assumes the file stores
# 1-based class ids — TODO confirm). CrossEntropyLoss expects class indices
# in the range [0, number_of_classes).
y = y - 1

# NOTE(review): the scaler is fit on every sample, i.e. before the
# train/test split performed by VFLDataset(test_size=0.3).
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

dataset = VFLDataset(
    data_source=(X, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.3,
)
train_loader = DataLoader(dataset.train(), batch_size=128, shuffle=False)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)

input_dim_list = dataset.get_input_dim_list()
# Number of distinct labels; used as the output width of the top model.
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()
print(output_dim)

COIL20.mat (1440, 1024) (1440,)
Client 0: Feature Index 0-341
Client 1: Feature Index 342-682
Server : Feature Index 683-1023
20


FNN

In [210]:
%%capture
# FNN run: build `models` and `top_model` with type='FNN', then train them on
# the loaders created in the data-loading cell. The history returned by
# VFL.train is kept so the next cell can display its last rows.
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    output_dim=output_dim,
    type='FNN',
    emb_dim=8,
    hidden_dims=[32, 16],
    activation='relu',
)
fnn_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=50,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    # NOTE(review): 10000 is far above the epoch count — presumably this keeps
    # mask saving from triggering; confirm against VFL.train.
    save_mask_at=10000,
)

In [211]:
# Display the last five entries of the history returned by VFL.train for the FNN run.
fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
45,0.635173,0.966797,0.960648,0.960648
46,0.600534,0.966797,0.965278,0.965278
47,0.568905,0.96875,0.965278,0.965278
48,0.539744,0.96875,0.965278,0.965278
49,0.512747,0.96875,0.965278,0.965278


STG

In [212]:
%%capture
# STG run: same layer sizes and training settings as the FNN cell, but the
# models are built with type='STG' and lam=0.1. No `mus` argument is passed
# here, so make_binary_models falls back to whatever default it defines.
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    output_dim=output_dim,
    type='STG',
    emb_dim=8,
    hidden_dims=[32, 16],
    activation='relu',
    lam=0.1,
)
stg_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=50,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    # NOTE(review): 10000 is far above the epoch count — presumably this keeps
    # mask saving from triggering; confirm against VFL.train.
    save_mask_at=10000,
)

In [213]:
# Display the last five entries of the history returned by VFL.train for the STG run.
stg_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
45,1.318367,0.910807,0.886574,0.93287,1019
46,0.995346,0.945312,0.949074,0.916667,1019
47,1.728826,0.857422,0.784722,0.895833,1019
48,2.939314,0.85026,0.789352,0.775463,1021
49,2.527475,0.828125,0.875,0.916667,1021


STG with GINI Initialization

In [214]:
%%capture
# STG run with `mus` derived from the dataset's GINI filter.
# `gini_labels` and `feat_idx_list` stay as notebook-level names because the
# DualSTG cells further down reuse them.
gini_labels = dataset.gini_filter(0.5)
feat_idx_list = dataset.get_feature_index_list()
mus = VFL.initialize_mu(gini_labels, feat_idx_list)

models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    output_dim=output_dim,
    type='STG',
    emb_dim=8,
    hidden_dims=[32, 16],
    activation='relu',
    lam=0.1,
    mus=mus,
)
stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=50,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    # NOTE(review): 10000 is far above the epoch count — presumably this keeps
    # mask saving from triggering; confirm against VFL.train.
    save_mask_at=10000,
)

In [215]:
# Display the last five entries of the history returned by VFL.train for the STG+GINI run.
stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
45,0.827643,0.945312,0.939815,0.960648,762
46,1.04558,0.924479,0.895833,0.900463,762
47,1.581529,0.874349,0.895833,0.921296,762
48,1.290623,0.916667,0.884259,0.759259,759
49,1.002702,0.932292,0.981481,0.976852,759


Dual STG with GINI Initialization

In [262]:
%%capture
# DualSTG run (50 epochs). `mus` is rebuilt from `gini_labels` and
# `feat_idx_list`, both of which are defined in the STG+GINI cell, so that
# cell must have been executed first.
mus = VFL.initialize_mu(gini_labels, feat_idx_list)

models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list, output_dim=output_dim,
    type="DualSTG", emb_dim=8, hidden_dims=[32, 16], activation="relu",
    mus=mus, lam=0.1, top_lam=0.1,
)
dual_stg_gini_history = VFL.train(
    models, top_model,
    train_loader, val_loader, test_loader,
    epochs=50, optimizer='Adam', criterion=criterion, verbose=True,
    # NOTE(review): 100000 is far above the epoch count — presumably this
    # keeps mask saving from triggering; confirm against VFL.train.
    save_mask_at=100000,
    # NOTE(review): presumably 0 means the top model is never frozen; confirm.
    freeze_top_till=0,
)

In [263]:
# Display the last five entries of the history returned by VFL.train for the 50-epoch DualSTG run.
dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
45,3.002106,0.763672,0.791667,0.768519,704,23
46,3.072714,0.779297,0.733796,0.766204,703,23
47,2.832458,0.803385,0.747685,0.831019,703,23
48,4.018395,0.65625,0.675926,0.768519,706,23
49,3.032515,0.766276,0.842593,0.849537,712,23


Dual STG with GINI Initialization (longer training: 100 epochs)

In [250]:
%%capture
# DualSTG run with the same settings as the 50-epoch DualSTG cell except
# epochs=100. `gini_labels` and `feat_idx_list` come from the STG+GINI cell,
# so that cell must have been executed first.
mus = VFL.initialize_mu(gini_labels, feat_idx_list)

models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list, output_dim=output_dim,
    type="DualSTG", emb_dim=8, hidden_dims=[32, 16], activation="relu",
    mus=mus, lam=0.1, top_lam=0.1,
)
longer_dual_stg_gini_history = VFL.train(
    models, top_model,
    train_loader, val_loader, test_loader,
    epochs=100, optimizer='Adam', criterion=criterion, verbose=True,
    # NOTE(review): 100000 is far above the epoch count — presumably this
    # keeps mask saving from triggering; confirm against VFL.train.
    save_mask_at=100000,
    # NOTE(review): presumably 0 means the top model is never frozen; confirm.
    freeze_top_till=0,
)

In [251]:
# Display the last five entries of the history returned by VFL.train for the 100-epoch DualSTG run.
longer_dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
95,1.527802,0.91276,0.953704,0.951389,731,24
96,1.282386,0.9375,0.893519,0.939815,731,24
97,1.188305,0.936198,0.939815,0.956019,729,24
98,1.423941,0.927083,0.981481,0.902778,730,24
99,1.0228,0.963542,0.958333,0.974537,731,24


SFFS Filtered (0.5)

In [239]:
# Compute the SFFS feature index for the scaled data. `get_f_stat_index` is
# imported in the first cell together with the other dependencies.
# The next cell keeps the leading half of `index`, so the result is presumably
# ordered best-first — TODO confirm against SFFS.get_f_stat_index.
# NOTE(review): the index is computed from every sample in X and y, i.e.
# before any train/test split, so held-out samples influence feature choice.
index = get_f_stat_index(X, y)

total computation time for pinv is: 0.1934821605682373


In [240]:
# Keep the columns named by the first half of the SFFS index (the 0.5 ratio
# in the section title) and report the resulting shape.
n_selected = len(index) // 2
selected_idx = index[:n_selected]
X_filtered = X[:, selected_idx]
print(X_filtered.shape)

(1440, 512)


In [241]:
# Rebuild the VFL dataset and loaders on the SFFS-filtered feature matrix.
# NOTE: this rebinds `dataset`, the three loaders, `input_dim_list`,
# `output_dim` and `criterion`, replacing the full-feature versions created
# in the data-loading cell. Re-run that cell before re-running any of the
# full-feature experiments.
dataset = VFLDataset(
    data_source=(X_filtered, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.3,
)

# NOTE(review): the training batch size is 256 here versus 128 in the
# full-feature setup, so the two FNN runs are not configured identically.
train_loader, val_loader, test_loader = (
    DataLoader(dataset.train(), batch_size=256, shuffle=False),
    DataLoader(dataset.valid(), batch_size=1000, shuffle=False),
    DataLoader(dataset.test(), batch_size=1000, shuffle=False),
)

input_dim_list = dataset.get_input_dim_list()
# Number of distinct labels; used as the output width of the top model.
output_dim = len(np.unique(y))
criterion = torch.nn.CrossEntropyLoss()

Client 0: Feature Index 0-170
Client 1: Feature Index 171-341
Server : Feature Index 342-511


In [246]:
%%capture
# FNN run on the SFFS-filtered features, using the loaders rebuilt in the
# previous cell.
# NOTE(review): this run trains for 40 epochs, whereas the full-feature FNN
# run uses 50, so the two results are not directly comparable.
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    output_dim=output_dim,
    type='FNN',
    emb_dim=8,
    hidden_dims=[32, 16],
    activation='relu',
)
sffs_fnn_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=40,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    # NOTE(review): 10000 is far above the epoch count — presumably this keeps
    # mask saving from triggering; confirm against VFL.train.
    save_mask_at=10000,
)

In [247]:
# Display the last five entries of the history returned by VFL.train for the SFFS-filtered FNN run.
sffs_fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
35,3.171622,0.580966,0.608796,0.608796
36,3.002694,0.641158,0.671296,0.671296
37,2.837856,0.675249,0.673611,0.673611
38,2.678243,0.681996,0.673611,0.673611
39,2.524619,0.68022,0.673611,0.673611


---
# Summary

| Model                 | # Features | Test Acc | Ratio Embedding |
|-----------------------|------------|----------|-----------------|
| FNN                   | 1024       | 0.965278 | 1               |
| STG                   | 1021       | 0.916667 | 1               |
| STG+GINI              | 759        | 0.9769   | 1               |
| DualSTG+GINI          | 712        | 0.8495   | 0.958 (23/24)   |
| DualSTG+GINI (double) | 731        | 0.9745   | 1               |
| SFFS->FNN             | 512        | 0.6736   | 1               |

In [264]:
# Save the whole interpreter session (every notebook-level variable) to disk.
# `dill` is imported in the first cell together with the other dependencies,
# so a missing package fails before training rather than after it.
# NOTE: session files are pickle-based; only load files from a trusted source.
dill.dump_session('COIL20DataExperiments.db')