In [60]:
from scipy.io import loadmat
from scipy.sparse import issparse
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from Data import VFLDataset
from torch.utils.data import DataLoader
import VFL
import torch
import os
# Directory that holds the .mat dataset files; joined with the file name when loading.
DIR = "Data"

In [61]:
# Load the PCMAC dataset and build the dataset object, loaders and loss that the
# experiment cells below read as notebook globals.
file_name = 'PCMAC.mat'
mat = loadmat(os.path.join(DIR, file_name))
X = mat["X"]
y = mat["Y"]
if issparse(X):
    # toarray() returns a plain ndarray. todense() would return np.matrix, which
    # NumPy no longer recommends and which is a different type from what the
    # already-dense path hands to the code below.
    X = X.toarray()
y = y.flatten()
print(file_name, X.shape, y.shape)
# Relabel 1 -> 0 and 2 -> 1 so the targets are zero-based class indices, as
# torch.nn.CrossEntropyLoss expects. Order matters: relabelling 2 first would
# collapse both classes into 0.
y[y == 1] = 0
y[y == 2] = 1
dataset = VFLDataset(
    data_source=(X, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.2,
)
# NOTE(review): these loaders use shuffle=False, while the loaders built later on
# the SFFS-filtered data use shuffle=True; confirm which setting is intended so
# the runs compared in the summary are trained the same way.
train_loader = DataLoader(dataset.train(), batch_size=256, shuffle=False)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=False)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=False)
input_dim_list = dataset.get_input_dim_list()
# Number of distinct labels present in y; passed as output_dim to the model builders.
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()

PCMAC.mat (1943, 3289) (1943,)
Client 0: Feature Index 0-1096
Client 1: Feature Index 1097-2192
Server : Feature Index 2193-3288


FNN

In [18]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu')
fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [19]:
# Show the last five rows of the FNN training history.
# NOTE(review): in the recorded output val_acc and test_acc are identical in every
# row shown; confirm VFL.train is not scoring the same split twice for this model type.
fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
25,0.911087,0.980469,0.976864,0.976864
26,0.849706,0.982422,0.979434,0.979434
27,0.785553,0.984375,0.982005,0.982005
28,0.719302,0.986328,0.987147,0.987147
29,0.651973,0.990234,0.989717,0.989717


STG

In [62]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu', lam=0.1)
stg_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [63]:
# Show the last five rows of the STG training history.
stg_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
25,0.13124,0.998047,0.989717,0.997429,2450
26,0.142057,0.996241,0.992288,0.989717,2338
27,0.137647,0.996094,0.984576,0.994859,2267
28,0.139888,0.998047,0.984576,0.984576,2215
29,0.208198,0.980909,0.989717,0.997429,2162


STG with GINI Initialization

In [64]:
%%capture
gini_labels = dataset.gini_filter(0.5)
feat_idx_list = dataset.get_feature_index_list()
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu', lam=0.1, mus=mus)
stg_gini_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [65]:
# Show the last five rows of the STG + GINI-initialisation training history.
stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
25,0.247735,0.990528,0.992288,0.994859,2029
26,0.128277,0.998047,0.987147,1.0,2024
27,0.136712,0.996094,0.976864,0.994859,2016
28,0.12449,0.998047,0.997429,0.992288,2003
29,0.167877,0.990234,0.992288,1.0,1998


Dual STG with GINI Initialization

In [68]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=4,
    output_dim=output_dim,
    hidden_dims=[16, 8],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=30,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [69]:
# Show the last five rows of the DualSTG + GINI training history (30 epochs).
dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
25,0.358878,0.972803,0.984576,0.997429,2019,12
26,0.251999,1.0,0.969152,0.979434,2015,12
27,0.298798,0.992188,0.997429,0.989717,2013,12
28,0.276605,0.996094,0.984576,0.974293,2003,12
29,0.430902,0.977444,0.997429,0.994859,2000,12


Dual STG with GINI Initialization (longer training, 60 epochs)

In [74]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=4,
    output_dim=output_dim,
    hidden_dims=[16, 8],
    activation="relu",
    mus=mus, top_lam=0.1, lam=0.1)
longer_dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=60,
 
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [75]:
# Show the last five rows of the 60-epoch DualSTG + GINI training history.
longer_dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
55,0.34142,0.988722,1.0,0.989717,1521,9
56,0.287515,0.990234,0.994859,0.994859,1509,9
57,0.24225,0.996241,0.984576,0.992288,1501,9
58,0.257578,0.990381,0.992288,0.997429,1493,8
59,0.23561,1.0,0.989717,1.0,1484,8


SFFS Filtered (0.5)

In [76]:
# Compute a feature index array with the project's SFFS helper; the next cell keeps
# the columns named by its first half (presumably ordered best-first by F-statistic;
# confirm in SFFS). The recorded run reports about 9.4 s for the pinv step.
# NOTE(review): the recorded output also echoes the line
# `f_statistics[j] = theta_param[j] ** 2 / diag_x[j]`, which looks like a NumPy runtime
# warning (possibly a zero diag_x entry); verify the resulting ranking is still valid.
# NOTE(review): X and y here are the full arrays, before the train/test split made when
# the filtered VFLDataset is built below, so held-out samples influence which features
# are kept.
from SFFS import get_f_stat_index
index = get_f_stat_index(X, y)

total computation time for pinv is: 9.41080641746521


  f_statistics[j] = theta_param[j] ** 2 / diag_x[j]


In [77]:
# Keep only the columns named by the first half of `index` (0.5 is the kept fraction;
# int() truncates, so an odd feature count rounds down).
X_filtered = X[:, index[: int(len(index) * 0.5)]]
print(X_filtered.shape)

(1943, 1644)


In [78]:
# Rebuild the dataset, loaders and model dimensions on the SFFS-filtered features.
# This rebinds dataset, the three loaders, input_dim_list, output_dim and criterion,
# so any earlier experiment cell re-run after this point will use the filtered data.
dataset = VFLDataset(
    data_source=(X_filtered, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.2,
)
train_loader = DataLoader(dataset.train(), shuffle=True, batch_size=256)
val_loader = DataLoader(dataset.valid(), shuffle=True, batch_size=1000)
test_loader = DataLoader(dataset.test(), shuffle=True, batch_size=1000)
input_dim_list = dataset.get_input_dim_list()
# Number of distinct labels in y.
output_dim = len(np.unique(y))
criterion = torch.nn.CrossEntropyLoss()

Client 0: Feature Index 0-547
Client 1: Feature Index 548-1095
Server : Feature Index 1096-1643


In [99]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=4,
                            output_dim=output_dim, hidden_dims=[16, 8],
                            activation='relu')
sffs_fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=30,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [100]:
# Show the last five rows of the SFFS-filtered FNN training history.
# NOTE(review): as in the unfiltered FNN run, the recorded val_acc and test_acc are
# identical in every row shown; confirm VFL.train is not scoring the same split twice.
sffs_fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
25,1.253918,0.697133,0.727506,0.727506
26,1.233452,0.72433,0.760925,0.760925
27,1.215437,0.753334,0.799486,0.799486
28,1.186896,0.811824,0.838046,0.838046
29,1.159974,0.839168,0.856041,0.856041


---
# Summary

| Model                 | # Features | Test Acc | Ratio Embedding |
|-----------------------|------------|----------|-----------------|
| FNN                   | 3289       | 0.9897   | 1               |
| STG                   | 2162       | 0.9974   | 1               |
| STG+GINI              | 1998       | 1        | 1               |
| DualSTG+GINI          | 2000       | 0.9948   | 1               |
| DualSTG+GINI (double) | 1484       | 1        | 0.667           |
| SFFS->FNN             | 1644       | 0.8560   | 1               |

In [101]:
# Persist the whole interpreter session to disk so the experiment variables can be
# restored later without retraining.
# NOTE: dill files are pickle-based; loading one can execute arbitrary code, so only
# restore session files from a trusted source.
import dill
dill.dump_session('PCMACDataExperiments.db')