# Feature Selector using Stochastic Gates

In [1]:
import torch
import numpy as np

In [2]:
from torch import nn
from torch.nn import Parameter
import torch.nn.functional as F

class FeatureSelectorSG(nn.Module):
    def __init__(self, input_size, sigma=.5):
        super(FeatureSelectorSG, self).__init__()
        self.input_size = input_size
        self.sigma = sigma
        self.mu = Parameter(1e-2 * torch.randn(input_size))

    def forward(self, x):
        eps = torch.normal(0, torch.ones_like(self.mu))
        z = self.mu + (self.sigma * eps * self.training)
        gate = F.hardtanh(z, 0, 1)
        return gate * x


    def guassian_cdf(self, x:torch.Tensor) -> torch.Tensor:
        r''' 
            Guassian CDF
            
            Based on: https://stackoverflow.com/questions/809362/how-to-calculate-cumulative-normal-distribution
        '''
        return 0.5 * (1 + torch.erf(x / np.sqrt(2)))

    def regularize(self):
        r'''
            The expected regularization is the is the sum of the probabilities 
            that the gates are are active
        '''

        return torch.sum(self.guassian_cdf(self.mu / self.sigma))

# Experiment

In [3]:
from sklearn import datasets
from torch.utils.data import Dataset, DataLoader

class BreastCancer(Dataset):
    r'''
        Breast Cancer Wisconsin Dataset
    '''
    def __init__(self, normalize=False):
        dataset = datasets.load_breast_cancer()
        self.data = torch.tensor(dataset.data).float()
        self.targets = torch.tensor(dataset.target)
    
        if normalize:
            self.data /= torch.max(self.data, dim=0)[0]

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

    def __len__(self):
        return len(self.data)

In [4]:
import torch
from torch import nn

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        self.model = nn.Sequential(
                FeatureSelectorSG(input_size),
                nn.Linear(input_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, hidden_size//2),
                nn.ReLU(),
                nn.Linear(hidden_size//2, output_size)
        )

    def forward(self, x):
        return self.model(x)

    def regularization(self, reg_factor = 1e-3):
        reg = 0.
        for module in self.modules():
            if isinstance(module, FeatureSelectorSG):
                reg = reg  + (reg_factor * module.regularize())
        return reg


In [5]:
from tqdm import tqdm

def train(model, dataset, batch_size = 128, n_epochs=10):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    criterion = nn.CrossEntropyLoss()
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    reg_factor = 1e-6

    epoch_iterator = tqdm(
            range(n_epochs),
            leave=True,
            unit="epoch",
            postfix={"tls": "%.4f" % 1},
        )

    for _ in epoch_iterator:
        reg_factor = max(reg_factor + 1e-4, 1e-2)
        for idx, (inputs, targets) in enumerate(loader):
            optimizer.zero_grad()

            inputs = inputs.to(device)
            targets = targets.to(device)
            pred = model(inputs)

            loss = criterion(pred, targets) + model.regularization()
            loss.backward()
            optimizer.step()

            if idx % 10 == 0:
                epoch_iterator.set_postfix(tls="%.4f" % loss.item())

    return model

In [6]:
dataset = BreastCancer(normalize=True)

eval_len = len(dataset) // 5 # 20% of the dataset
train_set, eval_set = torch.utils.data.random_split(dataset, [len(dataset) - eval_len, eval_len])

loader = DataLoader(eval_set, batch_size=128, shuffle=True)

In [7]:
model = Model(30, 48, 2)
train(model, train_set, batch_size=128, n_epochs=400)

100%|██████████| 400/400 [00:13<00:00, 29.12epoch/s, tls=0.1277]


Model(
  (model): Sequential(
    (0): FeatureSelectorSG()
    (1): Linear(in_features=30, out_features=48, bias=True)
    (2): ReLU()
    (3): Linear(in_features=48, out_features=24, bias=True)
    (4): ReLU()
    (5): Linear(in_features=24, out_features=2, bias=True)
  )
)

In [8]:
feature_selector = model.model[0]

In [9]:
model.eval()
x, y = next(iter(loader))
selected = feature_selector(x.cuda()).cpu()

In [10]:
values, idx = torch.sort(feature_selector.mu)
features_names = datasets.load_breast_cancer(as_frame=True).data.columns[idx.flipud().cpu()]
print(features_names)

['worst area' 'area error' 'worst concave points' 'mean concavity'
 'mean concave points' 'worst concavity' 'fractal dimension error'
 'mean area' 'radius error' 'worst texture' 'smoothness error'
 'worst compactness' 'perimeter error' 'worst perimeter'
 'compactness error' 'symmetry error' 'worst radius' 'worst symmetry'
 'mean perimeter' 'mean fractal dimension' 'mean radius'
 'worst fractal dimension' 'mean texture' 'mean symmetry'
 'mean compactness' 'worst smoothness' 'concavity error'
 'concave points error' 'texture error' 'mean smoothness']


In [11]:
selected[0] == x[0]

tensor([False, False, False, False, False, False,  True,  True, False, False,
        False, False, False,  True, False, False, False, False, False, False,
        False, False, False,  True, False, False, False,  True, False, False])

In [12]:
feature_selector.mu

Parameter containing:
tensor([-0.5233, -0.5821, -0.4947,  0.3475, -0.8857, -0.6511,  1.6401,  1.1204,
        -0.6439, -0.5060,  0.0411, -0.8130, -0.2006,  1.9670, -0.1568, -0.2627,
        -0.6603, -0.7474, -0.2658,  0.7959, -0.3885, -0.0355, -0.2391,  2.2212,
        -0.6538, -0.1581,  0.9552,  1.6623, -0.3969, -0.5761], device='cuda:0',
       requires_grad=True)