## DAE:

The Denoising Autoencoder, is basically reconstruction of corrupted data.
    
    
Autoencoders are Neural Networks which are commonly used for feature selection and extraction. However, when there are more nodes in the hidden layer than there are inputs, the Network is risking to learn the so-called “Identity Function”, also called “Null Function”, meaning that the output equals the input, marking the Autoencoder useless.

Denoising Autoencoders solve this problem by corrupting the data on purpose by randomly turning some of the input values to zero. In general, the percentage of input nodes which are being set to zero is about 50%. Other sources suggest a lower count, such as 30%. It depends on the amount of data and input nodes you have.


When calculating the Loss function, it is important to compare the output values with the original input, not with the corrupted input. That way, the risk of learning the identity function instead of extracting features is eliminated.

VVIMP
https://reyhaneaskari.github.io/AE.htm

In [1]:
import pandas as pd
import numpy as np

import os

import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import transforms

In [2]:
train_file = '../data/train.csv'

df_train = pd.read_csv(train_file)

In [3]:
# df_train[df_train.columns[2:]].head()
# list(df_train.iloc[5][2:])
df_train['target'].iloc[5]

0

In [4]:
data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0],
                         std=[1])
])

In [5]:
class Dataset(Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, df, transform=None):
        'Initialization'
        self.df = df
        self.transform = transform

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.df)

    def __getitem__(self, index):
        'Generates one sample of data'

        # Load data and get label
        X = list(self.df.iloc[index][2:].values)
        y = self.df['target'].iloc[index]
        
        
        sample = {'data': X, 'label': y}

        
        if self.transform:
            sample['data'] = self.transform(torch.stack(X))

        return sample

In [3]:
num_epochs = 20
batch_size = 256
learning_rate = 1e-3

In [4]:
x = Variable(torch.from_numpy(df_train[df_train.columns[2:]].values))
y = Variable(torch.from_numpy(df_train['target'].values))

In [5]:
# dataset = Dataset(df_train)
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [15]:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
#                                         nn.Linear(200, 128),
#                                         nn.ReLU(True),
                                        nn.Linear(200, 64),
                                        nn.ReLU(True),
                                        nn.Linear(64, 16),
                                        nn.ReLU(True)
        )
        self.decoder = nn.Sequential(
                                        nn.Linear(16, 64),
                                        nn.ReLU(True),
                                        nn.Linear(64, 200),
#                                         nn.ReLU(True),
#                                         nn.Linear(128, 200),
                                        nn.Sigmoid())

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [16]:
def add_noise(data):
    noise = torch.randn(data.size()[0], 200) * 0.2
    noisy_data = data + noise
    return noisy_data

In [17]:
model = autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

for epoch in range(num_epochs):
    for data, _ in dataloader:
        data = data.float()
        noisy_data = add_noise(data)
        noisy_data = Variable(noisy_data)
        data = Variable(data)
        
        
        # ===================forward=====================
        output = model(noisy_data)
        loss = criterion(output, data)
        
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # ===================log========================
    print('epoch [{}/{}], MSE_loss:{:.4f}'
          .format(epoch + 1, num_epochs, loss.item()))
    
# torch.save(model.state_dict(), './sim_autoencoder.pth')

epoch [1/20], MSE_loss:124.2227
epoch [2/20], MSE_loss:124.6318
epoch [3/20], MSE_loss:124.3436
epoch [4/20], MSE_loss:125.4180
epoch [5/20], MSE_loss:126.5876
epoch [6/20], MSE_loss:126.0823
epoch [7/20], MSE_loss:126.6260
epoch [8/20], MSE_loss:123.6308
epoch [9/20], MSE_loss:126.4224
epoch [10/20], MSE_loss:123.7523
epoch [11/20], MSE_loss:121.6230
epoch [12/20], MSE_loss:124.0993
epoch [13/20], MSE_loss:124.2260
epoch [14/20], MSE_loss:125.7262
epoch [15/20], MSE_loss:127.0370
epoch [16/20], MSE_loss:125.3770
epoch [17/20], MSE_loss:125.5110
epoch [18/20], MSE_loss:124.0258
epoch [19/20], MSE_loss:122.7660
epoch [20/20], MSE_loss:124.7041


In [None]:
## hmmm! its not learning anything!

In [14]:
output[0]

tensor([1.0000e+00, 8.3929e-19, 1.0000e+00, 1.0000e+00, 1.0000e+00, 6.0714e-20,
        1.0000e+00, 1.0000e+00, 2.8300e-05, 1.0000e+00, 1.4002e-02, 3.3214e-21,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.4991e-19,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 6.2313e-21, 4.4062e-17, 1.0000e+00, 1.0000e+00,
        9.6204e-21, 1.0000e+00, 1.2030e-16, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 4.5482e-21, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 5.7222e-21, 1.0000e+00, 3.2445e-22,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.9982e-17, 1.0000e+00,
        1.1110e-09, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 2.8443e-22, 9.9709e-01, 9.9845e-01, 1.0000e+00, 1.0000e+00,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.4602e-20, 1.0000e+00, 6.9374e-01,
        1.0000e+00, 1.0000e+00, 1.0000e+