<h1><center>Sam Armstrong CS548 Project</center></h1>
<h1><center>Variational Autoencoder with Classifier</center></h1>

## Import Libraries

In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader

## Load RNA-Seq Data

In [2]:
seq_data = pd.read_csv("tumor_data/data.csv")
seq_data.rename(columns={'Unnamed: 0':'sample'}, inplace=True)

## Load Tumor Type/Class Data

In [3]:
labels = pd.read_csv("tumor_data/labels.csv")
labels.rename(columns={'Unnamed: 0':'sample'}, inplace=True)

## Join RNA-Seq and Tumor Type/Class Data

In [4]:
tumor_data = seq_data.set_index('sample').join(labels.set_index('sample')).to_numpy()

## Custom Pytorch Dataset for Cancer Data

In [5]:
class TumorDataset(Dataset):
    
    def __init__(self, data, transform=None):
        'Initialization'
        self.labels = data[:, -1]
        self.lookup_table, self.labels = np.unique(data[:, -1], return_inverse=True)
        self.genes = data[:, :-1]
        self.transform = transform

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = torch.from_numpy(self.genes[index, : ].astype(np.float)).float()
        y = torch.from_numpy(np.unique(self.labels[index]).astype(int))

        return X, y

## Encoder Class

In [6]:
class Encoder(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(Encoder, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        return F.relu(self.linear2(x))

## Decoder Class

In [7]:
class Decoder(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(Decoder, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        return F.relu(self.linear2(x))

## Classifier Class

In [8]:
class Classifier(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(Classifier, self).__init__()
        self.hidden = []
        self.hidden.append(torch.nn.Linear(D_in, H[0]))
        
        for i in range(1,len(H)):
            self.hidden.append(torch.nn.Linear(H[i-1], H[i]))
            
        self.output = torch.nn.Linear(H[-1], D_out)
       
  
    def forward(self, x):
        for h in self.hidden:
            x = h(x)
       
        return self.output(x)

## Variational Autoencoder Class with Classifier

In [9]:
class VAE(torch.nn.Module):

    def __init__(self, encoder, decoder, classifier, latent_dim):
        super(VAE, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.classifier = classifier
        self._enc_mu = torch.nn.Linear(100, 8)
        self._enc_log_sigma = torch.nn.Linear(100, 8)

    def _sample_latent(self, h_enc):
        """
        Return the latent normal sample z ~ N(mu, sigma^2)
        """
        mu = self._enc_mu(h_enc)
        log_sigma = self._enc_log_sigma(h_enc)
        sigma = torch.exp(log_sigma)
        std_z = torch.from_numpy(np.random.normal(0, 1, size=sigma.size())).float()

        self.z_mean = mu
        self.z_sigma = sigma

        return mu + sigma * Variable(std_z, requires_grad=False)  
    
    
    def forwardEncoder(self, state):
        h_enc = self.encoder(state)
        return self._sample_latent(h_enc)
    
    def forwardAutoEncoder(self, state):
        z = self.forwardEncoder(state)
        return self.decoder(z)
    
    def forwardClassifier(self, state):
        z = self.forwardEncoder(state)
        return self.classifier(z)
    
    def saveModel(self, fileName="vaeModel"):
        torch.save(self, fileName)
        
    def loadModel(self, fileName="vaeModel"):
        self = torch.load(fileName)

## Custom Loss Method (Kullback–Leibler Loss)

In [10]:
def latent_loss(z_mean, z_stddev):
    mean_sq = z_mean * z_mean
    stddev_sq = z_stddev * z_stddev
    return 0.5 * torch.mean(mean_sq + stddev_sq - torch.log(stddev_sq) - 1)

## Train AutoEncoder (Encoder + Decoder)

In [13]:
input_dim = 20531
latent_dim = 8
classes = 5
batch_size = 32
epochs = 10

encoder = Encoder(input_dim, 100, 100)
decoder = Decoder(latent_dim, 100, input_dim)
classifier = Classifier(latent_dim, [20, 20, 20], classes)

vae = VAE(encoder, decoder, classifier, latent_dim)

train_loader = torch.utils.data.DataLoader( TumorDataset(tumor_data), batch_size=batch_size, shuffle=True)

print('Number of samples: ', len(tumor_data))

criterion = torch.nn.MSELoss()

optimizer = optim.Adam(vae.parameters(), lr=0.0001)
l = None
for epoch in range(epochs):
    for i, data in enumerate(train_loader, 0):
        inputs, classes = data
        inputs, classes = Variable(inputs.resize_(batch_size, input_dim)), Variable(classes)
        optimizer.zero_grad()
        dec = vae.forwardAutoEncoder(inputs.float())
        ll = latent_loss(vae.z_mean, vae.z_sigma)
        loss = criterion(dec, inputs) + ll
        loss.backward()
        optimizer.step()
        l = loss.item()
    print(epoch, l)
# vae.saveModel()

Number of samples:  801
0 1.7737231254577637


KeyboardInterrupt: 

## Train Classifier (Encoder + Classifier)

In [11]:
input_dim = 20531
latent_dim = 8
n_classes = 5
batch_size = 32
epochs = 100

encoder = Encoder(input_dim, 100, 100)
decoder = Decoder(latent_dim, 100, input_dim)
classifier = Classifier(latent_dim, [20, 20, 20], n_classes)

vae = VAE(encoder, decoder, classifier, latent_dim)

train_loader = torch.utils.data.DataLoader( TumorDataset(tumor_data), batch_size=batch_size, shuffle=True, drop_last=True)

print('Number of samples: ', len(tumor_data))

criterion = torch.nn.CrossEntropyLoss()

optimizer = optim.Adam(vae.parameters(), lr=0.0001)
l = None
for epoch in range(epochs):
    for i, data in enumerate(train_loader, 0):
        inputs, classes = data
        inputs, classes = Variable(inputs.resize_(batch_size, input_dim)), Variable(classes).flatten().long()
        predictions = vae.forwardClassifier(inputs.float())
        optimizer.zero_grad()
        ll = latent_loss(vae.z_mean, vae.z_sigma)
        loss = criterion(predictions, classes) + ll
        loss.backward()
        optimizer.step()
        l = loss.item()
    print(epoch, l)
# vae.saveModel()

Number of samples:  801
0 2.3076910972595215
1 1.6488484144210815
2 1.6345595121383667
3 1.5680423974990845


KeyboardInterrupt: 

## Get Classifier Percentage Correct on Train Data

In [12]:
correct = 0
total = 0
for i, data in enumerate(train_loader, 0):
    inputs, classes = data
    inputs, classes = Variable(inputs.resize_(batch_size, input_dim)), Variable(classes).flatten().long()
    predictions = vae.forwardClassifier(inputs.float())
    _, predicted = torch.max(predictions.data, 1)
    total += classes.size(0)
    correct += (predicted == classes).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 37 %
