In [2]:
# import PyTorch
import torch

# standard DS stack
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
# embed static images in the ipynb
%matplotlib inline 

# neural network package
import torch.nn as nn 
import torch.nn.functional as F

# computer vision
import torchvision
from torchvision import transforms
from PIL import Image

# dataset loading
from torch.utils.data import Dataset, DataLoader

# convenient package for plotting loss functions
# from livelossplot import PlotLosses

import copy
import importlib.util # to run outside module
from sklearn.model_selection import train_test_split

In [8]:
# Retrieve preprocessed titanic dataset
system_path = r"C:\Users\uniqu\Adaptation\github repos" \
              + "\Bioinformatics-Neural Networks for Genomic Risk"

exec(open(system_path+"\preprocess_titanic.py").read())

X, Y = preprocess_titanic()

# Perform train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=7)

In [29]:
# Real dataset
real_data = np.hstack([X, Y])
real_data.shape
print(f"feature names:\n  {list(titanic_data.columns[1:])}\n")
print(f"target names: '{titanic_data.columns[0]}'")
X[0], Y[0]

feature names:
  ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alone']

target names: 'survived'


(array([ 3.  ,  1.  , 22.  ,  1.  ,  0.  ,  7.25,  0.  ,  0.  ,  1.  ,
         0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,
         0.  ,  0.  ,  1.  ]),
 array([0.]))

## CTGAN

[TGAN](https://github.com/sdv-dev/TGAN) seemed like the exact tool I was looking for. It turns out [CTGAN](https://github.com/sdv-dev/CTGAN) is better. Several major differences make CTGAN outperform TGAN.
- **Preprocessing**: CTGAN uses more sophisticated Variational Gaussian Mixture Model to detect modes of continuous columns.
- **Network structure**: TGAN uses LSTM to generate synthetic data column by column. CTGAN uses Fully-connected networks which is more efficient.
- **Features to prevent** mode collapse: We design a conditional generator and resample the training data to prevent model collapse on discrete columns. We use WGANGP and PacGAN to stabilize the training of GAN.

In [35]:
# !pip install ctgan

In [34]:
from ctgan import load_demo
data = load_demo()

from ctgan import CTGANSynthesizer
ctgan = CTGANSynthesizer()

ModuleNotFoundError: No module named 'ctgan'

In [16]:
# GPU for cloud training
if torch.cuda.is_available(): 
    device = torch.device("cuda") # device = GPU
else:
    device = torch.device("cpu") # device = CPU
    
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.fc1 = nn.Linear(n_features, H_0)
        self.fc2 = nn.Linear(H_0, H_1)
        self.fc3 = nn.Linear(H_1, D_out)
        
        self.dropout = nn.Dropout(p=0.2)
        
    def forward(self, z):
        z = F.leaky_relu(self.fc1(z))
        z = F.leaky_relu(self.fc2(z)) 
        z = F.leaky_relu(self.fc3(z))
        return z

    
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(n_features, H_0)
        self.fc2 = nn.Linear(H_0, H_1)
        self.fc3 = nn.Linear(H_1, D_out)
    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
#          x = self.dropout()
        x = F.leaky_relu(self.fc2(x)) 
        x = F.leaky_relu(self.fc3(x))
        return x

class Toy_Dataset(Dataset): # inherit from torch's Dataset class.
    def __init__(self, train):
        # data loading
        if train == True:
            self.X = torch.from_numpy(X_train.astype(np.float32))
            self.Y = torch.from_numpy(Y_train.astype(np.float32))
        else:
            self.X = torch.from_numpy(X_test.astype(np.float32))
            self.Y = torch.from_numpy(Y_test.astype(np.float32))

        if self.X.shape[0] == self.Y.shape[0]:
            self.n_samples = self.X.shape[0]
        else:
            raise ValueError("Shape mismatch")
        
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
    
    def __len__(self):
        return self.n_samples
        # len(dataset)

# Initialize constants
n_features = X_train.shape[1]
BATCH_SIZE, D_in, D_out = 15, n_features, 1
H_0, H_1 = int(0.7*n_features), int(0.4*n_features)

# Set DataLoaders    
train_set = Toy_Dataset(train=True)
test_set = Toy_Dataset(train=False)
train_dl = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=True)

# Initialize networks
G = Generator()
D = Discriminator()

loss_fn = nn.BCEWithLogitsLoss()

def check_models():
    x = torch.from_numpy(X_train[0]).float() 
    print( G(x) )
    print( D(x) )
    
check_models()

tensor([-0.0076], grad_fn=<LeakyReluBackward0>)
tensor([-0.0026], grad_fn=<LeakyReluBackward0>)


In [4]:
def train():
    pass

## References

Conceptual References:
- [*GANS*, Google Developers](https://developers.google.com/machine-learning/gan/gan_structure)
- [Ian Goodfellow on Lex Fridman podcast](https://www.youtube.com/watch?v=Z6rxFNMGdn0&t=2826s&ab_channel=LexFridman)
- [Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., ... & Bengio, Y. (2014). Generative adversarial nets. In *Advances in neural information processing systems* (pp. 2672-2680).](https://arxiv.org/pdf/1406.2661.pdf)
- [Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., & Chen, X. (2016). Improved techniques for training gans. In *Advances in neural information processing systems* (pp. 2234-2242).](https://arxiv.org/pdf/1606.03498.pdf)
- [DeepInsight: A methodology to transform a non-image data to an image for convolution neural network architecture](https://www.nature.com/articles/s41598-019-47765-6)

Implementation References:
- [eriklindernoren/PyTorch-GAN](https://github.com/eriklindernoren/PyTorch-GAN/blob/master/implementations/gan/gan.py)
  - [list of successful GAN architectures](https://github.com/eriklindernoren/PyTorch-GAN#gan)
- [fast ai, free GPU w/ Google Colab guide](https://www.kdnuggets.com/2018/02/fast-ai-lesson-1-google-colab-free-gpu.html)
- [PyTorch GAN, github.com/devnag](https://github.com/devnag/pytorch-generative-adversarial-networks/blob/master/gan_pytorch.py)
- [Brownlee, Jason (2020). How to Code the GAN Training Algorithm. *machinelearningmastery.com*](https://machinelearningmastery.com/how-to-code-the-generative-adversarial-network-training-algorithm-and-loss-functions/)
- TGAN: [paper](https://arxiv.org/pdf/1811.11264.pdf), [repo](https://github.com/sdv-dev/TGAN)
- CTGAN: [repo](https://github.com/sdv-dev/CTGAN)

'1.19.1'