# Classifying Tumor vs. Normal from Gene Expression

Test whether a deep neural network can be trained as a tumor/normal binary classifier using only the Toil TCGA, TARGET and GTEx expression datasets:

https://xenabrowser.net/datapages/?host=https://toil.xenahubs.net

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

# Fix random seeds for reproducibility. NumPy's seed alone does not cover
# TensorFlow's own random ops, so the graph-level TensorFlow seed is set too.
# NOTE(review): GPU kernels and multi-threaded execution may still introduce
# run-to-run differences.
np.random.seed(42)
tf.set_random_seed(42)

In [3]:
"""
Detect if there are GPUs on this box and if so use them politely. If not use the CPU.

When CUDA_HOME is set, `nvidia-smi -q` is queried for per-device utilization and
the first device reporting 0 GPU and 0 memory utilization is selected through
CUDA_VISIBLE_DEVICES. The Keras/TensorFlow session is created only after that
selection, so the restriction is already in the environment when the session
is set up.

Raises:
    RuntimeError: if CUDA_HOME is set but no GPU reports zero utilization.
    subprocess.CalledProcessError: if nvidia-smi exits with a non-zero status.
"""
import re
import subprocess
import keras.backend.tensorflow_backend


if "CUDA_HOME" in os.environ:
    # Pick a GPU that isn't being used and then only use the memory we need (allow_growth)
    print("Using GPU")

    # universal_newlines=True makes check_output return text instead of bytes
    # on Python 3, so the str regex below can be applied to it.
    utilization = re.findall(r"Utilization.*?Gpu.*?(\d+).*?Memory.*?(\d+)",
                             subprocess.check_output(["nvidia-smi", "-q"],
                                                     universal_newlines=True),
                             flags=re.MULTILINE | re.DOTALL)
    print("GPU Utilization", utilization)

    if ('0', '0') not in utilization:
        print("All GPUs in Use")
        # Stop here instead of silently carrying on and landing on a busy GPU.
        raise RuntimeError("All GPUs in use")

    # Position in the nvidia-smi listing of the first completely idle device.
    gpu_index = utilization.index(('0', '0'))
    print("Using GPU Device:", gpu_index)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # To ensure the index matches

    def get_session():
        """Return a tf.Session with soft placement, device-placement logging and allow_growth GPU memory."""
        return tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True,
                                                gpu_options=tf.GPUOptions(allow_growth=True)))

    # Created last so the device selection above is already in the environment.
    keras.backend.tensorflow_backend.set_session(get_session())
else:
    print("Using CPU")

Using CPU


In [4]:
from keras.utils.io_utils import HDF5Matrix
import h5py

input_file = "data/tumor_normal.h5"

# Show which datasets the HDF5 file offers before reading any of them.
with h5py.File(input_file, "r") as f:
    print("Datasets:", list(f.keys()))

# DEBUG defaults to "True", so a small slice is used unless the environment
# variable is set to anything else.
use_subset = os.getenv("DEBUG", "True") == "True"

# Row limits for the train/test matrices; end=None is HDF5Matrix's default
# and reads each dataset through to its last row.
train_end, test_end = (1000, 200) if use_subset else (None, None)

X_train = HDF5Matrix(input_file, "X_train", start=0, end=train_end)
X_test = HDF5Matrix(input_file, "X_test", start=0, end=test_end)
y_train = HDF5Matrix(input_file, "y_train", start=0, end=train_end)
y_test = HDF5Matrix(input_file, "y_test", start=0, end=test_end)

print("Training on partial dataset" if use_subset else "Training on full dataset")

# Training schedule: one quick epoch when debugging, the full run otherwise.
epochs, batch_size = (1, 256) if use_subset else (8, 128)

print("X_train.shape:", X_train.shape, "epochs:", epochs, "batch_size:", batch_size)

Datasets: ['X_test', 'X_train', 'class_labels', 'classes_test', 'classes_train', 'features', 'labels', 'y_test', 'y_train']
Training on partial dataset
X_train.shape: (1000, 58581) epochs: 1 batch_size: 256


In [5]:
"""
Batch normalization with a sparse layer.

Achieves on Test:
3744/3826 [============================>.] - ETA: 0s(['loss', 'acc'], [0.52888519396363787, 0.89388395178336721])
"""
from keras.models import Model, Sequential
from keras.layers import InputLayer, Dense, BatchNormalization, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers

def _dense_bn_relu(units, **dense_kwargs):
    """Return the layers of one hidden stage: Dense -> BatchNormalization -> ReLU."""
    return [Dense(units, **dense_kwargs), BatchNormalization(), Activation('relu')]


# Layer stack: normalized input, two hidden stages (the second with an L1
# activity penalty), then a single sigmoid unit for the binary output.
classify = (
    [InputLayer(input_shape=(X_train.shape[1],)), BatchNormalization()]
    + _dense_bn_relu(1000)
    + _dense_bn_relu(500, activity_regularizer=regularizers.l1(1e-5))
    + [Dense(1), Activation('sigmoid')]
)

model = Sequential(classify)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop once training accuracy has failed to improve by at least 0.05 for 2 epochs.
early_stopping = EarlyStopping(
    monitor='acc',
    min_delta=0.05,
    patience=2,
    verbose=2,
    mode="max",
)
callbacks = [early_stopping]

# shuffle="batch" shuffles in batch-sized chunks, the mode Keras provides for HDF5-backed data.
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    shuffle="batch",
    callbacks=callbacks,
)

# Report loss and accuracy on the held-out test split.
test_metrics = model.evaluate(X_test, y_test)
print(model.metrics_names, test_metrics)

Epoch 1/1
['loss', 'acc'] [0.40502985000610353, 0.87]


In [6]:
"""
Save model and weights so we can copy them back from the GPU machine
to visualize and evaluate locally.

Writes the architecture as JSON to models/model.json and the weights to
models/weights.h5, creating the models directory if it does not exist.
"""
# Serialize the architecture once and reuse the string when writing.
model_json = model.to_json()
if not os.path.exists("models"):
    os.makedirs("models")
with open("models/model.json", "w") as f:
    f.write(model_json)

model.save_weights("models/weights.h5")
# Report the paths actually written (the weights file is HDF5, not JSON).
print("Saved to models/model.json and models/weights.h5")

Saved to model.json and weights.json
