In [125]:
# Imports

import tensorflow as tf
import numpy as np
import pandas as pd
from itertools import *
import sklearn
import math
import random
import sys
import multiprocessing
# Worker count: every CPU but one, and never fewer than one.
nproc = max(1, multiprocessing.cpu_count() - 1)

# Put the relative 'utils' directory on the import path. The membership check
# keeps the entry from being appended again when this cell is re-run.
if 'utils' not in sys.path:
    sys.path.append('utils')

# Project-local module (presumably utils/data_loader.py -- confirm location).
import data_loader

# Warnings

import warnings
# Suppress all warnings from here on.
warnings.filterwarnings('ignore')

# Idempotent, cached data retrieval script

print(data_loader.load_chromosome.__doc__)
train_df, test_df, train_ix, test_ix, train_tissues, tfs = \
    data_loader.load_chromosome_cached('1')

# Create the interactive session on first use and reuse it when the cell is
# re-run. A bare `if not sess:` raises NameError on a fresh kernel because the
# name has never been bound, so look it up in the notebook namespace instead.
if not globals().get('sess'):
    sess = tf.InteractiveSession()

Idempotent data loading. For a given chromosome n (a string).
    
    Returns (train_df, test_df, train_ix, test_ix, train_tissues, tfs)
    
    The first two are the train and test dataframes, and test_ix are the
    values in test_df['assayed'] that are missing and need to be imputed (with the
    correct answer being in test_df['filled'] in the corresponding locations.
    train_ix are the assayed (known) methylation values from limited microarray
    sampling (e.g., test_df['assayed'].iloc[train_ix] can be used for prediction of
    test_df['filled'].iloc[test_ix], and the former should be about equal to
    test_df['filled'].iloc[train_ix] (two different ways of sampling methylation).
    
    Imports genetic context and adds those columns to the parameter df, returning
    a merged one. tfs is the list of names of new transcription
    factors.
    
    train_tissues is a list of the names of columns with chromosome methylation values.
    
    Note that loading from scratch ma

In [95]:
# Perhaps there are obvious sequence trends?
def local_impute(data):
    """Fill the NaN entries of ``data`` by linear interpolation, in place.

    Each missing position is interpolated (via ``np.interp``) from the
    positions of the non-NaN entries; missing values before the first or
    after the last known entry take that nearest known value.

    Approach taken from:
    http://stackoverflow.com/questions/9537543/replace-nans-in-numpy-array-with-closest-non-nan-value

    Args:
        data: 1-D float array-like supporting boolean-mask indexing and
            assignment. It is modified in place.

    Returns:
        The same ``data`` object, with its NaNs replaced.
    """
    missing = np.isnan(data)
    present = ~missing
    gap_positions = np.flatnonzero(missing)
    known_positions = np.flatnonzero(present)
    data[missing] = np.interp(gap_positions, known_positions, data[present])
    return data

# Do mean imputation on our training data.
def mean_impute(data):
    """Replace the NaN entries of ``data`` with the mean of its non-NaN entries.

    Args:
        data: 1-D float array-like (NumPy array or pandas Series) supporting
            boolean-mask assignment. It is modified in place.

    Returns:
        The same ``data`` object, with its NaNs replaced.
    """
    mask = np.isnan(data)
    # np.nanmean ignores NaNs for ndarrays and Series alike. A plain
    # ndarray.mean() would itself be NaN here and silently impute nothing.
    # The float() cast keeps the fill value a builtin float (assigning the
    # NumPy scalar directly "messes with serialization").
    data[mask] = float(np.nanmean(data))
    return data
# Build two independently imputed copies of the training frame. Plain
# assignment (`train_df_imp = train_df`) only aliases the original, so the
# mean-imputed columns would be written into train_df itself and the
# interpolation pass would then see no NaNs left to fill.
train_df_imp = train_df.copy()
train_df_int = train_df.copy()
for i in train_tissues:
    # Both imputers mutate their argument, so each gets a fresh column copy.
    train_df_imp[i] = mean_impute(train_df[i].copy())
    train_df_int[i] = local_impute(train_df[i].copy())
print('nans in mean-imputed', np.isnan(train_df_imp[train_tissues]).sum().sum())
print('nans in interpolated', np.isnan(train_df_int[train_tissues]).sum().sum())

nans in mean-imputed 0
nans in interpolated 0


In [96]:
## GMM Stuff
# Seed NumPy's global RNG so the draw below is repeatable.
np.random.seed(0)
# Draw p distinct entries of train_ix (sampling without replacement).
# NOTE(review): `p` is not assigned in this cell; the only visible assignment
# is in a later cell, so this fails on a fresh top-to-bottom run -- confirm.
rc = np.random.choice(train_ix, size=p, replace=False)

In [115]:
# Copied pretty much directly from sklearn

# Simultaneous K-cluster log-likelihood computation.
# X is Nxp, mus is Kxp, sigmas is Kxp (diagonal covariance entries).
# Output is KxN log-likelihoods for each sample in each cluster.
def tf_log_normals(X, mus, sigmas):
    """Log-density of every row of X under K diagonal-covariance Gaussians.

    With p(x) = sqrt(a * b * c), where
        a = (2 pi)^(-p)
        b = det(sigma)^(-1)
        c = exp(-(x - mu)^T sigma^(-1) (x - mu)),
    the result is 0.5 * (log a + log b + log c). Because sigma is diagonal,
    the determinant becomes a sum of logs and the quadratic form is expanded
    into three terms evaluated with matrix products.

    Args:
        X: Nxp tensor of samples.
        mus: Kxp tensor of cluster means.
        sigmas: Kxp tensor holding the diagonal of each cluster's covariance.

    Returns:
        KxN tensor of log-densities.

    NOTE(review): the normalising constant is built as float64, so the inputs
    presumably need to be float64 as well -- confirm.
    """
    # Feature count p, read at graph run time as the size of mus' last axis.
    last_axis = tf.pack([tf.rank(mus) - 1])
    n_features = tf.squeeze(tf.slice(tf.shape(mus), last_axis, [1]))
    precisions = tf.inv(sigmas)  # Kxp elementwise reciprocals
    samples_t = tf.transpose(X)  # pxN

    two_pi = tf.constant(2 * np.pi, dtype='float64')
    log_a = -tf.cast(n_features, 'float64') * tf.log(two_pi)  # scalar
    log_b = tf.reduce_sum(tf.log(precisions), 1, keep_dims=True)  # Kx1

    # Expansion of -(x - mu)^T sigma^-1 (x - mu) = -mu^2/s + 2 mu x/s - x^2/s.
    mean_sq_term = tf.reduce_sum(precisions * tf.square(mus), 1, keep_dims=True)  # Kx1
    cross_term = tf.matmul(precisions * mus, samples_t)  # KxN
    sample_sq_term = tf.matmul(precisions, tf.square(samples_t))  # KxN
    log_c = -mean_sq_term + 2 * cross_term - sample_sq_term  # KxN

    return 0.5 * (log_a + log_b + log_c)

In [128]:
# Numerical check of tf_log_normals against a direct NumPy evaluation.
N = 100
D = 1000

X = np.random.normal(size=(N, D))
mu = X.mean(axis=0)
sigma = X.std(axis=0)
# Three test components: the reference, doubled variances, doubled means.
mus = np.array([mu, mu, mu * 2])
sigmas = np.array([sigma, sigma * 2, sigma])

ll = tf_log_normals(*(tf.constant(x) for x in (X, mus, sigmas))).eval()
# Separate loop names so the reference `mu` / `sigma` above are not overwritten.
for i, (mu_k, sigma_k) in enumerate(zip(mus, sigmas)):
    # Sum of logs rather than log of a D-term product, which can overflow or
    # underflow to inf / 0 for large D.
    expected = D * np.log(2 * np.pi) + np.sum(np.log(sigma_k))
    expected = expected + np.sum((X - mu_k) ** 2 / sigma_k, axis=1)
    expected = -0.5 * expected
    # RMSE computed with NumPy: only `import sklearn` is executed above, which
    # does not guarantee that the `sklearn.metrics` submodule is loaded.
    rmse = math.sqrt(np.mean((expected - ll[i]) ** 2))
    # Label each row with its own component index (was a constant 'K=1').
    print('K={} rmse'.format(i), rmse)

K=1 rmse 1.639614891002561e-13
K=1 rmse 1.016845989170083e-13
K=1 rmse 1.8189894035458566e-13


In [None]:
# Scratch work below

In [89]:
def tf_l1normalize_rows(x):
    """Divide every row of ``x`` by that row's sum.

    The divisor is the plain (signed) sum along axis 1, kept as a column so
    it broadcasts across the row; it equals the L1 norm only when the
    entries are non-negative.
    """
    return x / tf.reduce_sum(x, 1, keep_dims=True)

def tf_normal(X, mu, sigma, p):
    """Density of the rows of X under one diagonal-covariance Gaussian.

    Computes exp(-q / 2) * det^(-1/2) * (2 pi)^(-p / 2), where q is the sum of
    squared deviations weighted by 1 / sigma and det is the product of sigma.

    Args:
        X: samples, one per row (presumably Nxp -- confirm against callers).
        mu: mean vector, broadcast against the rows of X.
        sigma: vector of diagonal covariance entries.
        p: number of features, as a Python number.

    Returns:
        One density per row of X, as a column (the result of a matmul with a
        single-column matrix).
    """
    sq_dev = tf.square(X - mu)
    det = tf.reduce_prod(sigma)
    # Turn sigma into a column so the matmul sums the weighted deviations.
    inv_sigma_col = tf.inv(tf.expand_dims(sigma, 1))
    quad_form = tf.matmul(sq_dev, inv_sigma_col)
    norm_const = (2 * np.pi) ** (-p / 2)
    return tf.exp(-quad_form / 2) * tf.rsqrt(det) * norm_const
# if it provides any comfort, tf_normal ~= tf.exp(tf_log_normal)

def tf_matrix_row(X, n, p): # X is a matrix of rows length p, gets n-th row
    """Return row ``n`` of ``X`` with size-1 dimensions squeezed out."""
    row_start = [n, 0]
    row_extent = [1, p]
    return tf.squeeze(tf.slice(X, row_start, row_extent))

# X is Nxp, mus is Kxp, sigmas Kxp
# output is NxK responsibilities
def responsibilities(X, mus, sigmas, alphas, K, p):
    """Cluster responsibilities of each sample under a K-component mixture.

    Evaluates one density column per cluster, concatenates them along axis 1,
    scales by the mixing weights ``alphas`` and normalises each row by its sum.

    NOTE(review): relies on ``tf_normal_scaled``, which is not defined in the
    visible part of the notebook (only ``tf_normal`` is) -- confirm it exists.
    """
    per_cluster = [
        tf_normal_scaled(X, tf_matrix_row(mus, i, p), tf_matrix_row(sigmas, i, p), p)
        for i in range(K)
    ]
    weighted = tf.concat(1, per_cluster) * alphas  # NxK unnormalized responsibility
    return tf_l1normalize_rows(weighted)

# all implementation basically copied from scipy gmm
# Stable log-sum-exp of likelihoods along rows.
# Reduces NxK tensor L to Nx1 tensor
def tf_log_sum_exp(L):
    """Row-wise log(sum(exp(L))), computed with the max-shift trick.

    The row maximum is subtracted before exponentiating and added back after
    the log, so the largest exponent evaluated is exp(0).
    """
    row_max = tf.reduce_max(L, 1, keep_dims=True)
    shifted_sum = tf.reduce_sum(tf.exp(L - row_max), 1)
    # Restore the column shape dropped by the reduction before re-adding the max.
    return tf.expand_dims(tf.log(shifted_sum), 1) + row_max

# X is Nxp, mus is Kxp, sigmas Kxp
# output is log probability, NxK responsibilities
def estep(X, mus, sigmas, alphas, K, p):
    """E-step: mean sample log-probability and per-sample responsibilities.

    Builds one log-density column per cluster, adds the log mixing weights,
    and normalises in log space with ``tf_log_sum_exp``.

    Returns:
        (mean of the per-sample log-probabilities, NxK responsibilities).

    NOTE(review): relies on ``tf_log_normal`` (singular), which is not defined
    in the visible part of the notebook -- confirm it exists.
    """
    per_cluster = [
        tf_log_normal(X, tf_matrix_row(mus, i, p), tf_matrix_row(sigmas, i, p), p)
        for i in range(K)
    ]
    joint_log = tf.concat(1, per_cluster) + tf.log(alphas)  # NxK likelihoods
    sample_log_prob = tf_log_sum_exp(joint_log)
    mean_log_prob = tf.reduce_mean(sample_log_prob)
    resp = tf.exp(joint_log - sample_log_prob)
    return mean_log_prob, resp

# Machine epsilon, used to keep divisions and mixing weights away from zero.
EPS = np.finfo(float).eps
# Constant added to every variance so that no component can collapse.
MIN_COVAR = 1e-3
def mstep(X, mus, sigmas, alphas, resp):
    """M-step for a diagonal-covariance Gaussian mixture.

    Args:
        X: Nxp samples.
        mus, sigmas, alphas: current parameters. They are accepted for
            interface symmetry with the E-step but are not needed: every
            update below is a function of ``X`` and ``resp`` only.
        resp: NxK responsibilities.

    Returns:
        (new_alphas, new_mus, new_sigma): updated mixing weights (K), means
        (Kxp) and diagonal variances (Kxp).
    """
    weights = tf.reduce_sum(resp, 0)
    invweights = tf.expand_dims(tf.inv(weights + 10 * EPS), 1) # Kx1
    new_alphas = EPS + weights / (tf.reduce_sum(weights) + 10 * EPS)
    respT = tf.transpose(resp)
    weighted_cluster_sum = tf.matmul(respT, X) # Kxp
    new_mus = weighted_cluster_sum * invweights
    # Variance as E[x^2] - 2 mu E[x] + mu^2, centred on the *updated* means.
    # The sklearn routine this follows refreshes the means before computing
    # the covariances; using the stale `mus` argument here would instead
    # measure the spread around the previous iteration's centres.
    avg_X2 = tf.matmul(respT, tf.square(X)) * invweights
    avg_mu2 = tf.square(new_mus)
    avg_X_mu = new_mus * weighted_cluster_sum * invweights
    new_sigma = avg_X2 - 2 * avg_X_mu + avg_mu2 + MIN_COVAR
    # (x - mu) (x-mu)^T for banded.
    return new_alphas, new_mus, new_sigma

In [90]:
# Scratch check: exponentiate the log-densities of four (mean, variance) pairs
# and evaluate them in the default session.
# NOTE(review): a, b, a1, a2 and c are not assigned anywhere in the visible
# notebook, so this cell relies on leftover kernel state -- confirm or remove.
tf.exp(tf_log_normals(a, tf.pack([b, a1, b / 2, a2]), tf.pack([c, c, c / 2, c]))).eval()

array([[  4.19773441e-03,   4.19773255e-03],
       [  1.88129321e-02,   4.66325728e-05],
       [  3.92013006e-02,   3.25969829e-08],
       [  4.66326164e-05,   1.88129321e-02]], dtype=float32)

In [None]:
# Problem size and inputs for the custom GMM experiment.
N = 10
p = 20 # D is better?
K = 3
k = 0 # maximum band offset (0 is just diagonal)
tissues_to_cluster = train_tissues
# Select the columns through `tissues_to_cluster` (previously assigned but
# ignored) so that changing the selection above actually takes effect.
# Transposed so that each tissue column becomes a row.
X_np = train_df_imp[tissues_to_cluster].values.transpose()
# First N rows, restricted to the randomly sampled positions `rc`.
# NOTE(review): `rc` is drawn in an earlier cell using `p`, which is only
# assigned here -- confirm the intended cell order.
X_trunc = X_np[:N, rc].astype('float32')

In [35]:
## Implement custom GMM
#todo check float32/float64 conversion for speedup
#todo scale (standardize) data before running
#todo kmeans warmup (just in numpy?)
#todo mincovar

# tf.Variable's second positional parameter is `trainable`, not `name`, so the
# labels are passed by keyword; positionally they were taken as a truthy
# `trainable` flag and the variables were left with default names.
X = tf.Variable(X_trunc, name='X') #tf.placeholder('float', shape=[N, p])

# Uniform initial mixing weights.
alpha0 = tf.ones([K], dtype='float') / K
alphas = tf.Variable(alpha0, name='alphas')

# Initial means: K distinct rows of the data, chosen at random.
mu0 = X_trunc[np.random.choice(N, K, replace=False)]
mus = tf.Variable(mu0, name='mus')

# initialize the off-band with covariances
# currently just diagonal
# once random init
def tf_repeat(x, n):
    """Stack ``n`` references to ``x`` along a new leading axis."""
    copies = [x] * n
    return tf.pack(copies) # tf.tile instead. todo
# Initial variances: the per-feature variance of the data (second element of
# tf.nn.moments), repeated once per cluster.
sigma0 = tf.nn.moments(X_trunc, [0])[1]
# `name=` by keyword: the second positional parameter of tf.Variable is
# `trainable`, so a positional label never became the variable's name.
sigmas = tf.Variable(tf_repeat(sigma0, K), name='sigmas')

# E-step
pis = responsibilities(X, mus, sigmas, alphas, K, p)

# M-step
#http://cslu.ohsu.edu/~bedricks/courses/cs655/pdf/addl_slides/pr813_lecture06.pdf
pisT = tf.transpose(pis)
membership = tf.reduce_sum(pisT, 1)
mu1 = tf.matmul(pisT, X) / tf.expand_dims(membership, 1)
alpha1 = membership / tf.reduce_sum(membership)

tf.set_random_seed(0)

sess.run(tf.initialize_all_variables())
print('alphas0', sess.run(alphas))
print('mus0', sess.run(mus))
print('sigmas0', sess.run(sigmas))
print('pis', sess.run(pis))
print('alphas1', sess.run(alpha1))
print('mus1', sess.run(mu1))
# Cross-check the hand-written updates above against mstep(); the alpha and
# mu differences should be close to zero.
a, m, s = sess.run(mstep(X, mus, sigmas, alphas, pis))
print('diff alpha', sess.run(tf.reduce_sum(tf.abs(a - alpha1))))
print('diff mu', sess.run(tf.reduce_sum(tf.abs(m - mu1))))
print('sigma', s.shape, '\n', s)



alphas0 [ 0.33333334  0.33333334  0.33333334]
mus0 [[ 0.84375     0.88524592  0.43076923  0.67605633  0.22222222  0.92307693
   0.41860464  0.11111111  0.31428573  0.87037039  0.69565219  0.84507042
   0.40816328  0.86842108  0.29090908  0.25        0.86956519  0.80392158
   0.91780823  0.04938272]
 [ 0.83333331  0.875       0.42105263  0.73333335  0.17391305  0.95999998
   0.27419356  0.06382979  0.09677419  0.47499999  0.69767439  0.69230771
   0.53333336  0.77142859  0.35135135  0.16216215  0.83783782  0.8918919
   0.8888889   0.1147541 ]
 [ 0.72727275  0.953125    0.64179105  0.27710843  0.33870968  0.78181821
   0.5         0.10526316  0.69230771  0.87755102  0.3888889   0.87323946
   0.22033899  0.91666669  0.33333334  0.05660377  0.89393938  0.70175439
   0.89795917  0.9054054 ]]
sigmas0 [[ 0.02759033  0.00194256  0.01775867  0.03035834  0.01180523  0.01866075
   0.03527383  0.0640904   0.06314699  0.02359549  0.03104059  0.00453064
   0.02002618  0.03906937  0.01024649  0.00331