In [1]:
# Imports

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import tensorflow as tf
import numpy as np
import pandas as pd
from itertools import *
import sklearn
import math
import random
import sys
import multiprocessing
import scipy
from joblib import Parallel, delayed
import threading
nproc = max(1, multiprocessing.cpu_count() - 1)

if 'utils' not in sys.path:
    sys.path.append('utils')

import data_loader
from tf_gmm_em import *

# Warnings

import warnings
warnings.filterwarnings('ignore')

# Idempotent, cached data retrieval script

print(data_loader.load_chromosome.__doc__)
train_df, test_df, train_ix, test_ix, train_tissues, tfs = \
    data_loader.load_chromosome_cached('1')

Idempotent data loading. For a given chromosome n (a string).
    
    Returns (train_df, test_df, train_ix, test_ix, train_tissues, tfs)
    
    The first two are the train and test dataframes, and test_ix are the
    values in test_df['assayed'] that are missing and need to be imputed (with the
    correct answer being in test_df['filled'] in the corresponding locations.
    train_ix are the assayed (known) methylation values from limited microarray
    sampling (e.g., test_df['assayed'].iloc[train_ix] can be used for prediction of
    test_df['filled'].iloc[test_ix], and the former should be about equal to
    test_df['filled'].iloc[train_ix] (two different ways of sampling methylation).
    
    Imports genetic context and adds those columns to the parameter df, returning
    a merged one. tfs is the list of names of new transcription
    factors.
    
    train_tissues is a list of the names of columns with chromosome methylation values.
    
    Note that loading from scratch ma

In [2]:
train_df_int = train_df
for i in train_tissues:
    train_df_int[i] = data_loader.local_impute(train_df[i].copy())
# print('nans in interpolated', np.isnan(train_df_int[train_tissues]).sum().sum())

In [3]:
# Make sure N, D are small so the numerically unstable verification code
# doesn't underflow.
N = 5
D = 10

X = np.random.normal(size=(N, D))
mu = X.mean(axis=0)
sigma = X.std(axis=0)
mus = np.array([mu, mu, mu * 2])
sigmas = np.array([sigma, sigma * 2, sigma])
K = len(sigmas)
alphas = np.random.dirichlet(np.ones(K), 1)[0]

mean_ll, resp = sess.run(estep(*(tf.constant(x) for x in (X, mus, sigmas, alphas))))
def normal_likelihoods(X, mu, sigma):
    exponent = -np.dot((X - mu[np.newaxis, :]) ** 2, 1 / sigma) / 2
    return (2 * np.pi) ** (-D / 2) * np.prod(sigma) ** (-1 / 2) * np.exp(exponent)
actual = np.array([normal_likelihoods(X, mu, sigma) for mu, sigma in zip(mus, sigmas)])
actual = sklearn.preprocessing.normalize(actual * alphas[:, np.newaxis], norm='l1', axis=0)
resp = sklearn.preprocessing.normalize(resp, norm='l1', axis=0)
print('actual likelihoods', actual)
print('log likelihoods   ', resp)
rmses = np.sqrt(sklearn.metrics.mean_squared_error(actual.T, resp.T, multioutput='raw_values'))
for i, rmse in enumerate(rmses):
    print('K={} rmse={}'.format(i, rmse))

NameError: name 'sess' is not defined

In [4]:
# N = 10
# N = 33
# p = 20 # D is better?
# K = 6
# k = 0 # maximum band offset (0 is just diagonal)
tissues_to_cluster = train_tissues
X_np = train_df_int[train_tissues].values.transpose()

# np.random.seed(2)
# rc = np.random.choice(range(len(train_df)), D, replace=False)
# rmu = np.random.choice(range(N), K, replace=False)

# np.random.shuffle(X_np)
# per = N // K
# splits = [X_np[i:i+per] for i in range(0, N // K * K, per)]


# X_trunc = X_np[:N, rc]
# X_trunc = X_np[:N]
# mu_init = X_trunc[rmu]
# mu_init = np.array([x.mean(axis=0) for x in splits])
#mu_init = X_np[19:22]
# print(mu_init.shape)

In [5]:
print(X_np.shape)



(34, 379551)


In [9]:
# Fix a value of K. Perform NUM_RESTARTS random restarts, and pick EM fit with highest mean likelihood 
# X_np_new = data matrix with bad samples deleted.
# X_perm   = permute(X_np_new)
# K        = number of clusters
# NUM_RESTARTS = number of random restarts

K = 5
NUM_RESTARTS = 1

def get_permutation():
    o = np.ones(len(train_df))
    o[train_ix] = 0
    o[test_ix] = 0
    unobserved_untested_ix = np.where(o)[0]
    o = np.zeros(len(train_df))
    o[test_ix] = 1
    unobserved_tested_ix = np.where(o)[0]
    permutation = np.hstack((train_ix, unobserved_tested_ix, unobserved_untested_ix))
    return permutation, unobserved_tested_ix, unobserved_untested_ix

def fit_model(X_train, unobserved_tested_ix, unobserved_untested_ix):
    N = X_train.shape[0]
    tf.reset_default_graph()
    lp = None
    m = None
    s = None
    a = None
    for j in range(NUM_RESTARTS):
        rmu = np.random.choice(range(X_train.shape[0]), K, replace=False)
        mu_init = X_train[rmu]
        cur_lp, cur_m, cur_s, cur_a = fit_em(X_train, mu_init, 100, EPS, 0.1)
        if lp is None or cur_lp > lp:
            lp = cur_lp
            m = cur_m
            s = cur_s
            a = cur_a
    print("Done picking best EM")
    
    all_alphas = np.zeros((K, N))
    for sample in range(N):
        X_test = X_train[sample, :]
        observed = X_test
        marginal_means, marginal_covs, marginal_alphas = marginal_posterior(observed.reshape(1, len(observed)), m, s, a)
        all_alphas[:, sample] = marginal_alphas.transpose()[:,0]
        
    
    
    observed = test_df['filled'][train_ix].values
    marginal_means, marginal_covs, marginal_alphas = marginal_posterior(observed.reshape(1, len(observed)), m, s, a)

    pred = argmax_exp(marginal_means, marginal_covs, marginal_alphas[0])[:len(unobserved_tested_ix)]
    actual = test_df['filled'][unobserved_tested_ix]
    print(len(pred), len(actual))
    rmse = sklearn.metrics.mean_squared_error(actual, pred)
    print('rmse', np.sqrt(rmse)) # rmse of GMM
    r2 = sklearn.metrics.r2_score(actual, pred)
    print('r2', r2)
    return all_alphas, marginal_means, marginal_covs, marginal_alphas, lp, m, s, a
    
    
'''
    observed = X_test[:len(train_ix)]
    marginal_means, marginal_covs, marginal_alphas = marginal_posterior(observed.reshape(1, len(observed)), m, s, a, sess)

    pred = argmax_exp(marginal_means, marginal_covs, marginal_alphas[0])[:len(unobserved_tested_ix)]
    actual = X_test[len(train_ix):len(train_ix)+len(unobserved_tested_ix)]
    print(len(pred), len(actual))
    
    # Compute the rmse and r2
    mse = sklearn.metrics.mean_squared_error(actual, pred)
    rmse = np.sqrt(mse)
    print('rmse', rmse) # rmse of GMM
    r2 = sklearn.metrics.r2_score(actual, pred)
    print('r2', r2)
'''

print("X_np.shape", X_np.shape) 

# delete bad samples
X_np_new = X_np[:][:33]
X_np_new = np.delete(X_np_new, [14,25,26], 0)
print(X_np_new.shape)

perm, unobserved_tested_ix, unobserved_untested_ix = get_permutation()
print(perm)
X_perm = X_np_new[:, perm]

print("K", K)
print("NUM_RESTARTS", NUM_RESTARTS)
print("X_perm.shape", X_perm.shape) 

#KxN responsibilities
all_alphas, marginal_means, marginal_covs, marginal_alphas, marginal_logp, mus, sigs, alphas = fit_model(X_perm, unobserved_tested_ix, unobserved_untested_ix)
print(marginal_alphas)

X_np.shape (34, 379551)
(30, 379551)
[  1084   1154   1214 ..., 378792 378806 379168]
K 5
NUM_RESTARTS 1
X_perm.shape (30, 379551)
Done picking best EM
368411 368411
rmse 0.0816920895421
r2 0.766615879372
[[-1728.81150894 -1766.86748946     0.         -1986.86303949
  -1008.62999452]]


In [10]:
print(all_alphas)
np.save('all_alphas.npy', all_alphas)

[[-34371.25051055 -36417.2580561  -37982.31618604 -39747.34410485
  -43923.12738274 -51668.13070466 -50283.0206933  -44519.76572617
  -25967.71717483 -20138.13803852 -30930.43369295 -47068.77734709
  -14748.85333695 -24493.62938383 -15336.68869708  -8210.51786708
  -25576.52869297      0.         -55380.57002862 -45198.52900181
  -52256.08215395 -51096.70536267 -37602.73122082 -49561.32648987
  -32733.26690678 -20580.66763318      0.         -19874.9379456
  -47029.00398864 -30056.0241815 ]
 [     0.              0.              0.              0.         -46415.83846135
  -55731.88471541 -53669.04180056 -46843.06351924 -46130.13978221      0.
  -17053.4189288  -63929.61949275 -31095.42735754 -39633.52345225
  -33699.01792111 -28907.73903501 -40678.01085388 -36269.35835991
  -58775.321418   -47485.91086052 -54046.92890203 -54226.27344111
  -41687.20946184 -50467.44136672 -37574.25566562 -23282.10151399
  -45664.82831051 -37005.64314673 -49457.3853626  -19327.95225339]
 [-46529.56430162

In [12]:
np.save("mus", mus)
np.save("sigmas", sigs)
np.save("alphas", alphas)

In [None]:
print(marginal_alphas.shape)
print(marginal_covs.shape)
print(marginal_means.shape)

K = marginal_alphas.shape[1]

print("before:", np.sum(marginal_alphas))
alphs = np.exp(marginal_alphas)
print("after:", np.sum(alphs))
alphs = np.ndarray.tolist(alphs[0,:])

SAMPLE_SIZE = 100

# Pick a random clusters
samples = np.random.choice(len(alphas), SAMPLE_SIZE, p=alphas)
cnts = np.bincount(samples)
print(cnts)

print(marginal_means[1,:])

#S = [] #np.zeros((SAMPLE_SIZE, 2))
#for s in samples:
    # Pick a sample from this normal
    #np.random.normal(marginal_means[s,:], marginal_covs[s], size=(K,N))
    
    # Add the sample to the set of samples