In [4]:
#from docopt import docopt
import numpy as np
import matplotlib.pyplot as plt

def plot_from_logs(train_file, valid_file, log_title):
    '''
    Draw one training curve and one validation curve on the current axes.

    Both files are read with ``np.genfromtxt`` using a comma delimiter and
    skipping the first line as a header.  Column 0 of each file is used as
    the x axis (iterations).  Column 3 of the training log is drawn under the
    "Training Loss" legend label and column 3 of the validation log under
    the "Validation Accuracy" legend label.

    NOTE(review): the previous docstring listed the headers as
    ``#Iters Seconds TrainingLoss LearningRate`` (training) and
    ``#Iters Seconds TestAccuracy TestLoss`` (validation).  Under that layout
    column 3 would be the learning rate / test loss, which contradicts the
    legend labels used here.  Confirm the real column order against the tool
    that writes these logs.

    The function does not create or show a figure; the caller is expected to
    call ``plt.figure()`` before and ``plt.show()`` after.

    :param train_file: path of the training log (CSV, one header row).
    :param valid_file: path of the validation log (CSV, one header row).
    :param log_title: text appended to both legend labels.
    :return: None
    '''
    # genfromtxt collapses a file with a single data row to a 1-D array;
    # force 2-D so the column indexing below keeps working.
    tX = np.atleast_2d(np.genfromtxt(train_file, dtype=float, delimiter=',', skip_header=1))
    vX = np.atleast_2d(np.genfromtxt(valid_file, dtype=float, delimiter=',', skip_header=1))

    t_iters = tX[:, 0]
    v_iters = vX[:, 0]

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(t_iters.shape)

    plt.plot(t_iters, tX[:, 3], label="Training Loss: " + log_title)
    plt.plot(v_iters, vX[:, 3], linewidth=2, label="Validation Accuracy: " + log_title)
    plt.xlabel('Iterations')
    plt.ylabel('Loss/Accuracy')
    # Legend is laid out as a horizontal strip just above the axes.
    plt.legend(bbox_to_anchor=(0., 1.02, 1., 0.102), loc=3, ncol=4, mode="expand", borderaxespad=0.)

    plt.title('Caffe Model')


In [None]:
# Overlay the curves of ten parsed log pairs (tags ff_OF1_T1 .. ff_OF10_T1)
# on a single figure.
plt.figure()
plt.style.use('ggplot')

# Every log file shares this prefix; only the run tag appended below varies.
logs_path = '/projects/nikhil/ADNI_prediction/caffe_training_logs/caffe.INFO.'
log_idx = np.arange(1, 11)

for lid in log_idx:
    log_format = 'ff_OF{}_T1'.format(lid)
    train_file = logs_path + 'train_{}'.format(log_format)
    valid_file = logs_path + 'test_{}'.format(log_format)
    plot_from_logs(train_file, valid_file, log_format)

plt.show()

In [1]:
# Extract Features from a trained net via forward pass
import os
import sys
#from docopt import docopt
import numpy as np
import tables as tb
import caffe
import h5py as h5
from scipy import stats
import matplotlib.pyplot as plt


def load_data(data_path, input_node):
    """Read one node of a PyTables/HDF5 file fully into memory.

    :param data_path: path of the file, opened read-only.
    :param input_node: node name relative to the file root (a leading '/'
        is prepended here).
    :return: the result of slicing the node with ``[:]``.
    """
    data = tb.open_file(data_path, 'r')
    try:
        return data.get_node('/' + input_node)[:]
    finally:
        # Close the handle even when the node is missing or the read fails.
        data.close()


def extract_features(net_file, model_file, target_file, data_path, input_node,
                     batch_size=128, layer='clas'):
    """Run a trained Caffe net forward over a dataset and collect one blob's output.

    The data is pushed through the net in fixed-size batches; the last batch
    is zero-padded up to ``batch_size`` and the padded rows are discarded
    from the result.

    Side effects: changes the process working directory to the directory
    that contains ``net_file`` and prints progress to stdout.

    :param net_file: network definition passed to ``caffe.Net``.
    :param model_file: trained weights passed to ``caffe.Net``.
    :param target_file: currently unused (saving the result is disabled);
        kept so existing callers do not break.
    :param data_path: HDF5 file handed to ``load_data``.
    :param input_node: node name handed to ``load_data``.
    :param batch_size: number of rows fed per forward pass (default 128).
    :param layer: name of the blob whose values are collected
        (default ``'clas'``).
    :return: float array of shape ``(N, D)`` where ``N`` is the number of
        input rows and ``D`` is the second dimension of the chosen blob.
    """
    # Resolve net_file before changing directory so a relative path (or a
    # bare file name, for which dirname is '') keeps pointing at the file.
    # NOTE(review): the chdir is presumably there so relative paths inside
    # the prototxt resolve -- confirm before removing it.
    net_file = os.path.abspath(net_file)
    os.chdir(os.path.dirname(net_file))
    net = caffe.Net(net_file, model_file, caffe.TEST)

    X = load_data(data_path, input_node)
    print('X shape: {}'.format(X.shape))
    N = X.shape[0]
    iters = int(np.ceil(N / float(batch_size)))

    code_layer = net.blobs[layer]
    out_shape = code_layer.data.shape
    print('out_shape: {}'.format(out_shape))
    X_out = np.zeros(shape=(N, out_shape[1]))

    # The first blob of the net is used as the input blob.  Taking it via an
    # iterator works on both Python 2 and 3 (dict views are not indexable).
    data_layer = next(iter(net.blobs.values()))

    data_layer.reshape(batch_size, X.shape[1])  # TODO: only works for 2-D inputs
    net.reshape()

    print('Extracting features from data...')
    print('X_out.shape: {}'.format(X_out.shape))

    for i in range(iters):
        sys.stdout.write('. ')
        start = i * batch_size
        X_b = X[start:start + batch_size, :]
        batch_sampx = X_b.shape[0]
        # Pad last batch with zeros so it matches the reshaped input blob.
        if batch_sampx < batch_size:
            print('Zero-padding last batch with {} rows'.format(batch_size - batch_sampx))
            X_b = np.vstack((X_b, np.zeros((batch_size - batch_sampx, X_b.shape[1]))))

        data_layer.data[...] = X_b
        net.forward()
        # Copy only the rows that correspond to real (non-padded) samples.
        X_out[start:start + batch_sampx] = code_layer.data[0:batch_sampx, :].copy()

    return X_out

In [12]:
# Inputs for a single forward-pass extraction run.
net_file = '/projects/nikhil/ADNI_prediction/caffe_training_logs/net.prototxt'
model_file = '/projects/nikhil/ADNI_prediction/caffe_training_logs/_iter_10000.caffemodel'
data_path = '/projects/nikhil/ADNI_prediction/input_datasets/HC_CT_inflated_CV_OuterFold_1_valid_InnerFold_2.h5'
data_layer = 'Outer_Fold_1_train_Inner_Fold_2_X'
# Passed through to extract_features, which does not write to it at present.
target_file = '/projects/nikhil/ADNI_prediction/caffe_training_logs/test_1'

X_out = extract_features(
    net_file=net_file,
    model_file=model_file,
    target_file=target_file,
    data_path=data_path,
    input_node=data_layer,
)

X shape: (7291, 22025)
out_shape: (128, 1)
Extracting features from data...
X_out.shape: (7291, 1)
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Zero-padding last batch with 5 rows


In [11]:
# NOTE(review): in the visible part of this notebook `labels` is only
# assigned in cells further down, so running top to bottom on a fresh kernel
# raises NameError here.
labels.shape

(7291,)

In [14]:
label_data_path = '/projects/nikhil/ADNI_prediction/input_datasets/HC_CT_inflated_CV_OuterFold_1_valid_InnerFold_3.h5'
labels_dataset_name = 'Outer_Fold_1_train_Inner_Fold_3_y'
# NOTE(review): the extraction cell above reads its inputs from the
# InnerFold_2 file, these labels come from InnerFold_3 and the legend below
# says Inner_Fold_1 -- confirm that features and labels really belong to the
# same split.
# Open read-only and close deterministically; without an explicit mode older
# h5py versions open the file read/write.
with h5.File(label_data_path, 'r') as label_dataset:
    labels = label_dataset[labels_dataset_name][:]

# Select the style before creating the figure so it applies to it as well.
plt.style.use('ggplot')
plt.figure()
plt_row=1
plt_col=1
no_of_plots=1
font_med = 16
font_large = 24
lable_array = ['Outer_Fold_1_train_Inner_Fold_1']

# x is the network output (prediction), y is the stored label (reference).
x = np.squeeze(X_out)
y = np.squeeze(labels)

# One least-squares fit provides both the statistics and the line to draw.
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
fit_fn = np.poly1d([slope, intercept])
if p_value < 0.0001:
    p_value_sig = '<0.0001'
else:
    p_value_sig = str(p_value)
label_str = 'r-value: {:04.2f}'.format(r_value) + '\n' + 'p-value: ' + p_value_sig + '\n' + 'std_err: {:04.2f}'.format(std_err)

for i in np.arange(no_of_plots):
    plt.subplot(plt_row,plt_col,i+1)
    plt.scatter(x, y, c='crimson', label=lable_array[i],s=40)
    # fit_fn takes in x and returns the fitted estimate for y
    plt.plot(x, fit_fn(x),linewidth=3, label=label_str)
    # Axis titles follow the data actually plotted on each axis.
    plt.xlabel('Predicted Score',fontsize=font_large)
    plt.ylabel('Actual Score',fontsize=font_large)
    plt.legend(fontsize=font_med,loc=2)

plt.show()

In [None]:
from math import sqrt
import numpy as np
import h5py as h5
from scipy.stats.mstats_basic import mquantiles
from sklearn.utils.extmath import randomized_svd
import tables as tb
import sys
import os
#import lmdb
import matplotlib.pyplot as plt
from scipy.stats import describe
from sklearn.manifold import TSNE
from sklearn.decomposition import RandomizedPCA
#from activations import visualize_activations, hinton
from scipy.spatial.distance import dice

def get3DVol(HC_input, HC_shape, input_mask):
    """Scatter masked values into a zero-filled buffer and fold it to 2-D.

    A flat buffer of ``prod(HC_shape)`` zeros is created, ``HC_input`` is
    written at the positions selected by ``input_mask``, and the buffer is
    reshaped to ``(-1, HC_shape[2])`` and transposed.  Despite the name, the
    returned array is therefore two-dimensional, with ``HC_shape[2]`` rows.

    :param HC_input: values to place at the masked positions.
    :param HC_shape: shape whose product gives the buffer length; index 2 is
        used as the row count of the result.
    :param input_mask: index (or boolean mask) into the flat buffer.
    :return: 2-D array of shape ``(HC_shape[2], prod(HC_shape) / HC_shape[2])``.
    """
    total = np.prod(HC_shape)
    buf = np.zeros(total)
    buf[input_mask] = HC_input
    last_dim = HC_shape[2]
    return np.transpose(buf.reshape((-1, last_dim)))

def plot_slices(slice_list, baseline_shape, baseline_mask, llimit=0.01, ulimit=0.99, xmin=200, xmax=1600):
    """
    Show every entry of ``slice_list`` as an image in a two-column grid.

    A new figure is created; the caller is responsible for ``plt.show()``.

    :param slice_list: sequence of entries where ``entry[0]`` is the flat
        data vector and ``entry[1]`` the subplot title.
    :param baseline_shape: shape forwarded to ``get3DVol``.
    :param baseline_mask: mask forwarded to ``get3DVol``.
    :param llimit: lower quantile of the data used as the colour minimum.
    :param ulimit: upper quantile of the data used as the colour maximum.
    :param xmin: first column of the image that is displayed.
    :param xmax: end column (exclusive) of the image that is displayed.
    :return: None
    """
    num_slices = len(slice_list)
    plt.style.use('ggplot')
    plt.figure()
    cols = 2
    # Round up with integer arithmetic: an odd number of entries needs an
    # extra row, and subplot() requires an integer on Python 3.
    rows = (num_slices + cols - 1) // cols
    for j, entry in enumerate(slice_list):
        values, title = entry[0], entry[1]
        lower, upper = mquantiles(values, [llimit, ulimit])
        wt_vol = get3DVol(values, baseline_shape, baseline_mask)
        plt.subplot(rows, cols, j + 1)
        # Colour limits come straight from the requested data quantiles.
        plt.imshow(wt_vol[:, xmin:xmax], cmap=plt.cm.Reds, aspect='auto',
                   interpolation='none', vmin=lower, vmax=upper)
        plt.title(title)
        plt.colorbar()
        plt.axis('off')
        
def getDice(X,X_hat):
    """Row-wise Dice dissimilarity between targets and rounded predictions.

    ``X_hat`` is rounded to the nearest integer and both arrays are turned
    into boolean masks (any non-zero value counts as foreground) before each
    pair of rows is passed to ``scipy.spatial.distance.dice``.  The inputs
    are not modified.

    :param X: 2-D array of reference values, one sample per row.
    :param X_hat: 2-D array of predictions with the same shape as ``X``.
    :return: list with one Dice dissimilarity per row (0 means identical
        masks).
    """
    # astype returns a new array; the converted result has to be kept
    # (the previous code discarded it, so no conversion took place).
    X_bin = np.asarray(X).astype(bool)
    X_hat_bin = np.round(X_hat).astype(bool)
    return [dice(X_bin[i, :], X_hat_bin[i, :]) for i in range(X_bin.shape[0])]


In [None]:
from sklearn.manifold import TSNE
#layer_names = ['encoder1','encoder2','encoder3','code']
layer_names = ['encoder1','encoder2','code']

act_title = 'test_35_T10k.h5'

# The per-layer activations and the sigmoid output are read from one file.
input_file = '/projects/nikhil/miccai/visuals/train_logs/' + act_title
with h5.File(input_file, 'r') as input_data:
    layer_acts = [input_data[layer][:] for layer in layer_names]
    X_hat = input_data['output_Sigmoid'][:]

sampx = np.shape(X_hat)[0]

# sim == 1 selects the four-group simulated dataset, anything else the
# two-group dataset.  Each group is (label value, colour, legend text).
sim=0
if sim==1:
    input_file = '/projects/nikhil/miccai/input_data_comb/HC_sim_cat4_data_2.h5'
    feature_key, label_key = 'train_data_1', 'train_classes_1'
    group_styles = [(0, 'mediumturquoise', 'AD'),
                    (1, 'slategray', 'CN'),
                    (2, 'mediumpurple', 'grp3'),
                    (3, 'darksalmon', 'grp4')]
else:
    input_file = '/projects/nikhil/miccai/visuals/train_logs/ad_cn_test.h5'
    feature_key, label_key = 'l_hc_features', 'label'
    group_styles = [(0, 'mediumturquoise', 'AD'),
                    (1, 'slategray', 'CN')]

with h5.File(input_file, 'r') as input_data:
    features = input_data[feature_key][:]
    labels = input_data[label_key][:]

# Row indices of every label group among the first `sampx` samples.
group_rows = [np.where(labels[:sampx] == style[0])[0] for style in group_styles]

X = features[:sampx,:]

plt.figure()
for i in np.arange(len(layer_acts)):
    activations = layer_acts[i]
    print(layer_names[i] + " :" + str(np.mean(activations)))
    if activations.shape[1] != 1:
        tsne = TSNE(n_components=2, random_state=0, init='pca')
        proj = tsne.fit_transform(activations.astype(float))
    else:
        # A single-unit layer has nothing to embed.  Add a zero second
        # column so the 2-D scatter below can still index column 1.
        proj = np.hstack((activations, np.zeros_like(activations)))

    plt.subplot(2,2,i+1)
    for rows, (_, colour, group_label) in zip(group_rows, group_styles):
        plt.scatter(proj[rows, 0], proj[rows, 1], c=colour, alpha=0.7, label=group_label)

    plt.title(layer_names[i] + ' layer activations')
    plt.legend()

plt.show()

In [None]:
# Visualize inputs vs their reconstructions:
mappings = tb.open_file('/projects/nikhil/miccai/visuals/train_logs/data_mappings.h5', 'r')
try:
    baseline_mask = mappings.get_node('/r_datamask')[:]
    volmask = mappings.get_node('/r_volmask')[:]
finally:
    # Release the file handle even if one of the nodes is missing.
    mappings.close()
baseline_shape = volmask.shape

plot_list = []
for _ in range(6):
    # randint excludes the upper bound, so i is always a valid row index
    # (0 .. sampx-1).  random_integers(sampx) drew from 1 .. sampx inclusive,
    # which could index one past the end and never picked row 0.
    i = np.random.randint(0, sampx)
    plot_list.append((X[i], 'X {}'.format(i)))
    plot_list.append((np.round(X_hat[i]), 'X_hat {}'.format(i)))

plot_slices(plot_list, baseline_shape, baseline_mask)
plt.show()

In [None]:
# Overlaid, normalised histograms of the per-sample feature sum for the two
# label groups.
# NOTE(review): `normed` is the legacy matplotlib keyword; newer releases
# expect `density` instead -- confirm against the installed version.
a=0.7
plt.figure()
for group_value, group_name in ((0, 'AD'), (1, 'CN')):
    group_sums = np.sum(features[labels==group_value],axis=1)
    plt.hist(group_sums,alpha=a,normed=1,bins=50,label=group_name)
plt.legend()
plt.show()

In [None]:
# simulation
input_file = '/projects/nikhil/miccai/input_data_comb/HC_sim_cat4_data_2.h5'
with h5.File(input_file, 'r') as input_data:
    features_all = input_data['train_data_1'][:]
    labels_all = input_data['train_classes_1'][:]

# Missing HC cases: drop samples whose feature row sums to zero.
vols = np.sum(features_all,axis=1)
nonempty_rows = np.where(vols!=0)[0]
# Legacy name kept for any cell that still refers to it; note that it holds
# the indices of the NON-empty rows.
empty_rows = nonempty_rows
features = features_all[nonempty_rows,:]
# Filter the labels with the same rows so they stay aligned with `features`.
labels = labels_all[nonempty_rows]

In [None]:
#PCA
from sklearn.decomposition import RandomizedPCA
from sklearn.externals import joblib
# Fit a 64-component randomized PCA on the feature matrix and project the
# same data onto it.  The solver is seeded so repeated runs give the same
# components, in line with the fixed random_state used for TSNE in this
# notebook.
n_components=64
train_pca = RandomizedPCA(n_components=n_components, random_state=0).fit(features)
features_loadings = train_pca.transform(features)

In [None]:
# Mean Dice score of the PCA reconstruction when only the first k
# components are kept, for k = 0 .. 4 (k = 0 reconstructs from the mean only).
pca_dice = []
for k in range(5):
    print(k)
    leading_loadings = features_loadings[:, :k]
    leading_components = train_pca.components_[:k, :]
    feature_recon = leading_loadings.dot(leading_components) + train_pca.mean_
    dice_per_sample = getDice(features, feature_recon)
    pca_dice.append(np.nanmean(dice_per_sample))

In [None]:
# Scratch arithmetic: 36000 divided by 128.
# NOTE(review): with two int literals this is floor division on Python 2
# (which the print statements in this notebook suggest) and true division on
# Python 3 -- presumably an iterations-per-batch estimate; confirm or delete.
36000/128