In [123]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.pylab as pylab

# Useful plotting thingies:
%matplotlib inline
plt.style.use('ggplot')
pylab.rcParams['figure.figsize'] = 12, 8


In [None]:
# Set up the experimental matrices.
# For each split (train / valid / test) load the fused data frame and build
#   X: every column whose name starts with one of the modality prefixes
#   y: the target columns, in exactly the order listed in `targets`
targets = ['ADAS11_bl', 'ADAS13_bl', 'MMSE_bl']
targets_regex = '|'.join(['^{}$'.format(t) for t in targets])
modalities = ['CT', 'L_HC', 'R_HC']
#modalities = ['CT']
modalities_regex = '|'.join(['^{}_'.format(m) for m in modalities])

# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
data_path_template = '/projects/francisco/data/ADNI/cli_ct_seg_fused_{}.pkl'

matrices = []
for split in ['train', 'valid', 'test']:
    # NOTE(review): unpickling executes arbitrary code -- only load trusted files.
    frame = pd.read_pickle(data_path_template.format(split))
    X_cur = frame.filter(regex=modalities_regex).values
    # Select targets by name so that column t of y always corresponds to
    # targets[t]; a regex filter would keep the frame's own column order.
    # A missing target column raises KeyError here instead of failing later.
    y_cur = frame[targets].values.copy()
    # Replace NaNs with the mean of their own column (computed within this
    # split). A single global mean would mix the scales of different targets.
    col_means = np.nanmean(y_cur, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(y_cur))
    y_cur[nan_rows, nan_cols] = col_means[nan_cols]
    matrices.append((X_cur, y_cur))

# Feature names, taken from the last frame loaded (the test split).
var_names = frame.filter(regex=modalities_regex).columns
X, y = matrices[0]
X_v, y_v = matrices[1]
X_t, y_t = matrices[2]

# Sanity checks: shapes of every split and the number of NaNs left in the
# training targets (expected to be 0 after imputation).
print('{} {}'.format(X.shape, y.shape))
print('{} {}'.format(X_v.shape, y_v.shape))
print('{} {}'.format(X_t.shape, y_t.shape))
print(np.sum(np.isnan(y)))

(490, 22020) (490, 3)
(98, 22020) (98, 3)
(110, 22020) (110, 3)
0


In [None]:
from sklearn.linear_model import SGDRegressor, BayesianRidge, RidgeCV
from sklearn.decomposition import RandomizedPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import f_regression
from sklearn import metrics

# Sweep over the number of selected features k. For every (k, target) pair:
#   1. run recursive feature elimination (RFE) on the training split,
#   2. fit both regressors on the selected training features,
#   3. score them on the validation split.
# NOTE(review): RFE is refit len(k_nums) * len(targets) times, which is
# expensive with this dense grid; consider a coarser grid or a larger RFE step.
k_nums = list(range(1, X.shape[1], 5))
k_nums.append(X.shape[1])
k_classifiers = []
# stats axes: (index into k_nums, [k, MSE, R2], [ridge, bayesian ridge], target)
stats = np.zeros(shape=(len(k_nums), 3, 2, len(targets)))
for i, k in enumerate(k_nums):
    for t in range(len(targets)):
        classifier = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=None, fit_intercept=True, scoring=None, normalize=True)
        bayes_classifier = BayesianRidge(compute_score=True, normalize=True)
        classifiers = [classifier, bayes_classifier]

        # Feature selection, fitted on the training split only.
        sel = RFE(classifier, n_features_to_select=k)
        sel.fit(X, y[:, t])
        support = sel.get_support(indices=True)

        # Apply the same feature subset to both splits.
        X_hat = sel.transform(X)
        X_hat_v = sel.transform(X_v)

        # Train on the training split. Fitting on the validation split and
        # then scoring on it would report optimistic, leaked metrics.
        # (RFE is documented to fit a clone of its estimator, so an explicit
        # fit of `classifier` is still needed here.)
        classifier.fit(X_hat, y[:, t])
        bayes_classifier.fit(X_hat, y[:, t])
        k_classifiers.append((classifier, bayes_classifier))

        # Predict on validation data and log stats.
        for l, c in enumerate(classifiers):
            y_hat_v = c.predict(X_hat_v)
            stats[i, 0, l, t] = k
            # sklearn metrics take (y_true, y_pred); R2 is not symmetric.
            stats[i, 1, l, t] = metrics.mean_squared_error(y_v[:, t], y_hat_v)
            stats[i, 2, l, t] = metrics.r2_score(y_v[:, t], y_hat_v)



In [None]:
T = 1 # Current target index (not used in this cell)

# Slice the sweep results; each array has shape (len(k_nums), n_targets).
k = stats[:,0,0]     # number of selected features
rss = stats[:,1,0]   # validation MSE, ridge
brss = stats[:,1,1]  # validation MSE, bayesian ridge
vs = stats[:,2,0]    # validation R2, ridge
bvs = stats[:,2,1]   # validation R2, bayesian ridge

# One (len(k_nums), 5) table per target, stacked along the last axis. Columns:
# K, MSE, Bayes_MSE, R2, Bayes_R2. The target count comes from `stats` itself
# rather than a hard-coded 3.
stats_table = np.ascontiguousarray(
    np.array([k, rss, brss, vs, bvs]).transpose(1, 0, 2))

# Last expression of the cell so the notebook actually displays the shape.
stats_table.shape

In [None]:
from tabulate import tabulate
# Print one table of sweep results per target. The number of targets is read
# from stats_table so the loop always matches the table's shape.
table_headers = ['K', 'MSE', 'Bayes_MSE', 'R2', 'Bayes_R2']
for i in range(stats_table.shape[2]):
    print('')
    print('Y: {}'.format(targets[i]))
    print(tabulate(stats_table[:,:,i], table_headers))


In [None]:
# Validation MSE versus number of selected features, one panel per target.
# Scales, ticks and labels are set on each panel's own axes: settings applied
# before plt.subplot() land on an axes the panels do not use.
max_explicit_ticks = 20  # above this, explicit per-k tick labels just overlap
fig = plt.figure(1)
fig.clf()
n_panels = rss.shape[1]
for i in range(n_panels):
    ax = fig.add_subplot(1, n_panels, i + 1)
    # k is 2-D (one column per target); use this target's column as x so each
    # call draws exactly one line.
    ax.plot(k[:, i], rss[:, i], 'g', label='ridge')
    ax.plot(k[:, i], brss[:, i], 'b', label='bayesian reg')
    ax.set_xscale('log')
    ax.set_yscale('log')
    if len(k_nums) <= max_explicit_ticks:
        ax.set_xticks(k_nums)
        ax.set_xticklabels(k_nums)
    ax.set_title(targets[i])
    ax.set_xlabel('k (selected features)')
    ax.set_ylabel('validation MSE')
    ax.legend()
plt.show()


In [None]:
# Validation R2 versus number of selected features, one panel per target.
# A single-row grid is used so the panels fill the figure, and every panel
# gets the same tick treatment (previously only the last one did).
max_explicit_ticks = 20  # above this, explicit per-k tick labels just overlap
fig = plt.figure()
n_panels = vs.shape[1]
for i in range(n_panels):
    ax = fig.add_subplot(1, n_panels, i + 1)
    # k is 2-D (one column per target); use this target's column as x so each
    # call draws exactly one line.
    ax.plot(k[:, i], vs[:, i], 'g', label='ridge')
    ax.plot(k[:, i], bvs[:, i], 'b', label='bayesian reg')
    if len(k_nums) <= max_explicit_ticks:
        ax.set_xticks(k_nums)
        ax.set_xticklabels(k_nums)
    ax.set_title(targets[i])
    ax.set_xlabel('k (selected features)')
    ax.set_ylabel('validation R2')
    ax.legend()

plt.show()

In [None]:
# Pick a classifier pair and plot its largest feature weights.
# k_classifiers[-1] and `support` both come from the final iteration of the
# sweep cell, so the coefficient vectors line up with var_names[support].
n_top = 25
classifiers_pair = k_classifiers[-1]
c = classifiers_pair[0]  # RidgeCV
b = classifiers_pair[1]  # BayesianRidge

# Rank features by combined absolute weight, ascending, and keep the n_top
# largest (the tail). With barh this puts the most important feature at the
# top of the chart. sorted() on zip() works on both Python 2 and 3.
feats = sorted(zip(var_names[support], c.coef_, b.coef_),
               key=lambda x: abs(x[1]) + abs(x[2]))
feats = feats[-n_top:]
feat_names, ridge_weights, bayes_weights = zip(*feats)

# Overlay the two models' coefficients as semi-transparent horizontal bars.
plt.figure()
plt.title("Feature Coefficients")
y_pos = np.arange(len(feats))
plt.barh(y_pos, width=ridge_weights, height=0.5, color='r', align="center", alpha=0.5, label='Ridge')
plt.barh(y_pos, width=bayes_weights, height=0.5, color='b', align="center", alpha=0.5, label='Bayesian Reg')
plt.yticks(y_pos, feat_names)
plt.legend()
plt.show()