In [1]:
#Imports
import numpy as np
import h5py as h5
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.cross_validation import KFold
import pickle

# Useful plotting thingies:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 12, 12

In [2]:
#datasets
data_path = '/projects/nikhil/ADNI_prediction/input_datasets/'

#input data
train_val_file = data_path + 'cli_ct_seg_fused_train_plus_val.pkl'
test_file = '/projects/francisco/data/ADNI/cli_ct_seg_fused_test.pkl'

#k-fold indices (from a saved file)
kf_file = data_path + 'cli_ct_train_valid_KFold_idx.pkl'


In [3]:
#Grab CV data with specific feature columes (independent vars) and specific clinical scale (dependent var)
def load_CV_data(in_file, kf_file, clinical_scale, feature_cols):
    data = pd.read_pickle(in_file)
    all_regex = '|'.join(['^{}'.format(var) for var in feature_cols + clinical_scale])
    all_vars = '|'.join(['^{}'.format(var) for var in feature_cols])
    data_trunc = data.filter(regex=all_regex)
    data_trunc = data_trunc.dropna(how='any')
    X = np.asarray(data_trunc.filter(regex=all_vars), dtype=float)
    y = np.asarray(data_trunc[clinical_scale], dtype=float)
    kf = pickle.load( open(kf_file, "rb" ) )
    X_train = []
    X_valid = []
    y_train = []
    y_valid = []
    for train, valid in kf:        
        X_train.append(X[train])
        X_valid.append(X[valid])
        y_train.append(y[train])
        y_valid.append(y[valid])

    var_names = data_trunc.filter(regex=all_vars).columns
    
    # Return train and validation lists comprising all folds
    return {'X_train':X_train,'X_valid':X_valid,'y_train':y_train,'y_valid':y_valid}, var_names

#Load test data
def load_test_data(in_file, clinical_scale, feature_cols):

    data = pd.read_pickle(in_file)
    data_trunc = data[clinical_scale + feature_cols]
    # remove nans
    data_trunc = data_trunc[np.isfinite(data_trunc[clinical_scale])]
    X = np.asarray(data_trunc[feature_cols],dtype=float)
    y = np.asarray(data_trunc[clinical_scale],dtype=float)
    return {'X':X, 'y':y}

In [4]:
feature_cols = ['CT']
clinical_scale = ['ADAS13']

cv_data, var_names = load_CV_data(train_val_file,kf_file, clinical_scale, feature_cols)
#test_data = load_test_data(test_file, feature_cols, clinical_scale)
print cv_data['X_train'][0].shape

(522, 74)


In [29]:
# Define a model

# The two independent vars are highly correlated, so an ordinary least squares regression may
# be the best idea: the model will be highly sensitive to random errors in the response var. See 
# https://en.wikipedia.org/wiki/Multicollinearity

# Space for hyperparam search
alphas=np.logspace(-5,5,num=100)

# We can instead try a ridge regression:
from sklearn import linear_model
from sklearn.metrics import r2_score
BaseReg=linear_model.RidgeCV
    
class Regression(BaseReg):
    def __init__(self, *args, **kwargs):
        super(Regression, self).__init__(*args,**kwargs)
    

    def fit(self, X, y):
        self = super(Regression, self).fit(X,y)
        df = X.shape[0] - X.shape[1] - 1
        y_hat = self.predict(X)
        sse = np.sum(np.square(y - y_hat), axis=0)
        self.variance_y = sse/df
        self.variance_X = (np.matrix(X).T * np.matrix(X)).I
        self.zscores_ = self.coef_ / np.sqrt(self.variance_y * self.variance_X.diagonal())
        self.pvals_ = 1 - scipy.stats.t.cdf(abs(self.zscores_), df)
    
    def score(self, X, y):
        N, P = X.shape
        y_hat = self.predict(X)
        
        # R2 and adjusted R2:
        self.R2 = r2_score(y, y_hat)
        self.mse = mean_squared_error(y, y_hat)
        self.adj_R2 = self.R2 - (1 - self.R2) * ((N - 1) / (N - P - 1))
        
        self.var_y = np.var(y)
        
        self.ssm = np.sum(np.square(y_hat - y.mean()))
        self.sse = np.sum(np.square(y - y_hat))
        self.sst = np.sum(np.square(y - y.mean()))
        
        self.r = scipy.stats.pearsonr(y, y_hat)
        
        return self.R2
        

if BaseReg==linear_model.RidgeCV:
    reg = Regression(alphas=alphas, normalize=False, fit_intercept=True)
else:
    reg = Regression(normalize=False, fit_intercept=True)

In [30]:
from sklearn.metrics import mean_squared_error, r2_score

# Cross-validation loop
stats = []
coefs = []
alphas = []
zscores = []
pvals = []

for fold in range(len(cv_data['X_train'])):
    X = cv_data['X_train'][fold]
    y = cv_data['y_train'][fold]
    X_v = cv_data['X_valid'][fold]
    y_v = cv_data['y_valid'][fold]

    reg.fit(X, y)
    y_hat = reg.predict(X)
    # validation:
    y_hat_v = reg.predict(X_v)
    mse = mean_squared_error(y_v, y_hat_v)
    reg.score(X_v, y_v)
    stats.append((reg.R2, reg.adj_R2, reg.mse, reg.r[0][0], reg.r[1][0]))
    
    try:
        alphas.append(reg.alpha_)
    except:
        pass
    coefs.append(reg.coef_)
    zscores.append(reg.zscores_)
    pvals.append(reg.pvals_)

In [31]:
def make_table(k, var_names, **kwargs):
    header_mean = ['Mean {}'.format(key) for key in kwargs.keys()]
    header_std = ['Std {}'.format(key) for key in kwargs.keys()]
    header = header_mean + header_std
    header[::2]=header_mean
    header[1::2]=header_std
    header = ['Variables'] + header
    table = np.zeros(shape=(len(var_names),2 * len(kwargs.keys())))
    for i, stat in enumerate(kwargs.values()):
        M=stat.reshape(k,-1).T
        table[:, i*2] = np.mean(M,axis=1)
        table[:, (i*2)+1] = np.std(M,axis=1)
    
    frame = pd.DataFrame(table)
    frame.insert(0, 'Variable', var_names)
    frame.columns = header
    return frame

In [32]:
from IPython.display import display

stats = np.array(stats)
coefs = np.array(coefs)
alphas = np.array(alphas)
zscores = np.array(zscores)
pvals = np.array(pvals)
print 'Mode of regularization param: {}'.format(scipy.stats.mode(alphas)[0])

model=make_table(10, ['Model'], R2=stats[:,0], Adj_R2=stats[:,1], MSE=stats[:,2], r=stats[:,3], pval=stats[:,4])
display(model)
table = make_table(10, var_names, coefs=coefs, zscores=zscores, pvals=pvals)
table=table.sort(['Mean coefs'], ascending=[0])
display(table)




Mode of regularization param: [ 18.3073828]


Unnamed: 0,Variables,Mean Adj_R2,Std Adj_R2,Mean pval,Std pval,Mean MSE,Std MSE,Mean r,Std r,Mean R2,Std R2
0,Model,3.328371,0.181328,0.00053,0.000752,60.170812,10.573001,0.498638,0.077794,0.223876,0.060443


Unnamed: 0,Variables,Mean zscores,Std zscores,Mean pvals,Std pvals,Mean coefs,Std coefs
48,CT_SMA.R,0.720475,0.077005,0.236460,0.023694,2.107468,0.207964
71,CT_ACG.R,0.602481,0.183849,0.276971,0.059919,1.919799,0.584924
12,CT_PCL.L,0.648949,0.065526,0.258799,0.021285,1.911768,0.176688
51,CT_ROL.R,0.345773,0.096979,0.365486,0.035802,1.597056,0.430110
4,CT_ORBmid.L,0.447485,0.081238,0.327899,0.029326,1.533683,0.258842
50,CT_PreCG.R,0.300324,0.036932,0.382114,0.014026,1.285578,0.146590
42,CT_ORBinf.R,0.336163,0.068103,0.368755,0.025492,1.236782,0.254377
5,CT_ORBinf.L,0.327280,0.047781,0.371949,0.017959,1.204865,0.168279
8,CT_IFGoperc.L,0.276965,0.059511,0.391150,0.022884,1.204452,0.255295
15,CT_PoCG.L,0.206632,0.032015,0.418239,0.012457,1.147584,0.181475
