In [19]:
#Imports
import numpy as np
import h5py as h5
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.cross_validation import KFold
import pickle

# Plotting setup for the notebook: render matplotlib figures inline,
# switch to the 'ggplot' style sheet, and make 12 x 12 the default
# figure size for every plot created afterwards.
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 12, 12

In [20]:
# Dataset locations
# Directory that holds the train/validation table and the saved fold indices.
data_path = '/projects/nikhil/ADNI_prediction/input_datasets/'

# Input tables. Note that the test table is read from a different directory
# than data_path.
train_val_file = ''.join((data_path, 'cli_ct_seg_fused_train_plus_val.pkl'))
test_file = '/projects/francisco/data/ADNI/cli_ct_seg_fused_test.pkl'

# Pickled k-fold train/validation indices (loaded by load_CV_data).
kf_file = ''.join((data_path, 'cli_ct_train_valid_KFold_idx.pkl'))


In [21]:
# Grab CV data with specific feature columns (independent vars) and a specific clinical scale (dependent var)
def load_CV_data(in_file, kf_file, clinical_scale, feature_cols):
    """Load the train+validation table and split it into cross-validation folds.

    Columns are selected by *prefix*: every column whose name starts with one
    of the entries in ``feature_cols`` or ``clinical_scale`` is kept, and rows
    with a missing value in any kept column are dropped. The feature columns
    are then summed row-wise into a single predictor.

    Parameters
    ----------
    in_file : str
        Path to a pickled pandas DataFrame.
    kf_file : str
        Path to a pickle holding an iterable of ``(train_idx, valid_idx)``
        pairs. The indices are applied positionally to the rows that remain
        after the NaN rows were dropped.
    clinical_scale : list of str
        Column name(s) of the dependent variable. Also used as a prefix when
        deciding which columns take part in the NaN filtering.
    feature_cols : list of str
        Column-name prefixes of the independent variables.

    Returns
    -------
    dict
        Keys ``'X_train'``, ``'X_valid'``, ``'y_train'``, ``'y_valid'``; each
        value is a list with one array per fold. ``X`` arrays are 1-D (row sums
        of the feature columns); ``y`` arrays have one column per entry in
        ``clinical_scale``.
    """
    data = pd.read_pickle(in_file)

    # Anchored alternations, e.g. '^CT|^ADAS13', so names are matched by prefix.
    all_regex = '|'.join(['^{}'.format(var) for var in feature_cols + clinical_scale])
    ct_regex = '|'.join(['^{}'.format(var) for var in feature_cols])

    data_trunc = data.filter(regex=all_regex)
    data_trunc = data_trunc.dropna(how='any')
    features = data_trunc.filter(regex=ct_regex)

    # Show which columns were picked up by the prefix match.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(ct_regex)
    print(features.columns)

    # Collapse all matched feature columns into one summed value per row.
    X = np.sum(np.asarray(features, dtype=float), axis=1)
    y = np.asarray(data_trunc[clinical_scale], dtype=float)

    # NOTE: pickle can execute arbitrary code while loading -- only point this
    # at fold files from a trusted source. The context manager makes sure the
    # file handle is closed (the bare open() used previously leaked it).
    with open(kf_file, 'rb') as fold_fh:
        kf = pickle.load(fold_fh)

    X_train = []
    X_valid = []
    y_train = []
    y_valid = []
    for train, valid in kf:
        X_train.append(X[train])
        X_valid.append(X[valid])
        y_train.append(y[train])
        y_valid.append(y[valid])

    # Return train and validation lists comprising all folds
    return {'X_train': X_train, 'X_valid': X_valid, 'y_train': y_train, 'y_valid': y_valid}

#Load test data
def load_test_data(in_file, clinical_scale, feature_cols):
    """Load the held-out test table as feature and target arrays.

    Unlike ``load_CV_data``, columns are selected by *exact* name and the
    feature columns are returned as-is (not summed).

    Parameters
    ----------
    in_file : str
        Path to a pickled pandas DataFrame.
    clinical_scale : list of str
        Exact column name(s) of the dependent variable.
    feature_cols : list of str
        Exact column names of the independent variables.

    Returns
    -------
    dict
        ``'X'``: float array with one column per entry in ``feature_cols``;
        ``'y'``: float array with one column per entry in ``clinical_scale``.
        Rows whose clinical score is NaN or infinite are excluded from both.
    """
    data = pd.read_pickle(in_file)
    data_trunc = data[clinical_scale + feature_cols]

    # Remove rows with a non-finite clinical score. The mask must be a 1-D
    # per-row boolean array: indexing a DataFrame with a boolean *DataFrame*
    # (which is what np.isfinite(data_trunc[clinical_scale]) yields, because
    # clinical_scale is a list) masks individual cells instead of dropping
    # rows, and blanks out every feature column with NaN.
    scores = np.asarray(data_trunc[clinical_scale], dtype=float)
    finite_rows = np.isfinite(scores).all(axis=1)
    data_trunc = data_trunc[finite_rows]

    X = np.asarray(data_trunc[feature_cols], dtype=float)
    y = np.asarray(data_trunc[clinical_scale], dtype=float)
    return {'X': X, 'y': y}

In [22]:
# Dependent variable (clinical score column) and the prefix that selects the
# independent-variable columns.
clinical_scale = ['ADAS13']
feature_cols = ['CT']

cv_data = load_CV_data(
    in_file=train_val_file,
    kf_file=kf_file,
    clinical_scale=clinical_scale,
    feature_cols=feature_cols,
)
# Test-set loading is currently disabled. Argument order follows the
# signature load_test_data(in_file, clinical_scale, feature_cols).
# NOTE(review): load_test_data selects columns by exact name, whereas
# load_CV_data matches by prefix -- confirm feature_cols suits both before
# enabling this call.
#test_data = load_test_data(test_file, clinical_scale, feature_cols)

^CT
Index([u'CT_REC.L', u'CT_OLF.L', u'CT_ORBsup.L', u'CT_ORBsupmed.L',
       u'CT_ORBmid.L', u'CT_ORBinf.L', u'CT_SFGdor.L', u'CT_MFG.L',
       u'CT_IFGoperc.L', u'CT_IFGtriang.L', u'CT_SFGmed.L', u'CT_SMA.L',
       u'CT_PCL.L', u'CT_PreCG.L', u'CT_ROL.L', u'CT_PoCG.L', u'CT_SPG.L',
       u'CT_IPL.L', u'CT_SMG.L', u'CT_ANG.L', u'CT_PCUN.L', u'CT_SOG.L',
       u'CT_MOG.L', u'CT_IOG.L', u'CT_CAL.L', u'CT_CUN.L', u'CT_LING.L',
       u'CT_FFG.L', u'CT_HES.L', u'CT_STG.L', u'CT_MTG.L', u'CT_ITG.L',
       u'CT_TPOsup.L', u'CT_TPOmid.L', u'CT_ACG.L', u'CT_DCG.L', u'CT_PCG.L',
       u'CT_REC.R', u'CT_OLF.R', u'CT_ORBsup.R', u'CT_ORBsupmed.R',
       u'CT_ORBmid.R', u'CT_ORBinf.R', u'CT_SFGdor.R', u'CT_MFG.R',
       u'CT_IFGoperc.R', u'CT_IFGtriang.R', u'CT_SFGmed.R', u'CT_SMA.R',
       u'CT_PCL.R', u'CT_PreCG.R', u'CT_ROL.R', u'CT_PoCG.R', u'CT_SPG.R',
       u'CT_IPL.R', u'CT_SMG.R', u'CT_ANG.R', u'CT_PCUN.R', u'CT_SOG.R',
       u'CT_MOG.R', u'CT_IOG.R', u'CT_CAL.R', u'CT_CUN.R', 

In [33]:
import scipy
from sklearn.metrics import mean_squared_error, r2_score

# Cross-validation loop: for every fold, fit a simple linear regression on
# the training split and score its predictions on the validation split.
# NOTE: `stats` below rebinds the name of the module brought in by
# `from scipy import stats`, which is why the module is reached through
# `scipy.stats` inside the loop.
stats = []
coefs = []

for fold in range(len(cv_data['X_train'])):
    # Training split, flattened to 1-D for linregress; validation split as stored.
    X, y = cv_data['X_train'][fold].ravel(), cv_data['y_train'][fold].ravel()
    X_v, y_v = cv_data['X_valid'][fold], cv_data['y_valid'][fold]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(X, y)

    # Predictions of the fitted line on the training and validation inputs.
    y_hat = intercept + slope * X
    y_hat_v = intercept + slope * X_v

    # Validation metrics.
    R2 = r2_score(y_v, y_hat_v)
    mse = mean_squared_error(y_v, y_hat_v)
    r = scipy.stats.pearsonr(y_v.ravel(), y_hat_v)

    # Per-fold record: (validation R^2, p-value of the training fit,
    # validation MSE, validation Pearson r, p-value of that Pearson r).
    coefs.append(slope)
    stats.append((R2, p_value, mse, r[0], r[1]))
    

In [34]:
# Summarise the per-fold results as mean and standard deviation across folds.
# Columns of `stats`: 0 = validation R^2, 1 = p-value of the training-fold
# regression, 2 = validation MSE, 3 = validation Pearson r, 4 = its p-value.
coefs = np.array(coefs)
stats = np.array(stats)

# Single-argument print(...) gives the same output on Python 2 and also runs
# on Python 3, where the bare print statement is a SyntaxError.
print('Mean R^2: {}, STD of R^2: {}'.format(np.mean(stats[:,0]), np.std(stats[:,0])))
print('Mean p-val: {}, STD of p-val: {}'.format(np.mean(stats[:,1]), np.std(stats[:,1])))
print('Mean Coefs: {}, STD of Coefs:{}'.format(np.mean(coefs, axis=0), np.std(coefs, axis=0)))
print('Mean MSE (val): {}, STD of MSE (val): {}'.format(np.mean(stats[:,2]), np.std(stats[:,2])))
print('Mean r (val): {}, STD of r (val): {}'.format(np.mean(stats[:,3]), np.std(stats[:,3])))


Mean R^2: 0.0977452541679, STD of R^2: 0.0705056686766
Mean p-val: 2.87099784776e-16, STD of p-val: 4.44459985743e-16
Mean Coefs: -0.273940278558, STD of Coefs:0.0100645030749
Mean MSE (val): 69.8623487999, STD of MSE (val): 12.0675500678
Mean r (val): 0.356104458657, STD of r (val): 0.0876362572019
