In [1]:
%%capture
from tqdm.notebook import tqdm
tqdm().pandas()

In [20]:
# Classification Options
# Each flag below is read by the analysis cell that follows this one.

# Column of dependent.csv that is (optionally z-scored and then) binarized into the class label.
DEPENDENT_VARIABLE = 'avg2ndhalf'
# Inner-merge granger.csv into the feature table; subjects missing from that file are dropped.
USE_GRANGER = True
# Left-merge imputed_numerical_psychometrics.csv into the feature table.
USE_NUMERICAL_PSYCHOMETRICS = True
# Left-merge unimputed_categorical_psychometrics.csv (DIAG_01#CODE, resting_asleep,
# feedback_asleep). Required by ONLY_CONTROL, ONLY_PATH, RESTING_AWAKE and FEEDBACK_AWAKE.
USE_CATEGORICAL_PSYCHOMETRICS = True
# Keep only rows whose DIAG_01#CODE equals '71.09'. Cannot be combined with ONLY_PATH.
ONLY_CONTROL = True
# Keep only rows whose DIAG_01#CODE differs from '71.09'. Cannot be combined with ONLY_CONTROL.
ONLY_PATH = False
# Print the split number and the decision-function value for every leave-one-out fold.
VERBOSE = False
# display() the table of sorted feature coefficients at the end of the run.
DISPLAY_COEFS = True
# NOTE(review): not referenced by the code visible in this notebook - confirm whether it is still needed.
SHOW_PARAMS = True
# z-score the dependent variable before binarizing, so the class threshold is the sample mean rather than 0.
SCALE_DEPENDENT = True
# Keep only rows whose resting_asleep value is '0.0'.
RESTING_AWAKE = True
# Keep only rows whose feedback_asleep value is '0.0'.
FEEDBACK_AWAKE = False
# Grid-search only linear-kernel SVCs; when False, rbf and poly kernels are searched as well.
ONLY_LINEAR_KERNEL = True

In [21]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from scipy import stats
import warnings
from sklearn.feature_selection import SelectKBest, mutual_info_regression
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics
from sklearn.metrics import roc_auc_score
# warnings.filterwarnings('ignore')

print("DEPENDENT_VARIABLE =", DEPENDENT_VARIABLE)

# --- Reject option combinations that cannot be satisfied -------------------
if ONLY_CONTROL and ONLY_PATH:
    raise SystemExit("Can't compute both ONLY_CONTROL and ONLY_PATH.")

if not USE_CATEGORICAL_PSYCHOMETRICS:
    # The subject filters below use DIAG_01#CODE / resting_asleep / feedback_asleep,
    # which are loaded together with the categorical psychometrics.
    if ONLY_CONTROL or ONLY_PATH:
        raise SystemExit("Can't separate by clinical status without CATEGORICAL PSYCHOMETRICS.")
    if RESTING_AWAKE or FEEDBACK_AWAKE:
        raise SystemExit("Can't separate by awakeness without CATEGORICAL PSYCHOMETRICS.")

# --- Assemble one table: dependent variable + every enabled feature source --
# Start from the dependent variable alone, indexed by subject ID.
ydf = pd.read_csv("./dependent.csv").set_index('ID')[[DEPENDENT_VARIABLE]]

if USE_NUMERICAL_PSYCHOMETRICS:
    numerical_psychometrics = pd.read_csv('./imputed_numerical_psychometrics.csv')
    # Number of numerical columns, not counting the ID column.
    numerical_count = numerical_psychometrics.shape[1] - 1
    ydf = ydf.merge(numerical_psychometrics, how='left', on='ID')

if USE_GRANGER:
    print("Using Granger causality.")
    print("Right now, this means excluding 3 subjects because my code encounters an error when calculating causality for them and I haven't yet figured out why.")
    granger = pd.read_csv('./granger.csv')
    # Inner join: subjects without Granger features are removed here.
    ydf = ydf.merge(granger, how='inner', on='ID')

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_psychometrics = pd.read_csv('./unimputed_categorical_psychometrics.csv')
    # Store the categorical codes as strings so they can be matched against
    # string literals below and one-hot encoded later.
    categorical_psychometrics = categorical_psychometrics.astype({
        'DIAG_01#CODE': str,
        'resting_asleep': str,
        'feedback_asleep': str,
    })
    ydf = ydf.merge(categorical_psychometrics, how='left', on='ID')

# --- Restrict the subject pool ---------------------------------------------
if ONLY_CONTROL:
    print("Using only subjects with no clinical diagnosis (DIAG_01#CODE == V71.09).")
    ydf = ydf.loc[ydf['DIAG_01#CODE'] == '71.09']
elif ONLY_PATH:
    ydf = ydf.loc[ydf['DIAG_01#CODE'] != '71.09']
    print("Using only subjects with a clinical diagnosis (DIAG_01#CODE != V71.09).")

if RESTING_AWAKE:
    ydf = ydf.loc[ydf['resting_asleep'] == '0.0']

if FEEDBACK_AWAKE:
    ydf = ydf.loc[ydf['feedback_asleep'] == '0.0']

# --- Sanity check: rank correlation between the dependent variable and age --
print()
print("Sanity Check - correlating DEPENDENT_VARIABLE with bids_age")
r, p = stats.spearmanr(ydf[DEPENDENT_VARIABLE].values, ydf['bids_age'].values)
print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)
print()

def categorize(responder):
    """Map a numeric outcome onto a class label.

    Returns 'responder' when the value is strictly greater than zero and
    'non-responder' for everything else (zero, negatives, and NaN, since
    any comparison with NaN is False).
    """
    return 'responder' if responder > 0 else 'non-responder'

# Work on an explicit copy: after the row filters earlier in this cell `ydf` can
# be a slice of the merged frame, and assigning columns to a slice raises
# SettingWithCopyWarning.
ydf = ydf.copy()

if SCALE_DEPENDENT:
    #scaling y
    print("SCALING DEPENDENT VARIABLE")
    print("'correct' classification is now based on Z-score > or < 0 instead of absolute value. This is very different.")
    scaler = StandardScaler()
    # NOTE(review): the scaler is fit on every remaining subject, so the class
    # threshold (the sample mean) also depends on whichever subject is later
    # held out by the leave-one-out loop.
    # fit_transform returns an (n, 1) array; flatten it so a plain 1-D column is stored.
    ydf[DEPENDENT_VARIABLE] = scaler.fit_transform(ydf[DEPENDENT_VARIABLE].values.reshape(-1, 1)).ravel()

#applying categorization function, creating appropriate data types for X and y
ydf['binarized_dependent'] = ydf[DEPENDENT_VARIABLE].map(categorize)
y = ydf['binarized_dependent'].values

# Everything except the target columns and the subject ID is a feature.
# `ydf` is rebound (not mutated in place) and still ends up holding only the
# feature columns, exactly as before.
ydf = ydf.drop(columns=[DEPENDENT_VARIABLE, 'ID', 'binarized_dependent'])
X = ydf
featnames = X.columns #getting names of features (un-encoded)

# Split the feature columns into categorical (one-hot encoded) and numeric
# (standardized) groups based on what is actually present in X. Previously the
# preprocessor was only built when both psychometrics flags were True, so any
# other configuration failed with a NameError at fit_transform.
categorical_features = [feat for feat in ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']
                        if feat in featnames]
numeric_features = [feat for feat in featnames if feat not in categorical_features]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Missing categorical values become their own 'missing' category before encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Numeric columns are listed first, so they occupy the leading columns of the
# transformed matrix, followed by the one-hot columns.
transformers = []
if numeric_features:
    transformers.append(('num', numeric_transformer, numeric_features))
if categorical_features:
    transformers.append(('cat', categorical_transformer, categorical_features))
if not transformers:
    raise ValueError("No feature columns available to preprocess.")

# sparse_threshold=0 always yields a dense array, which the row indexing and
# coefficient handling later in this cell assume.
preprocessor = ColumnTransformer(transformers=transformers, sparse_threshold=0)

#scale numeric features and impute and encode categorical features
# NOTE(review): this is fit on all subjects before the leave-one-out split, so
# scaling statistics include the held-out subject of every fold.
X = preprocessor.fit_transform(X)
loo = LeaveOneOut()

# Prepare progress bar: ask the splitter for the fold count instead of
# exhausting a split generator just to count it.
countsplit = loo.get_n_splits(X)
print("Running Leave-One-Out (LOO) Cross-Validation")

# Hyper-parameter grid for the inner 10-fold grid search. It does not depend on
# the fold, so it is built once rather than on every iteration.
if ONLY_LINEAR_KERNEL:
    tuned_parameters = [{'kernel': ['linear'], 'C': [0.0005, 0.005, 0.05, 0.5, 1, 10]}]
else:
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': ['scale', 1e-3, 1e-4], 'C': [0.0005, 0.005, 0.05, 0.5, 1, 10]},
                        {'kernel': ['poly'], 'gamma': ['scale', 1e-3, 1e-4], 'C': [0.0005, 0.005, 0.05, 0.5, 1, 10]},
                        {'kernel': ['linear'], 'C': [0.0005, 0.005, 0.05, 0.5, 1, 10]}]

splitnum = 0
correct_results = 0
incorrect_results = 0
tests = []   # true label of each held-out subject
preds = []   # predicted label of each held-out subject
scores = []  # decision-function value of each held-out subject
params = None  # last reported best parameter set; used to print only on change
with tqdm(total=countsplit) as pbar:
    for train_index, test_index in loo.split(X):
        splitnum += 1
        if VERBOSE:
            print("split", splitnum)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Set the parameters by cross-validation on the training subjects only
        clf = GridSearchCV(SVC(), tuned_parameters, cv = 10, n_jobs=-1)
        clf.fit(X_train, y_train)

        if params != clf.best_params_:
            params = clf.best_params_
            print("Best parameters set found on development set:", params)

        y_true, y_pred = y_test, clf.predict(X_test)
        # Each fold holds out exactly one subject; compare the scalars rather
        # than relying on the truth value of a length-1 array.
        if y_true[0] == y_pred[0]:
            correct_results += 1
        else:
            incorrect_results += 1

        # Evaluate the decision function once and reuse it for logging and scoring.
        decision_value = clf.decision_function(X_test)[0]
        if VERBOSE:
            print("Decision function output =", decision_value)
        scores.append(decision_value)
        tests.append(y_true[0])
        preds.append(y_pred[0])
        pbar.update(1)

tests = np.asarray(tests) #array "test_sets", basically answer key
preds = np.asarray(preds) #array of predicted values, corresponds to answer key by index
# pos_label has to match the label emitted by categorize() exactly ('responder').
# The previous value "responders" matched no sample, so the ROC curve was
# computed with zero positives.
fpr, tpr, thresholds = metrics.roc_curve(tests, scores, pos_label="responder")
corrects = (tests == preds) # True if algorithm guessed that one correctly.
actually_good = (tests == 'responder') # True if that subject was truly a responder

# Overall tally followed by the 2x2 breakdown (true class x correct/incorrect).
print(correct_results, "total correct results")
print(incorrect_results, "total incorrect results")
print(np.sum(actually_good & corrects), "of the correct results were responders")
print(np.sum(~actually_good & corrects), "of the correct results were not responders")
print(np.sum(actually_good & ~corrects), "of the incorrect results were responders")
print(np.sum(~actually_good & ~corrects), "of the incorrect results were not responders")
print()
# AUC over the pooled held-out decision values from all LOO folds.
print("AUC score =", roc_auc_score(tests, scores))
print()

#Training final run of LOO SVC in order to get the coefficients of the features
# NOTE(review): C comes from the grid search of the *last* LOO fold only, and the
# kernel is forced to linear (coef_ is only available for linear kernels) even
# when the grid search was allowed to pick another kernel - confirm this is intended.
finalsvc = SVC(C= clf.best_params_['C'], kernel='linear')
finalsvc.fit(X, y)

#doing some array manipulation to arrange the features in order by absolute value of coefficient, alongside labels
coefs = finalsvc.coef_[0]

# The preprocessor emits the numeric columns first, in the order they appear in
# featnames once the categorical columns are removed. Index into that filtered
# list (built once, not once per coefficient) so labels stay aligned with the
# coefficients regardless of where the categorical columns sit in featnames.
numeric_names = [feat for feat in featnames if feat not in ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']]
coefs_labels = []
for feature in range(len(coefs)):
    if feature < len(numeric_names):
        coefs_labels.append(numeric_names[feature])
    else:
        # One-hot encoded columns are labelled only by their position.
        coefs_labels.append("non-numerical feature" + str(feature))
coefs_labels = np.asarray(coefs_labels)

# argsort is ascending; flipping yields largest |coefficient| first.
sorted_vals = np.argsort(np.abs(coefs), axis = 0)
coefs_labels = np.flip(coefs_labels[sorted_vals], axis=0)
coefs = np.flip(coefs[sorted_vals], axis=0)
# Single-row frame: one column per feature, ordered by decreasing |coefficient|.
coefs_with_labels = pd.DataFrame(data=np.array([coefs]), columns=coefs_labels)

if DISPLAY_COEFS:
    display(coefs_with_labels)

# Pick the output file from the subject-pool option, then write before
# announcing, so the message is only printed once the file exists.
if ONLY_CONTROL:
    coefs_filename = "feature_coefficients_control.csv"
elif ONLY_PATH:
    coefs_filename = "feature_coefficients_path.csv"
else:
    coefs_filename = "feature_coefficients_all.csv"
coefs_with_labels.to_csv("./" + coefs_filename, index=False)
print("Wrote feature coefficients to " + coefs_filename)

DEPENDENT_VARIABLE = slope
Using Granger causality.
Right now, this means excluding 3 subjects because my code encounters an error when calculating causality for them and I haven't yet figured out why.
Using only subjects with no clinical diagnosis (DIAG_01#CODE == V71.09).

Sanity Check - correlating DEPENDENT_VARIABLE with bids_age
Spearman r = -0.08999714736727094 R2 =  0.008099486534246282 p =  0.5963067401755895

SCALING DEPENDENT VARIABLE
'correct' classification is now based on Z-score > or < 0 instead of absolute value. This is very different.
Running Leave-One-Out (LOO) Cross-Validation


HBox(children=(FloatProgress(value=0.0, max=37.0), HTML(value='')))

Best parameters set found on development set: {'C': 0.0005, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.005, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.0005, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.05, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.0005, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.05, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.005, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.0005, 'kernel': 'linear'}
Best parameters set found on development set: {'C': 0.005, 'kernel': 'linear'}

16 total correct results
21 total incorrect results
16 of the correct results were responders
0 of the correct results were not responders
5 of the incorrect results were responders
16 of the incorrect results were not responders

AUC score = 0.38095238095238093





Unnamed: 0,schaefer99\t17Networks_LH_SalVentAttnA_FrMed_2\t197\t58\t252\t0&&&schaefer310\t17Networks_RH_SalVentAttnB_Ins_2\t250\t152\t216\t0,schaefer366\t17Networks_RH_DefaultA_pCunPCC_4\t251\t253\t4\t0&&&schaefer106\t17Networks_LH_SalVentAttnB_Ins_3\t255\t153\t219\t0,schaefer155\t17Networks_LH_DefaultA_pCunPCC_2\t255\t254\t2\t0&&&schaefer154\t17Networks_LH_DefaultA_pCunPCC_1\t255\t254\t1\t0,schaefer159\t17Networks_LH_DefaultA_pCunPCC_6\t255\t254\t6\t0&&&schaefer311\t17Networks_RH_SalVentAttnB_PFCmp_1\t250\t153\t214\t0,VF_17,schaefer159\t17Networks_LH_DefaultA_pCunPCC_6\t255\t254\t6\t0&&&schaefer161\t17Networks_LH_DefaultA_PFCm_1\t254\t255\t1\t0,schaefer155\t17Networks_LH_DefaultA_pCunPCC_2\t255\t254\t2\t0&&&schaefer367\t17Networks_RH_DefaultA_pCunPCC_5\t251\t253\t5\t0,schaefer158\t17Networks_LH_DefaultA_pCunPCC_5\t255\t254\t5\t0&&&schaefer366\t17Networks_RH_DefaultA_pCunPCC_4\t251\t253\t4\t0,schaefer160\t17Networks_LH_DefaultA_pCunPCC_7\t255\t254\t7\t0&&&schaefer104\t17Networks_LH_SalVentAttnB_Ins_1\t255\t153\t217\t0,DKEFSCWI_13,...,WORD_53,non-numerical feature651,non-numerical feature650,DKEFSTMT_16,DKEFSTMT_17,DKEFSTMT_13,DKEFSTMT_12,DKEFSTMT_10,DKEFSTMT_07,DKEFSTMT_09
0,-0.038786,-0.038126,-0.032132,-0.03156,-0.031059,0.030209,-0.029122,0.028902,0.028203,0.028005,...,3.1e-05,-1.734723e-18,-1.734723e-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Wrote feature coefficients to feature_coefficients_control.csv
