In [1]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats
from sklearn.compose import ColumnTransformer


DEPENDENT_VARIABLE = 'avg2ndhalf'
USE_GRANGER = False
USE_NUMERICAL_PSYCHOMETRICS = True
USE_CATEGORICAL_PSYCHOMETRICS = True
ONLY_CONTROL = True

# Read DF containing dependent variable
ydf = pd.read_csv("./dependent.csv")
ydf.set_index('ID', inplace=True)
ydf = ydf[[DEPENDENT_VARIABLE]]

if USE_NUMERICAL_PSYCHOMETRICS:
    numerical_psychometrics = pd.read_csv('./imputed_numerical_psychometrics.csv')
    ydf = pd.merge(ydf, numerical_psychometrics, how='left', on='ID')

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_psychometrics = pd.read_csv('./unimputed_categorical_psychometrics.csv')
    categorical_psychometrics['DIAG_01#CODE'] = categorical_psychometrics['DIAG_01#CODE'].astype(str)
    categorical_psychometrics['resting_asleep'] = categorical_psychometrics['resting_asleep'].astype(str)
    categorical_psychometrics['feedback_asleep'] = categorical_psychometrics['feedback_asleep'].astype(str)
    ydf = pd.merge(ydf, categorical_psychometrics, how='left', on='ID')
    display(ydf[['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']])

print("Sanity Check")
r, p = stats.spearmanr(ydf[DEPENDENT_VARIABLE].values, ydf['bids_age'].values)
print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)

if ONLY_CONTROL:
    ydf= ydf[ydf['DIAG_01#CODE'].isin(['71.09'])]
# run preprocessor
# We create the preprocessing pipelines for both numeric and categorical data.

y = ydf[DEPENDENT_VARIABLE].values
ydf.drop(columns=[DEPENDENT_VARIABLE, 'ID'], inplace=True)
X = ydf

# Set the parameters by cross-validation
feats = len(X.columns)
subs = len(y)

# display(X)

numeric_features = numerical_psychometrics.drop(columns=['ID']).columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_features = ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

X = preprocessor.fit_transform(X)

X_visible = pd.DataFrame(data=X, index=ydf.index)

print(X.shape)
display(X_visible)


# Split the dataset in two parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15)



# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

# raise SystemExit("Stop right there!")

# causality_df.loc[:, causality_df.columns != 'ID'].to_numpy()

Unnamed: 0,DIAG_01#CODE,resting_asleep,feedback_asleep
0,71.09,1.0,0.0
1,0.29,1.0,1.0
2,71.09,1.0,0.0
3,96.26,0.0,0.0
4,5.0,0.0,0.0
...,...,...,...
133,5.0,1.0,0.0
134,3.9,0.0,0.0
135,0.29,0.0,0.0
136,96.36,0.0,0.0


Sanity Check
Spearman r = -0.19784562897414026 R2 =  0.03914289290417317 p =  0.020015289581410488
(63, 222)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,212,213,214,215,216,217,218,219,220,221
0,-0.651350,-0.666917,-0.672154,-0.527447,0.729642,-0.583950,-0.929108,0.372001,-0.697409,0.893835,...,-0.285671,0.273190,0.003382,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.651350,-0.666917,-0.713562,-0.301414,0.460563,0.428914,0.529077,0.372001,-0.692967,-0.882128,...,-0.485908,-0.882821,-0.848964,1.0,0.0,1.0,0.0,1.0,0.0,0.0
5,0.694061,0.654334,0.699379,2.310531,-1.378144,1.033365,3.249571,-2.270303,2.059570,0.893835,...,-1.647283,-1.511827,-1.594768,1.0,0.0,1.0,0.0,0.0,1.0,0.0
6,-0.785891,-0.666917,-0.649634,-0.627907,0.953875,-0.387912,-0.646177,0.027353,0.139531,0.893835,...,-0.085434,0.273190,0.109926,1.0,1.0,0.0,0.0,1.0,0.0,0.0
7,0.559520,0.654334,0.688119,0.251113,0.236330,2.274940,0.572605,0.257119,0.137862,-1.477726,...,-0.686146,-1.426826,-1.346167,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,-1.054973,-1.063293,-1.116014,-1.707845,0.460563,-0.600287,-1.016164,0.486884,-1.280939,-0.018557,...,-0.285671,1.157198,0.571613,1.0,1.0,0.0,0.0,1.0,0.0,0.0
126,0.290438,0.257959,0.265689,0.376687,-0.929679,-0.306229,-0.145606,0.257119,-0.556865,-0.018428,...,1.636604,1.463201,1.601532,1.0,1.0,0.0,0.0,1.0,0.0,0.0
127,-1.189514,-1.195418,-1.157421,0.251113,-0.122442,0.020501,-0.156488,0.257119,-1.262173,-0.018689,...,-0.485908,0.715194,0.216469,1.0,0.0,1.0,0.0,0.0,1.0,0.0
129,-1.189514,-1.195418,-1.168318,-0.803711,0.012098,-1.449786,-0.744114,0.486884,-1.391528,-0.018321,...,1.796794,1.735203,1.779104,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [2]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
#     ('preprocessor', preprocessor),
    ('reduce_dim', SelectKBest(mutual_info_regression)),
    ('svr', SVR(verbose=10))
])

#setting feature_selection params appropriately
N_FEATURES_OPTIONS = [30, subs, "all"]

C_OPTIONS = [0.0001, 0.001, 0.1]
C_OPTIONS.reverse()
param_grid = [
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'svr__kernel': ['linear', 'rbf'],
        'svr__C': C_OPTIONS
    }
]

grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, iid=False, verbose=10)

print(X_train.shape)

grid.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(grid.best_params_)
print("Grid scores on development set:")
print()
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on", len(y_train), "subjects.")
print("The scores are computed on", len(y_test), "subjects.")
print()
y_true, y_pred = y_test, grid.predict(X_test)
print(r2_score(y_true, y_pred))

(53, 222)
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    7.0s finished


[LibSVM]Best parameters set found on development set:
{'reduce_dim__k': 'all', 'svr__C': 0.001, 'svr__kernel': 'linear'}
Grid scores on development set:

-0.422 (+/-0.745) for {'reduce_dim__k': 30, 'svr__C': 0.1, 'svr__kernel': 'linear'}
-0.102 (+/-0.313) for {'reduce_dim__k': 30, 'svr__C': 0.1, 'svr__kernel': 'rbf'}
-0.100 (+/-0.328) for {'reduce_dim__k': 30, 'svr__C': 0.001, 'svr__kernel': 'linear'}
-0.089 (+/-0.191) for {'reduce_dim__k': 30, 'svr__C': 0.001, 'svr__kernel': 'rbf'}
-0.091 (+/-0.207) for {'reduce_dim__k': 30, 'svr__C': 0.0001, 'svr__kernel': 'linear'}
-0.089 (+/-0.188) for {'reduce_dim__k': 30, 'svr__C': 0.0001, 'svr__kernel': 'rbf'}
-0.943 (+/-2.054) for {'reduce_dim__k': 63, 'svr__C': 0.1, 'svr__kernel': 'linear'}
-0.085 (+/-0.345) for {'reduce_dim__k': 63, 'svr__C': 0.1, 'svr__kernel': 'rbf'}
-0.068 (+/-0.360) for {'reduce_dim__k': 63, 'svr__C': 0.001, 'svr__kernel': 'linear'}
-0.089 (+/-0.189) for {'reduce_dim__k': 63, 'svr__C': 0.001, 'svr__kernel': 'rbf'}
-0.085 

In [3]:
from sklearn import ensemble

pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
#     ('preprocessor', preprocessor),
    ('reduce_dim', SelectKBest(mutual_info_regression)),
    ('gbr', ensemble.GradientBoostingRegressor())
])

#setting feature_selection params appropriately
N_FEATURES_OPTIONS = [30, subs, "all"]


param_grid = [
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'gbr__n_estimators': [100, 500],
        'gbr__max_depth': [4],
        'gbr__min_samples_split': [2],
        'gbr__learning_rate': [0.01],
        'gbr__loss': ['ls']
    }
]


grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, iid=False, verbose=10)

grid.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(grid.best_params_)
print()
print("Grid scores on development set:")
print()
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on", len(y_train), "subjects.")
print("The scores are computed on", len(y_test), "subjects.")
print()
y_true, y_pred = y_test, grid.predict(X_test)
print(r2_score(y_true, y_pred))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    2.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    3.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.3s finished


Best parameters set found on development set:

{'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 30}

Grid scores on development set:

-0.266 (+/-0.706) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 30}
-0.333 (+/-0.524) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 63}
-0.359 (+/-0.350) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 'all'}
-0.563 (+/-0.795) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 500, 'reduce_dim__k': 30}
-0.587 (+/-0.629) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_sa

In [7]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from scipy import stats
import warnings
from sklearn.feature_selection import SelectKBest, mutual_info_regression
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import LeaveOneOut
# warnings.filterwarnings('ignore')

# Classification Options
DEPENDENT_VARIABLE = 'avg2ndhalf'
USE_GRANGER = False
USE_NUMERICAL_PSYCHOMETRICS = True
USE_CATEGORICAL_PSYCHOMETRICS = True
ONLY_CONTROL = True
ONLY_PATH = False
VERBOSE = False

if ONLY_CONTROL and ONLY_PATH:
    raise SystemExit("Can't compute both ONLY_CONTROL and ONLY_PATH.")

# Read DF containing dependent variable
ydf = pd.read_csv("./dependent.csv")
ydf.set_index('ID', inplace=True)
ydf = ydf[[DEPENDENT_VARIABLE]]

if USE_NUMERICAL_PSYCHOMETRICS:
    numerical_psychometrics = pd.read_csv('./imputed_numerical_psychometrics.csv')
    numerical_count = len(numerical_psychometrics.columns) - 1 #subtracting by 1 to account for ID column (which will be removed)
    ydf = pd.merge(ydf, numerical_psychometrics, how='left', on='ID')

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_psychometrics = pd.read_csv('./unimputed_categorical_psychometrics.csv')
    categorical_psychometrics['DIAG_01#CODE'] = categorical_psychometrics['DIAG_01#CODE'].astype(str)
    categorical_psychometrics['resting_asleep'] = categorical_psychometrics['resting_asleep'].astype(str)
    categorical_psychometrics['feedback_asleep'] = categorical_psychometrics['feedback_asleep'].astype(str)
    ydf = pd.merge(ydf, categorical_psychometrics, how='left', on='ID')

if ONLY_CONTROL:
    ydf = ydf[ydf['DIAG_01#CODE'].isin(['71.09'])]
    
print("Sanity Check")
r, p = stats.spearmanr(ydf[DEPENDENT_VARIABLE].values, ydf['bids_age'].values)
print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)

#function for categorizing subjects into "responder" and "non-responder"
def categorize(responder):
    if responder > 0:
        return 'responder'
    else:
        return 'non-responder'

#applying categorization function, creating appropriate data types for X and y
ydf['binarized_dependent'] = ydf[DEPENDENT_VARIABLE].map(categorize)
y = ydf['binarized_dependent'].values
ydf.drop(columns=[DEPENDENT_VARIABLE, 'ID', 'binarized_dependent'], inplace=True)
X = ydf
featnames = X.columns #getting names of features (un-encoded)


if USE_NUMERICAL_PSYCHOMETRICS and USE_CATEGORICAL_PSYCHOMETRICS:
    numeric_features = numerical_psychometrics.drop(columns=['ID']).columns
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    categorical_features = ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

#scale numeric features and impute and encode categorical features
X = preprocessor.fit_transform(X)
loo = LeaveOneOut()

splitnum = 0
correct_results = 0
incorrect_results = 0
tests = []
preds = []
scores = []
for train_index, test_index in loo.split(X):
    splitnum += 1
    if VERBOSE:
        print("split", splitnum)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Set the parameters by cross-validation
    tuned_parameters = [#{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [0.0005, 0.005, 0.05, 0.5, 1, 10]}]

    clf = GridSearchCV(SVC(), tuned_parameters, cv = 10)
    clf.fit(X_train, y_train)
    if VERBOSE:
        print("Best parameters set found on development set:", clf.best_params_)
    y_true, y_pred = y_test, clf.predict(X_test)
    if y_true == y_pred:
        correct_results += 1
    else:
        incorrect_results += 1
    if VERBOSE:
        print("Decision function output =", clf.decision_function(X_test)[0])
    scores += [clf.decision_function(X_test)[0]]
    tests += [y_true[0]]
    preds += [y_pred[0]]
print(correct_results, "correct_results")
print(incorrect_results, "incorrect_results")
tests = np.asarray(tests)
preds = np.asarray(preds)

from sklearn.metrics import roc_auc_score
print(tests)
fpr, tpr, thresholds = metrics.roc_curve(tests, scores, pos_label="responders")
print("AUC score =", roc_auc_score(tests, scores))

corrects = (tests == preds)
actually_good = (tests == 'responder')
print(corrects)
print(actually_good)
print(np.sum(np.bitwise_and(actually_good, corrects)), "of the correct results were responders")
print(np.sum(np.bitwise_and((~actually_good), corrects)), "of the correct results were not responders")
print(np.sum(np.bitwise_and(actually_good, (~corrects))), "of the incorrect results were responders")
print(np.sum(np.bitwise_and((~actually_good), (~corrects))), "of the incorrect results were not responders")
#     Type is preserved for boolean arrays, 
# so the result will contain False when consecutive elements are the same and True when they differ.

# explanation of precision and recall
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html



finalsvc = SVC(C= clf.best_params_['C'], kernel='linear')
finalsvc.fit(X, y)
coefs = finalsvc.coef_[0]
coefs_labels = []
sorted_vals = np.argsort(np.abs(coefs), axis = 0)
if USE_NUMERICAL_PSYCHOMETRICS:
#     print("coefficients from the final run of the SVC, should be slightly different each time, but probably not a big difference")
#     print(coefs)
    for feature in range(len(coefs)):
        if feature < numerical_count:
#             print(sorted_vals[feature])
#             print(feature, sorted_vals[feature], coefs[feature], featnames[feature])
            coefs_labels += [featnames[feature]]
        else:
#             print(feature, sorted_vals[feature], coefs[feature], "non-numerical feature")
            coefs_labels += ["non-numerical feature" + str(feature)]
# print(coefs_labels)
coefs_labels = np.asarray(coefs_labels)
coefs_labels = np.flip(coefs_labels[sorted_vals], axis=0)
coefs = np.flip(coefs[sorted_vals], axis=0)
print(coefs)
print(coefs_labels)
coefs_with_labels = pd.DataFrame(data=np.array([coefs]), columns=coefs_labels)
display(coefs_with_labels)
coefs_with_labels.to_csv("./feature_coefficients_control.csv", index=False)

# print(ydf.columns[sorted_a[0:10]])

Sanity Check
Spearman r = -0.4328034210059029 R2 =  0.18731880123441283 p =  0.0003960132851266732
52 correct_results
11 incorrect_results
['non-responder' 'responder' 'responder' 'responder' 'non-responder'
 'responder' 'responder' 'responder' 'responder' 'responder' 'responder'
 'non-responder' 'responder' 'responder' 'non-responder' 'responder'
 'responder' 'responder' 'responder' 'responder' 'responder' 'responder'
 'responder' 'responder' 'responder' 'non-responder' 'responder'
 'responder' 'responder' 'responder' 'non-responder' 'responder'
 'non-responder' 'responder' 'responder' 'non-responder' 'responder'
 'non-responder' 'responder' 'responder' 'responder' 'responder'
 'responder' 'responder' 'responder' 'responder' 'responder' 'responder'
 'responder' 'responder' 'responder' 'non-responder' 'non-responder'
 'responder' 'non-responder' 'responder' 'responder' 'responder'
 'responder' 'non-responder' 'responder' 'responder' 'responder']
AUC score = 0.7676923076923077
[False  T

Unnamed: 0,VF_24,DKEFSTMT_14,TOWER_50,DF_21,DF_09,DF_20,TOWER_51,VF_05,AGE_04,bids_age,...,VF_07,VF_25,VF_52,DKEFSTMT_26,DKEFSTMT_04,DKEFSTMT_21,VF_35,DKEFSCWI_11,non-numerical feature215,DKEFSTMT_10
0,-0.148432,0.136932,-0.121269,0.10777,-0.103403,0.102371,0.101585,-0.100892,-0.100399,-0.099,...,-0.000822,0.000795,-0.000519,0.000493,0.000481,-0.000318,0.000232,-0.00021,-6.938894e-18,0.0
