In [1]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats
from sklearn.compose import ColumnTransformer


DEPENDENT_VARIABLE = 'avg2ndhalf'
USE_GRANGER = False
USE_NUMERICAL_PSYCHOMETRICS = True
USE_CATEGORICAL_PSYCHOMETRICS = True
ONLY_CONTROL = True

# Read DF containing dependent variable
ydf = pd.read_csv("./dependent.csv")
ydf.set_index('ID', inplace=True)
ydf = ydf[[DEPENDENT_VARIABLE]]

if USE_NUMERICAL_PSYCHOMETRICS:
    numerical_psychometrics = pd.read_csv('./imputed_numerical_psychometrics.csv')
    ydf = pd.merge(ydf, numerical_psychometrics, how='left', on='ID')

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_psychometrics = pd.read_csv('./unimputed_categorical_psychometrics.csv')
    categorical_psychometrics['DIAG_01#CODE'] = categorical_psychometrics['DIAG_01#CODE'].astype(str)
    categorical_psychometrics['resting_asleep'] = categorical_psychometrics['resting_asleep'].astype(str)
    categorical_psychometrics['feedback_asleep'] = categorical_psychometrics['feedback_asleep'].astype(str)
    ydf = pd.merge(ydf, categorical_psychometrics, how='left', on='ID')
    display(ydf[['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']])

print("Sanity Check")
r, p = stats.spearmanr(ydf[DEPENDENT_VARIABLE].values, ydf['bids_age'].values)
print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)

if ONLY_CONTROL:
    ydf= ydf[ydf['DIAG_01#CODE'].isin(['71.09'])]
# run preprocessor
# We create the preprocessing pipelines for both numeric and categorical data.

y = ydf[DEPENDENT_VARIABLE].values
ydf.drop(columns=[DEPENDENT_VARIABLE, 'ID'], inplace=True)
X = ydf

# Set the parameters by cross-validation
feats = len(X.columns)
subs = len(y)

# display(X)

numeric_features = numerical_psychometrics.drop(columns=['ID']).columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_features = ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

X = preprocessor.fit_transform(X)

X_visible = pd.DataFrame(data=X, index=ydf.index)

print(X.shape)
display(X_visible)


# Split the dataset in two parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15)



# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

# raise SystemExit("Stop right there!")

# causality_df.loc[:, causality_df.columns != 'ID'].to_numpy()

Unnamed: 0,DIAG_01#CODE,resting_asleep,feedback_asleep
0,71.09,1.0,0.0
1,0.29,1.0,1.0
2,71.09,1.0,0.0
3,96.26,0.0,0.0
4,5.0,0.0,0.0
...,...,...,...
133,5.0,1.0,0.0
134,3.9,0.0,0.0
135,0.29,0.0,0.0
136,96.36,0.0,0.0


Sanity Check
Spearman r = -0.19784562897414026 R2 =  0.03914289290417317 p =  0.020015289581410488
(63, 222)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,212,213,214,215,216,217,218,219,220,221
0,-0.651350,-0.666917,-0.672154,-0.527447,0.729642,-0.583950,-0.929108,0.372001,-0.697409,0.893835,...,-0.285671,0.273190,0.003382,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.651350,-0.666917,-0.713562,-0.301414,0.460563,0.428914,0.529077,0.372001,-0.692967,-0.882128,...,-0.485908,-0.882821,-0.848964,1.0,0.0,1.0,0.0,1.0,0.0,0.0
5,0.694061,0.654334,0.699379,2.310531,-1.378144,1.033365,3.249571,-2.270303,2.059570,0.893835,...,-1.647283,-1.511827,-1.594768,1.0,0.0,1.0,0.0,0.0,1.0,0.0
6,-0.785891,-0.666917,-0.649634,-0.627907,0.953875,-0.387912,-0.646177,0.027353,0.139531,0.893835,...,-0.085434,0.273190,0.109926,1.0,1.0,0.0,0.0,1.0,0.0,0.0
7,0.559520,0.654334,0.688119,0.251113,0.236330,2.274940,0.572605,0.257119,0.137862,-1.477726,...,-0.686146,-1.426826,-1.346167,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,-1.054973,-1.063293,-1.116014,-1.707845,0.460563,-0.600287,-1.016164,0.486884,-1.280939,-0.018557,...,-0.285671,1.157198,0.571613,1.0,1.0,0.0,0.0,1.0,0.0,0.0
126,0.290438,0.257959,0.265689,0.376687,-0.929679,-0.306229,-0.145606,0.257119,-0.556865,-0.018428,...,1.636604,1.463201,1.601532,1.0,1.0,0.0,0.0,1.0,0.0,0.0
127,-1.189514,-1.195418,-1.157421,0.251113,-0.122442,0.020501,-0.156488,0.257119,-1.262173,-0.018689,...,-0.485908,0.715194,0.216469,1.0,0.0,1.0,0.0,0.0,1.0,0.0
129,-1.189514,-1.195418,-1.168318,-0.803711,0.012098,-1.449786,-0.744114,0.486884,-1.391528,-0.018321,...,1.796794,1.735203,1.779104,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [2]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
#     ('preprocessor', preprocessor),
    ('reduce_dim', SelectKBest(mutual_info_regression)),
    ('svr', SVR(verbose=10))
])

#setting feature_selection params appropriately
N_FEATURES_OPTIONS = [30, subs, "all"]

C_OPTIONS = [0.0001, 0.001, 0.1]
C_OPTIONS.reverse()
param_grid = [
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'svr__kernel': ['linear', 'rbf'],
        'svr__C': C_OPTIONS
    }
]

grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, iid=False, verbose=10)

print(X_train.shape)

grid.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(grid.best_params_)
print("Grid scores on development set:")
print()
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on", len(y_train), "subjects.")
print("The scores are computed on", len(y_test), "subjects.")
print()
y_true, y_pred = y_test, grid.predict(X_test)
print(r2_score(y_true, y_pred))

(53, 222)
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    7.0s finished


[LibSVM]Best parameters set found on development set:
{'reduce_dim__k': 'all', 'svr__C': 0.001, 'svr__kernel': 'linear'}
Grid scores on development set:

-0.422 (+/-0.745) for {'reduce_dim__k': 30, 'svr__C': 0.1, 'svr__kernel': 'linear'}
-0.102 (+/-0.313) for {'reduce_dim__k': 30, 'svr__C': 0.1, 'svr__kernel': 'rbf'}
-0.100 (+/-0.328) for {'reduce_dim__k': 30, 'svr__C': 0.001, 'svr__kernel': 'linear'}
-0.089 (+/-0.191) for {'reduce_dim__k': 30, 'svr__C': 0.001, 'svr__kernel': 'rbf'}
-0.091 (+/-0.207) for {'reduce_dim__k': 30, 'svr__C': 0.0001, 'svr__kernel': 'linear'}
-0.089 (+/-0.188) for {'reduce_dim__k': 30, 'svr__C': 0.0001, 'svr__kernel': 'rbf'}
-0.943 (+/-2.054) for {'reduce_dim__k': 63, 'svr__C': 0.1, 'svr__kernel': 'linear'}
-0.085 (+/-0.345) for {'reduce_dim__k': 63, 'svr__C': 0.1, 'svr__kernel': 'rbf'}
-0.068 (+/-0.360) for {'reduce_dim__k': 63, 'svr__C': 0.001, 'svr__kernel': 'linear'}
-0.089 (+/-0.189) for {'reduce_dim__k': 63, 'svr__C': 0.001, 'svr__kernel': 'rbf'}
-0.085 

In [3]:
from sklearn import ensemble

pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
#     ('preprocessor', preprocessor),
    ('reduce_dim', SelectKBest(mutual_info_regression)),
    ('gbr', ensemble.GradientBoostingRegressor())
])

#setting feature_selection params appropriately
N_FEATURES_OPTIONS = [30, subs, "all"]


param_grid = [
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'gbr__n_estimators': [100, 500],
        'gbr__max_depth': [4],
        'gbr__min_samples_split': [2],
        'gbr__learning_rate': [0.01],
        'gbr__loss': ['ls']
    }
]


grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, iid=False, verbose=10)

grid.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(grid.best_params_)
print()
print("Grid scores on development set:")
print()
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on", len(y_train), "subjects.")
print("The scores are computed on", len(y_test), "subjects.")
print()
y_true, y_pred = y_test, grid.predict(X_test)
print(r2_score(y_true, y_pred))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    2.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    3.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.3s finished


Best parameters set found on development set:

{'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 30}

Grid scores on development set:

-0.266 (+/-0.706) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 30}
-0.333 (+/-0.524) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 63}
-0.359 (+/-0.350) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 'all'}
-0.563 (+/-0.795) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 500, 'reduce_dim__k': 30}
-0.587 (+/-0.629) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_sa

In [15]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# attempting Support Vector Classification
DEPENDENT_VARIABLE = 'avg2ndhalf'
USE_GRANGER = False
USE_NUMERICAL_PSYCHOMETRICS = True
USE_CATEGORICAL_PSYCHOMETRICS = True
ONLY_CONTROL = True

# Read DF containing dependent variable
ydf = pd.read_csv("./dependent.csv")
ydf.set_index('ID', inplace=True)
ydf = ydf[[DEPENDENT_VARIABLE]]

if USE_NUMERICAL_PSYCHOMETRICS:
    numerical_psychometrics = pd.read_csv('./imputed_numerical_psychometrics.csv')
    ydf = pd.merge(ydf, numerical_psychometrics, how='left', on='ID')

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_psychometrics = pd.read_csv('./unimputed_categorical_psychometrics.csv')
    categorical_psychometrics['DIAG_01#CODE'] = categorical_psychometrics['DIAG_01#CODE'].astype(str)
    categorical_psychometrics['resting_asleep'] = categorical_psychometrics['resting_asleep'].astype(str)
    categorical_psychometrics['feedback_asleep'] = categorical_psychometrics['feedback_asleep'].astype(str)
    ydf = pd.merge(ydf, categorical_psychometrics, how='left', on='ID')

if ONLY_CONTROL:
    ydf= ydf[ydf['DIAG_01#CODE'].isin(['71.09'])]
    
print("Sanity Check")
r, p = stats.spearmanr(ydf[DEPENDENT_VARIABLE].values, ydf['bids_age'].values)
print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)

def categorize(responder):
    if responder > 0.05:
        return 'responder'
    elif responder < 0.05 and responder > -0.05:
        return 'unknown'
    else:
        return 'non-responder'



# ydf['binarized_dependent'] = ydf[DEPENDENT_VARIABLE] > 0.025
ydf['binarized_dependent'] = ydf[DEPENDENT_VARIABLE].map(categorize)


y = ydf['binarized_dependent'].values
ydf.drop(columns=[DEPENDENT_VARIABLE, 'ID'], inplace=True)
X = ydf


# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)
# display(X)

# run preprocessor
# We create the preprocessing pipelines for both numeric and categorical data.
feats = len(X.columns)
subs = len(y)

numeric_features = numerical_psychometrics.drop(columns=['ID']).columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_features = ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

X = preprocessor.fit_transform(X)

splitnum = 0
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    splitnum += 1
    print("split", splitnum)
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
#     print(X_train, X_test, y_train, y_test)

    # Set the parameters by cross-validation
    tuned_parameters = [#{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [0.5, 1, 10, 100, 1000]}]

    scores = ['precision', 'recall']

    for score in scores:
#         print("# Tuning hyper-parameters for %s" % score)
#         print()

        clf = GridSearchCV(
            SVC(), tuned_parameters, scoring='%s_macro' % score, cv = 10
        )
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print()
        print("The model is trained on", len(y_train), "subjects.")
        print("The scores are computed on", len(y_test), "subjects.")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()

        both = np.array([y_true, y_pred])

        diff = np.arange(len(y_true))[y_true==y_pred]
        print("Classifier was", len(diff)/len(y_true) * 100, "percent correct on the test set.")

        print("Predicted Categories")
        print(y_pred)

        print("Actual Categories")
        print(y_true)
        print()
#     Type is preserved for boolean arrays, 
# so the result will contain False when consecutive elements are the same and True when they differ.

# explanation of precision and recall
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html


Sanity Check
Spearman r = -0.4328034210059029 R2 =  0.18731880123441283 p =  0.0003960132851266732
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 0.5, 'kernel': 'linear'}

Grid scores on development set:

0.611 (+/-0.553) for {'C': 0.5, 'kernel': 'linear'}
0.611 (+/-0.553) for {'C': 1, 'kernel': 'linear'}
0.611 (+/-0.553) for {'C': 10, 'kernel': 'linear'}
0.611 (+/-0.553) for {'C': 100, 'kernel': 'linear'}
0.611 (+/-0.553) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:


The model is trained on 56 subjects.
The scores are computed on 7 subjects.
The model is trained on the full development set.
The scores are computed on the full evaluation set.

               precision    recall  f1-score   support

non-responder       0.50      1.00      0.67         1
    responder       1.00      0.83      0.91         6

     accuracy                           0.86         7
    macro avg       0.75      0.92      0.79         7

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Best parameters set found on development set:

{'C': 0.5, 'kernel': 'linear'}

Grid scores on development set:

0.677 (+/-0.494) for {'C': 0.5, 'kernel': 'linear'}
0.677 (+/-0.494) for {'C': 1, 'kernel': 'linear'}
0.677 (+/-0.494) for {'C': 10, 'kernel': 'linear'}
0.677 (+/-0.494) for {'C': 100, 'kernel': 'linear'}
0.677 (+/-0.494) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:


The model is trained on 56 subjects.
The scores are computed on 7 subjects.
The model is trained on the full development set.
The scores are computed on the full evaluation set.

               precision    recall  f1-score   support

non-responder       0.50      1.00      0.67         1
    responder       1.00      0.83      0.91         6

     accuracy                           0.86         7
    macro avg       0.75      0.92      0.79         7
 weighted avg       0.93      0.86      0.87         7


Classifier was 85.71428571428571 percent correct on the test set.
Predicted Catego