In [4]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats
from sklearn.compose import ColumnTransformer


DEPENDENT_VARIABLE = 'avg2ndhalf'
USE_GRANGER = False
USE_NUMERICAL_PSYCHOMETRICS = True
USE_CATEGORICAL_PSYCHOMETRICS = True

# Read DF containing dependent variable
ydf = pd.read_csv("./dependent.csv")
ydf.set_index('ID', inplace=True)
ydf = ydf[[DEPENDENT_VARIABLE]]

if USE_NUMERICAL_PSYCHOMETRICS:
    numerical_psychometrics = pd.read_csv('./imputed_numerical_psychometrics.csv')
    ydf = pd.merge(ydf, numerical_psychometrics, how='left', on='ID')

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_psychometrics = pd.read_csv('./unimputed_categorical_psychometrics.csv')
    categorical_psychometrics['DIAG_01#CODE'] = categorical_psychometrics['DIAG_01#CODE'].astype(str)
    categorical_psychometrics['resting_asleep'] = categorical_psychometrics['resting_asleep'].astype(str)
    categorical_psychometrics['feedback_asleep'] = categorical_psychometrics['feedback_asleep'].astype(str)
    ydf = pd.merge(ydf, categorical_psychometrics, how='left', on='ID')
    display(ydf[['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']])
# display(ydf)

print("Sanity Check")
r, p = stats.spearmanr(ydf[DEPENDENT_VARIABLE].values, ydf['bids_age'].values)
print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)

y = ydf[DEPENDENT_VARIABLE].values
ydf.drop(columns=[DEPENDENT_VARIABLE, 'ID'], inplace=True)
X = ydf

# display(X)

# causality_df.loc[:, causality_df.columns != 'ID'].to_numpy()


# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = numerical_psychometrics.drop(columns=['ID']).columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_features = ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

# display(X)

# raise SystemExit("Stop right there!")



# Split the dataset in two parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15)

# Set the parameters by cross-validation
feats = len(X.columns)
subs = len(y)

# pipe = Pipeline([
#     # the reduce_dim stage is populated by the param_grid
#     ('preprocessor', preprocessor),
#     ('reduce_dim', SelectKBest(mutual_info_regression)),
#     ('svr', SVR(verbose=10))
# ])

# #setting feature_selection params appropriately
# N_FEATURES_OPTIONS = [30, subs, "all"]

# C_OPTIONS = [0.0001, 0.001, 0.1]
# C_OPTIONS.reverse()
# param_grid = [
#     {
#         'reduce_dim__k': N_FEATURES_OPTIONS,
#         'svr__kernel': ['linear', 'rbf'],
#         'svr__C': C_OPTIONS
#     }
# ]

# grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, iid=False, verbose=10)

# # display(X_train)
# # raise SystemExit("Stop right there!")

# grid.fit(X_train, y_train)

# print("Best parameters set found on development set:")
# print()
# print(grid.best_params_)
# print()
# print("Grid scores on development set:")
# print()
# means = grid.cv_results_['mean_test_score']
# stds = grid.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, grid.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r"
#           % (mean, std * 2, params))
# print()

# print("Detailed classification report:")
# print()
# print("The model is trained on", len(y_train), "subjects.")
# print("The scores are computed on", len(y_test), "subjects.")
# print()
# y_true, y_pred = y_test, grid.predict(X_test)
# print(r2_score(y_true, y_pred))

Unnamed: 0_level_0,avg2ndhalf
ID,Unnamed: 1_level_1
A00028185,-0.072119
A00032875,-0.069486
A00033747,0.311871
A00034854,0.354473
A00035072,0.752141
...,...
A00066827,0.181018
A00066926,0.410703
A00072203,0.168672
A00073600,0.712602


Unnamed: 0,DIAG_01#CODE,resting_asleep,feedback_asleep
0,71.09,1.0,0.0
1,0.29,1.0,1.0
2,71.09,1.0,0.0
3,96.26,0.0,0.0
4,5.0,0.0,0.0
...,...,...,...
132,5.0,1.0,0.0
133,3.9,0.0,0.0
134,0.29,0.0,0.0
135,96.36,0.0,0.0


Sanity Check
Spearman r = -0.2044806672421322 R2 =  0.0418123432757876 p =  0.016537248499776006


In [2]:
from sklearn import ensemble

params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)
from sklearn.kernel_ridge import KernelRidge

pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('preprocessor', preprocessor),
    ('reduce_dim', SelectKBest(mutual_info_regression)),
    ('gbr', ensemble.GradientBoostingRegressor())
])

#setting feature_selection params appropriately
N_FEATURES_OPTIONS = [30, subs, "all"]


param_grid = [
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'gbr__n_estimators': [100, 500],
        'gbr__max_depth': [4],
        'gbr__min_samples_split': [2],
        'gbr__learning_rate': [0.01],
        'gbr__loss': ['ls']
    }
]


grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, iid=False, verbose=10)

grid.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(grid.best_params_)
print()
print("Grid scores on development set:")
print()
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on", len(y_train), "subjects.")
print("The scores are computed on", len(y_test), "subjects.")
print()
y_true, y_pred = y_test, grid.predict(X_test)
print(r2_score(y_true, y_pred))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    3.8s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    5.3s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    5.6s finished


Best parameters set found on development set:

{'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 30}

Grid scores on development set:

-0.218 (+/-0.115) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 30}
-0.308 (+/-0.296) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 137}
-0.237 (+/-0.431) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 100, 'reduce_dim__k': 'all'}
-0.439 (+/-0.499) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_samples_split': 2, 'gbr__n_estimators': 500, 'reduce_dim__k': 30}
-0.461 (+/-0.231) for {'gbr__learning_rate': 0.01, 'gbr__loss': 'ls', 'gbr__max_depth': 4, 'gbr__min_s

In [3]:
#---------------------------------------------------------------------------------------- 
# from sklearn.kernel_ridge import KernelRidge

# pipe = Pipeline([
#     # the reduce_dim stage is populated by the param_grid
#     ('preprocessor', preprocessor),
#     ('reduce_dim', SelectKBest(mutual_info_regression)),
#     ('kr', KernelRidge())
# ])

# #setting feature_selection params appropriately
# N_FEATURES_OPTIONS = [30, subs, "all"]

# A_OPTIONS = [0.0001, 0.001, 0.1]
# G_OPTIONS = np.logspace(-2, 2, 5)
# C_OPTIONS.reverse()
# param_grid = [
#     {
#         'reduce_dim__k': N_FEATURES_OPTIONS,
#         'kr__kernel': ['linear', 'rbf'],
#         'kr__alpha': A_OPTIONS,
#         'kr__gamma': G_OPTIONS
#     }
# ]


# grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, iid=False, verbose=10)

# grid.fit(X_train, y_train)

# print("Best parameters set found on development set:")
# print()
# print(grid.best_params_)
# print()
# print("Grid scores on development set:")
# print()
# means = grid.cv_results_['mean_test_score']
# stds = grid.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, grid.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r"
#           % (mean, std * 2, params))
# print()

# print("Detailed classification report:")
# print()
# print("The model is trained on", len(y_train), "subjects.")
# print("The scores are computed on", len(y_test), "subjects.")
# print()
# y_true, y_pred = y_test, grid.predict(X_test)
# print(r2_score(y_true, y_pred))