In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy import stats
from sklearn.compose import ColumnTransformer


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Column of dependent.csv that the models below try to predict.
DEPENDENT_VARIABLE = 'avg2ndhalf'
# NOTE(review): USE_GRANGER is not read anywhere in the visible code.
USE_GRANGER = False
# Toggle which predictor tables are merged into the design matrix.
USE_NUMERICAL_PSYCHOMETRICS = True
USE_CATEGORICAL_PSYCHOMETRICS = True

# ---------------------------------------------------------------------------
# Load the dependent variable, keyed by subject ID
# ---------------------------------------------------------------------------
ydf = pd.read_csv("./dependent.csv")
ydf.set_index('ID', inplace=True)
ydf = ydf[[DEPENDENT_VARIABLE]]

# ---------------------------------------------------------------------------
# Merge in the predictor tables (inner joins: only subjects present in every
# table that is switched on are kept)
# ---------------------------------------------------------------------------
if USE_NUMERICAL_PSYCHOMETRICS:
    numerical_psychometrics = pd.read_csv('./imputed_numerical_psychometrics.csv')
    ydf = pd.merge(ydf, numerical_psychometrics, how='inner', on='ID')

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_psychometrics = pd.read_csv('./unimputed_categorical_psychometrics.csv')
    for column in ('DIAG_01#CODE', 'resting_asleep', 'feedback_asleep'):
        raw = categorical_psychometrics[column]
        # Cast the codes to strings so they are handled as labels, but keep
        # missing entries as NaN. A plain astype(str) turns NaN into the
        # literal string 'nan', which a downstream missing-value imputer can
        # no longer recognise as missing.
        categorical_psychometrics[column] = raw.astype(str).where(raw.notna())
    ydf = pd.merge(ydf, categorical_psychometrics, how='inner', on='ID')
    # `display` is the notebook (IPython) helper; this cell is meant to run there.
    display(ydf[['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']])

# ---------------------------------------------------------------------------
# Sanity check: rank correlation between the target and 'bids_age'
# ---------------------------------------------------------------------------
print("Sanity Check")
if 'bids_age' in ydf.columns:
    r, p = stats.spearmanr(ydf[DEPENDENT_VARIABLE].values, ydf['bids_age'].values)
    print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)
else:
    # The column is only available when a table providing it was merged above.
    print("Skipped: column 'bids_age' is not available")

# ---------------------------------------------------------------------------
# Split the merged frame into target vector y and design matrix X
# ---------------------------------------------------------------------------
y = ydf[DEPENDENT_VARIABLE].values
# Dropped in place, so from here on X and ydf are the same (predictor-only) frame.
ydf.drop(columns=[DEPENDENT_VARIABLE, 'ID'], inplace=True)
X = ydf

# display(X)

# causality_df.loc[:, causality_df.columns != 'ID'].to_numpy()


# Preprocessing for the design matrix: numeric columns are standardised,
# categorical columns are imputed with a constant label and one-hot encoded.
#
# Each feature group is registered only when its source table was loaded, so
# switching off either USE_* flag does not leave a reference to an undefined
# table (NameError) or to columns that are absent from X.
if USE_NUMERICAL_PSYCHOMETRICS:
    # Every column of the numerical table except the merge key is a feature.
    numeric_features = numerical_psychometrics.drop(columns=['ID']).columns
else:
    numeric_features = []
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

if USE_CATEGORICAL_PSYCHOMETRICS:
    categorical_features = ['DIAG_01#CODE', 'resting_asleep', 'feedback_asleep']
else:
    categorical_features = []
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # Categories unseen during fit are encoded as all-zero rows instead of raising.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# With both flags on this is the same two-entry list as before.
# Columns of X not listed in either group are dropped (ColumnTransformer default).
transformers = []
if len(numeric_features) > 0:
    transformers.append(('num', numeric_transformer, numeric_features))
if len(categorical_features) > 0:
    transformers.append(('cat', categorical_transformer, categorical_features))

preprocessor = ColumnTransformer(transformers=transformers)

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

# display(X)

# raise SystemExit("Stop right there!")



# Hold out 15% of the subjects for the final evaluation; the remaining 85%
# (the "development set") is used for the cross-validated grid search.
# NOTE(review): no random_state is passed, so the split -- and every score
# printed below -- changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15)

# Number of raw predictor columns (before one-hot encoding) and of subjects.
feats = len(X.columns)
subs = len(y)

pipe = Pipeline([
    ('preprocessor', preprocessor),
    # k for the feature-selection stage is set by the param_grid below
    ('reduce_dim', SelectKBest(mutual_info_regression)),
    ('svr', SVR(verbose=10))
])

# Candidate numbers of features kept by SelectKBest.
# NOTE(review): `subs` is the total subject count (train + test) used as a
# feature count; confirm that this is intended and that it cannot exceed the
# number of columns produced by the preprocessor.
N_FEATURES_OPTIONS = [30, subs, "all"]

# Regularisation strengths, largest first.
C_OPTIONS = [0.1, 0.001, 0.0001]
param_grid = [
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'svr__kernel': ['linear', 'rbf'],
        'svr__C': C_OPTIONS
    }
]

# The former `iid=False` argument is omitted: it was deprecated in
# scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError there).
# Since 0.22 the unweighted fold average it requested is the only behaviour.
grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, verbose=10)

grid.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(grid.best_params_)
print()
print("Grid scores on development set:")
print()
# One line per parameter combination: mean CV score (+/- two standard deviations).
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on", len(y_train), "subjects.")
print("The scores are computed on", len(y_test), "subjects.")
print()
# R^2 of the refit best pipeline on the held-out subjects.
y_true, y_pred = y_test, grid.predict(X_test)
print(r2_score(y_true, y_pred))

#---------------------------------------------------------------------------------------- 
from sklearn.kernel_ridge import KernelRidge

# Same preprocessing and feature selection as the SVR experiment, with a
# kernel ridge regressor as the final estimator. Reuses the existing
# X_train / X_test split so both models are scored on the same subjects.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    # k for the feature-selection stage is set by the param_grid below
    ('reduce_dim', SelectKBest(mutual_info_regression)),
    ('kr', KernelRidge())
])

# Candidate numbers of features kept by SelectKBest.
# NOTE(review): `subs` is a subject count used as a feature count; confirm.
N_FEATURES_OPTIONS = [30, subs, "all"]

# Ridge penalties and RBF kernel widths to search over.
A_OPTIONS = [0.0001, 0.001, 0.1]
G_OPTIONS = np.logspace(-2, 2, 5)

# Two sub-grids: KernelRidge ignores `gamma` for the linear kernel, so
# crossing gamma with 'linear' would only refit identical models five times.
param_grid = [
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'kr__kernel': ['linear'],
        'kr__alpha': A_OPTIONS
    },
    {
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'kr__kernel': ['rbf'],
        'kr__alpha': A_OPTIONS,
        'kr__gamma': G_OPTIONS
    }
]

# The former `iid=False` argument is omitted: it was deprecated in
# scikit-learn 0.22 and removed in 0.24 (passing it raises TypeError there).
# Since 0.22 the unweighted fold average it requested is the only behaviour.
grid = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, verbose=10)

grid.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(grid.best_params_)
print()
print("Grid scores on development set:")
print()
# One line per parameter combination: mean CV score (+/- two standard deviations).
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on", len(y_train), "subjects.")
print("The scores are computed on", len(y_test), "subjects.")
print()
# R^2 of the refit best pipeline on the held-out subjects.
y_true, y_pred = y_test, grid.predict(X_test)
print(r2_score(y_true, y_pred))

Unnamed: 0,DIAG_01#CODE,resting_asleep,feedback_asleep
0,71.09,1.0,0.0
1,0.29,1.0,1.0
2,71.09,1.0,0.0
3,96.26,0.0,0.0
4,5.0,0.0,0.0
...,...,...,...
132,5.0,1.0,0.0
133,3.9,0.0,0.0
134,0.29,0.0,0.0
135,96.36,0.0,0.0


Sanity Check
Spearman r = -0.2044806672421322 R2 =  0.0418123432757876 p =  0.016537248499776006
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    6.8s finished


[LibSVM]Best parameters set found on development set:

{'reduce_dim__k': 137, 'svr__C': 0.1, 'svr__kernel': 'rbf'}

Grid scores on development set:

-0.261 (+/-0.434) for {'reduce_dim__k': 30, 'svr__C': 0.1, 'svr__kernel': 'linear'}
-0.081 (+/-0.315) for {'reduce_dim__k': 30, 'svr__C': 0.1, 'svr__kernel': 'rbf'}
-0.081 (+/-0.359) for {'reduce_dim__k': 30, 'svr__C': 0.001, 'svr__kernel': 'linear'}
-0.135 (+/-0.279) for {'reduce_dim__k': 30, 'svr__C': 0.001, 'svr__kernel': 'rbf'}
-0.111 (+/-0.295) for {'reduce_dim__k': 30, 'svr__C': 0.0001, 'svr__kernel': 'linear'}
-0.139 (+/-0.284) for {'reduce_dim__k': 30, 'svr__C': 0.0001, 'svr__kernel': 'rbf'}
-149.398 (+/-595.140) for {'reduce_dim__k': 137, 'svr__C': 0.1, 'svr__kernel': 'linear'}
-0.035 (+/-0.270) for {'reduce_dim__k': 137, 'svr__C': 0.1, 'svr__kernel': 'rbf'}
-0.670 (+/-2.899) for {'reduce_dim__k': 137, 'svr__C': 0.001, 'svr__kernel': 'linear'}
-0.136 (+/-0.283) for {'reduce_dim__k': 137, 'svr__C': 0.001, 'svr__kernel': 'rbf'}
-0.0

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 169 tasks      | elapsed:   