In [1]:
from support import init_repo

# start the repository
init_repo()

In [8]:
from support import get_subjects_data
# settings
fs = 256 # Sampling rate
condition = "INNER" # PRONOUNCED, INNER or VISUALIZED
random_state = 46

# Select the useful par of each trial. Time in seconds
t_start = 1.5 # start (in seconds)
t_end = 3.5 # end (in seconds)


data_array, label_array, group_array = get_subjects_data(condition=condition, t_start = t_start, t_end = t_end, fs = fs)
data_array.shape, label_array.shape, group_array.shape

((620, 128, 512), (620,), (620,))

In [9]:
from features import f_mean, f_std, f_ptp, f_var, f_minim, f_maxim, f_argminim, f_argmaxim, f_rms, f_abs_diff_signal, \
    f_skewness, f_kurtosis, generate_features

func_list = [f_mean, f_std, f_ptp, f_var, f_minim, f_maxim, f_argminim, f_argmaxim, f_rms, f_abs_diff_signal, f_skewness, f_kurtosis]

features_array = generate_features(data_array, func_list)
features_array.shape

(620, 1536)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def split_train_test(data, labels, groups, size):
    # Stratify guarantees that the same proportion of the classes will be available in train and test
    x_tr, x_ts, y_tr, y_ts, g_tr, g_ts = train_test_split(data, labels, groups, test_size=size, stratify=y, random_state=random_state)
    # Apply the scaler in the training data
    ss = StandardScaler()
    x_tr = ss.fit_transform(x_tr)
    x_ts = ss.transform(x_ts)
    return x_tr, x_ts, y_tr, y_ts, g_tr, g_ts

In [11]:
# Run nested cross-validation and re-run using the best parameters
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from support import run_cross_validation, get_feature_selection_model, print_report_nested_cross_validation, print_report_classifier

X = features_array
y = label_array

feature_sm = get_feature_selection_model(X, y)

# Apply the Feature Selection Model without scaling the data
X = feature_sm.transform(X)
n_features_before = np.shape(features_array)
print("Feature transformation - number of features: Before {} - After {}".format(np.shape(features_array)[1], np.shape(X)[1]))

splits = [0.10, 0.20, 0.30]

# Run Nested cross-validation
inner_cv = StratifiedGroupKFold(n_splits=5)
outer_cv = StratifiedGroupKFold(n_splits=5)

classifiers = [
    ["Random Forest", RandomForestClassifier(), {'n_estimators': [200, 500, 1000, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy']}],
    ["Linear SVC", LinearSVC(), {'C': [0.00001, 0.0001, 0.0005, 1, 10, 100, 1000], 'dual': (True, False)}],
    ["SVC", SVC(), [{"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [0.00001, 0.0001, 0.0005, 1, 10, 100, 1000]},
                    {"kernel": ["linear"], "C": [0.00001, 0.0001, 0.0005, 1, 10, 100, 1000]}, ]
     ]
]

for cls in classifiers:
    best_params = []
    best_scores = []

    for test_size in splits:
        x_train, x_test, y_train, y_test, g_train, g_test = split_train_test(X, y, group_array, test_size)
        clf = GridSearchCV(estimator=cls[1], param_grid=cls[2], cv=inner_cv, n_jobs=-1)
        clf.fit(x_train, y_train, groups=g_train)

        best_params.append(clf.best_params_)
        best_scores.append(clf.best_score_)

    # Get the best parameter
    best_param = best_params[np.argmax(best_scores)]

    acc_list = []
    cross_v_list = []
    # Run the same classifier using the best parameters
    for test_size in splits:
        x_train, x_test, y_train, y_test, g_train, g_test = split_train_test(X, y, group_array, test_size)
        best_param['random_state'] = random_state
        cls[1].set_params(**best_param)
        cls[1].fit(x_train, y_train)
        y_pred = cls[1].predict(x_test)
        acc_list.append(metrics.accuracy_score(y_test, y_pred))
        cross_v_list.append(run_cross_validation(cls[1], outer_cv, x_train, y_train, g_train))

    print('\n{}: {} '.format("Classifier", cls[0]))
    print_report_nested_cross_validation(splits, best_params, best_scores)
    print_report_classifier(splits, acc_list, cross_v_list)
    print(f_std(best_scores))

Feature transformation - number of features: Before 1536 - After 656


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]
  warn(
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]
  warn(
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan


Classifier: Random Forest 
Nested cross-validation:
Split                          Mean CV Score        Best parameter 
90.0/10.0                      nan                  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200, 'random_state': 46}
80.0/20.0                      nan                  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
70.0/30.0                      nan                  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}

Classification:
Split                          Accuracy             Cross validation
90.0/10.0                      0.1935483870967742   nan            
80.0/20.0                      0.20161290322580644  nan            
70.0/30.0                      0.23655913978494625  nan            
nan


Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_model\_base.py", line 447, in predict
    scores = self.decision_function(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_mo


Classifier: Linear SVC 
Nested cross-validation:
Split                          Mean CV Score        Best parameter 
90.0/10.0                      nan                  {'C': 1e-05, 'dual': True, 'random_state': 46}
80.0/20.0                      nan                  {'C': 1e-05, 'dual': True}
70.0/30.0                      nan                  {'C': 1e-05, 'dual': True}

Classification:
Split                          Accuracy             Cross validation
90.0/10.0                      0.25806451612903225  nan            
80.0/20.0                      0.18548387096774194  nan            
70.0/30.0                      0.24193548387096775  nan            
nan


 nan nan nan]
 nan nan nan]
 nan nan nan]
Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_base.py", line 810, in predict
    y = super().predict(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packa


Classifier: SVC 
Nested cross-validation:
Split                          Mean CV Score        Best parameter 
90.0/10.0                      nan                  {'C': 1e-05, 'gamma': 0.001, 'kernel': 'rbf', 'random_state': 46}
80.0/20.0                      nan                  {'C': 1e-05, 'gamma': 0.001, 'kernel': 'rbf'}
70.0/30.0                      nan                  {'C': 1e-05, 'gamma': 0.001, 'kernel': 'rbf'}

Classification:
Split                          Accuracy             Cross validation
90.0/10.0                      0.24193548387096775  nan            
80.0/20.0                      0.20967741935483872  nan            
70.0/30.0                      0.23655913978494625  nan            
nan


Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_base.py", line 810, in predict
    y = super().predict(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_base.py", line 433, in pr

In [6]:
from sklearn.model_selection import LeavePGroupsOut, cross_val_score

# Run by subject
X = features_array
y = label_array

cv = StratifiedGroupKFold(n_splits=5)

feature_sm = get_feature_selection_model(X, y)

# Apply the Feature Selection Model without scaling the data
X = feature_sm.transform(X)
n_features_before = np.shape(features_array)
print("Feature transformation - number of features: Before {} - After {}".format(np.shape(features_array)[1], np.shape(X)[1]))

classifiers = [
    ["Random Forest", RandomForestClassifier(random_state=random_state, max_features='log2', n_estimators= 200, max_depth=8, criterion='entropy')],
    ["Linear SVC", LinearSVC(random_state=random_state, max_iter=10000, C=0.0005)],
    ["SVC", SVC(random_state=random_state, max_iter=10000, C=10, kernel='linear')],
]

for cls in classifiers:
    acc_list = []
    cross_v_list = []
    subject_list = []
    # Run the same classifier using the best parameters

    leave_pgo = LeavePGroupsOut(n_groups=9)

    for train_index, test_index in leave_pgo.split(X, y, group_array):
        subject_list.append(group_array[train_index[0]])
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ss = StandardScaler()
        x_train = ss.fit_transform(x_train)
        x_test = ss.transform(x_test)

        cls[1].fit(x_train, y_train)
        y_pred = cls[1].predict(x_test)
        acc_list.append(metrics.accuracy_score(y_test, y_pred))
        cross_v_list.append(run_cross_validation(cls[1], cv, x_test, y_test, group_array[test_index]))

    print('\n{}: {} '.format("Classifier", cls[0]))
    for index, sub in enumerate(subject_list):
        print("Subject: {} - Accuracy {} - Cross validation {}".format(sub, acc_list[index], cross_v_list[index]))

Feature transformation - number of features: Before 1536 - After 656


ValueError: The groups parameter contains fewer than (or equal to) n_groups (9) numbers of unique groups ([1 2 3]). LeavePGroupsOut expects that at least n_groups + 1 (10) unique groups be present

In [7]:
from sklearn.model_selection import LeaveOneGroupOut

# Leave One Out (LOO)
X = features_array
y = label_array

cv = StratifiedGroupKFold(n_splits=5)

feature_sm = get_feature_selection_model(X, y)

# Apply the Feature Selection Model without scaling the data
X = feature_sm.transform(X)
n_features_before = np.shape(features_array)
print("Feature transformation - number of features: Before {} - After {}".format(np.shape(features_array)[1], np.shape(X)[1]))

classifiers = [
    ["Random Forest", RandomForestClassifier(random_state=random_state, max_features='log2', n_estimators= 200, max_depth=8, criterion='entropy')],
    ["Neural Network", MLPClassifier(random_state=random_state, alpha=1e-09, hidden_layer_sizes=10, max_iter=1800, solver='lbfgs')],
    ["Linear SVC", LinearSVC(random_state=random_state, max_iter=10000, C=10)],
    ["SVC", SVC(random_state=random_state, max_iter=10000, C=10, kernel='linear')],
]

for cls in classifiers:
    acc_list = []
    cross_v_list = []
    group_list = []
    # Run the same classifier using the best parameters

    leave_oo = LeaveOneGroupOut()

    for train_index, test_index in leave_oo.split(X, y, group_array):
        group_list.append(group_array[test_index[0]])
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ss = StandardScaler()
        x_train = ss.fit_transform(x_train)
        x_test = ss.transform(x_test)

        cls[1].fit(x_train, y_train)
        y_pred = cls[1].predict(x_test)
        acc_list.append(metrics.accuracy_score(y_test, y_pred))
        cross_v_list.append(run_cross_validation(cls[1], cv, x_train, y_train, group_array[train_index]))

    print('\n{}: {} '.format("Classifier", cls[0]))
    for index, gp in enumerate(group_list):
        print("Group out: {} - Accuracy {} - Cross validation {}".format(gp, acc_list[index], cross_v_list[index]))

Feature transformation - number of features: Before 1536 - After 656


Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\ensemble\_forest.py", line 832, in predict
    proba = self.predict_proba(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\ensemble\_forest


Classifier: Random Forest 
Group out: 1 - Accuracy 0.26 - Cross validation nan
Group out: 2 - Accuracy 0.23333333333333334 - Cross validation nan
Group out: 3 - Accuracy 0.2777777777777778 - Cross validation nan


Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 1176, in predict
    y_pred = self._forward_pass_fast(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packa


Classifier: Neural Network 
Group out: 1 - Accuracy 0.28 - Cross validation nan
Group out: 2 - Accuracy 0.26666666666666666 - Cross validation nan
Group out: 3 - Accuracy 0.22777777777777777 - Cross validation nan


Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_model\_base.py", line 447, in predict
    scores = self.decision_function(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_mo


Classifier: Linear SVC 
Group out: 1 - Accuracy 0.31 - Cross validation nan
Group out: 2 - Accuracy 0.32083333333333336 - Cross validation nan
Group out: 3 - Accuracy 0.39444444444444443 - Cross validation nan


Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_base.py", line 810, in predict
    y = super().predict(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_base.py", line 433, in pr


Classifier: SVC 
Group out: 1 - Accuracy 0.325 - Cross validation nan
Group out: 2 - Accuracy 0.30416666666666664 - Cross validation nan
Group out: 3 - Accuracy 0.32222222222222224 - Cross validation nan


Traceback (most recent call last):
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 261, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 71, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_base.py", line 810, in predict
    y = super().predict(X)
  File "C:\Users\hazem\AppData\Roaming\Python\Python310\site-packages\sklearn\svm\_base.py", line 433, in pr