In [1]:
import os
import pickle
import numpy as np
from mne import viz
from scipy import io, stats
from matplotlib import colors, pyplot as plt
from sklearn import model_selection, ensemble, metrics

In [2]:
# Caffeine dose identifier; it is interpolated into the feature and result
# folder names below.
CAF_DOSE = 200

# Locations of the input features, the output folder and the project data.
# The defaults are the original machine-specific paths; each one can be
# overridden with an environment variable so the notebook runs on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    'CAFFEINE_DATA_PATH',
    'C:\\Users\\Philipp\\Documents\\Caffeine\\Features{dose}\\Combined'.format(dose=CAF_DOSE))
RESULTS_PATH = os.environ.get(
    'CAFFEINE_RESULTS_PATH',
    'C:\\Users\\Philipp\\GoogleDrive\\Caffeine\\results\\randomForest{dose}'.format(dose=CAF_DOSE))
# os.path.join yields '..\\data' on Windows (same as before) and '../data'
# elsewhere, instead of a hard-coded backslash separator.
PROJECT_PATH = os.environ.get('CAFFEINE_PROJECT_PATH', os.path.join(os.pardir, 'data'))

# Keys used to index the per-stage dictionaries (data, labels, groups).
STAGES = ['AWA', 'AWSL', 'NREM', 'REM']

In [3]:
# Read the 'Cor' variable from the Coo_caf MATLAB file in PROJECT_PATH.
# Presumably 'Cor' holds one coordinate pair per sensor -- TODO confirm.
cor_transposed = io.loadmat(os.path.join(PROJECT_PATH, 'Coo_caf'))['Cor'].T

# Take rows 1 and 0 of the transposed array (in that order) and transpose
# back, i.e. the first two coordinate columns end up swapped.
sensor_pos = cor_transposed[[1, 0]].T

In [4]:
def _load_pickle(file_name):
    """Return the object unpickled from `file_name` inside DATA_PATH.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(os.path.join(DATA_PATH, file_name), 'rb') as file:
        return pickle.load(file)


# Features, labels and group assignments; later cells index each of them by
# sleep stage.
data = _load_pickle('data_avg.pickle')
y = _load_pickle('labels_avg.pickle')
groups = _load_pickle('groups_avg.pickle')

In [5]:
def _is_kept(feature):
    """Return True unless the feature name contains 'Perm' or 'SpecSamp'."""
    return not ('Perm' in feature or 'SpecSamp' in feature)


# One name per column: '<feature>-<i>' for i in 0..19, in the key order of
# the first stage.
# NOTE(review): the 20 presumably matches the number of columns each feature
# contributes to x -- TODO confirm against the data.
name_groups = []
for feature in data[STAGES[0]]:
    if _is_kept(feature):
        name_groups.append(['-'.join((feature, str(i))) for i in range(20)])
feature_names = np.concatenate(name_groups)

# Per-stage design matrix: the kept feature arrays joined along axis 1.
x = {}
for stage in STAGES:
    kept_arrays = [values for feature, values in data[stage].items() if _is_kept(feature)]
    x[stage] = np.concatenate(kept_arrays, axis=1)

In [12]:
# Nested cross-validation of a random forest, run separately for every stage.
# Outer loop: `iterations` randomly chosen leave-4-groups-out splits.
# Inner loop: 10-fold GroupKFold grid search on the training part.
# The fitted estimators, their held-out test data and the feature names are
# pickled into RESULTS_PATH at the end.
iterations = 100

# Fixed seed for the split sampling and the forests so that re-running the
# notebook reproduces the same estimators and scores.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

# Hyper-parameter grid for the inner grid search (currently one candidate).
# A wider grid tried earlier covered n_estimators [10, 50, 150, 300],
# max_depth [10, 40, None], criterion ['gini', 'entropy'], bootstrap,
# max_features and min_samples_leaf [5, 25, 60].
params = {
    'n_estimators': [50],
    'max_depth': [10],
    'criterion': ['gini'],
    'class_weight': ['balanced_subsample']
}

# Create the output folder up front so a missing directory fails now rather
# than after the whole training run.
os.makedirs(RESULTS_PATH, exist_ok=True)

estimator_dict = {}
testing_data_dict = {}

for stage in STAGES:
    testing_data = []
    estimators = []
    avg_score = []

    print(f'Training on {len(x[stage])} samples')

    # Enumerate every way of holding out 4 groups, then visit the splits in
    # random order; only the first `iterations` of them are evaluated.
    cv = model_selection.LeavePGroupsOut(n_groups=4)
    cv_split = list(cv.split(x[stage], y[stage], groups[stage]))
    for counter, split_index in enumerate(rng.permutation(len(cv_split))):
        if counter % 50 == 0:
            print(f'{stage} iteration {counter}/{iterations}')
        if counter >= iterations:
            break
        train, test = cv_split[split_index]

        # A fresh integer seed per forest, drawn from the seeded generator.
        clf_seed = int(rng.randint(np.iinfo(np.int32).max))
        clf = ensemble.RandomForestClassifier(n_jobs=-1, random_state=clf_seed)

        # The splitter is handed to GridSearchCV directly and receives the
        # groups through fit(). `iid` is no longer passed: it was removed from
        # scikit-learn, and with a single candidate in the grid it never
        # influenced which model was selected.
        kfold_inner = model_selection.GroupKFold(n_splits=10)
        grid_search = model_selection.GridSearchCV(estimator=clf,
                                                   param_grid=params,
                                                   cv=kfold_inner,
                                                   refit=True,
                                                   n_jobs=-1)
        # `groups` must be a keyword argument in recent scikit-learn releases.
        grid_search.fit(x[stage][train], y[stage][train], groups=groups[stage][train])

        # Keep the held-out data next to the refitted estimator so later
        # analysis can re-score it.
        testing_data.append((x[stage][test], y[stage][test]))
        estimators.append(grid_search.best_estimator_)
        avg_score.append(grid_search.best_estimator_.score(x[stage][test], y[stage][test]))

    testing_data_dict[stage] = testing_data
    estimator_dict[stage] = estimators

    print('mean score:', np.mean(avg_score), '\n')

with open(os.path.join(RESULTS_PATH, 'estimators.pickle'), 'wb') as file:
    pickle.dump(estimator_dict, file)
with open(os.path.join(RESULTS_PATH, 'testing_data.pickle'), 'wb') as file:
    pickle.dump(testing_data_dict, file)
with open(os.path.join(RESULTS_PATH, 'feature_names.pickle'), 'wb') as file:
    pickle.dump(feature_names, file)

Training on 78 samples
AWA iteration 0/100
AWA iteration 50/100
AWA iteration 100/100
mean score: 0.5225 

Training on 74 samples
AWSL iteration 0/100
AWSL iteration 50/100
AWSL iteration 100/100
mean score: 0.56875 

Training on 78 samples
NREM iteration 0/100
NREM iteration 50/100
NREM iteration 100/100
mean score: 0.57125 

Training on 78 samples
REM iteration 0/100
REM iteration 50/100
REM iteration 100/100
mean score: 0.5375 

