In [1]:
# adapted from: http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the per-split test scores.

    ``models`` maps a label to an estimator instance and ``params`` maps the
    same labels to the parameter grid passed to ``GridSearchCV`` for that
    estimator. After ``fit`` the fitted searches are available in
    ``self.grid_searches`` under the same labels.
    """

    def __init__(self, models, params):
        """Store the estimators and their parameter grids.

        Raises:
            ValueError: if any key of ``models`` has no entry in ``params``.
        """
        # Preserve the order of ``models`` so the error message is deterministic.
        missing_params = [key for key in models if key not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every registered estimator.

        All keyword arguments are forwarded unchanged to ``GridSearchCV``;
        train scores are always requested. Each fitted search is stored in
        ``self.grid_searches`` under the estimator's label.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key],
                              cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return a DataFrame with one row per (estimator, parameter setting).

        Each row holds the min / mean / max / std of the per-split test scores
        plus the parameter values of that candidate. Rows are sorted by
        ``sort_by`` in descending order; parameters that do not apply to an
        estimator are NaN.

        Raises:
            ValueError: if no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid searches available; call fit() before score_summary().")

        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': np.min(scores),
                 'max_score': np.max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            # Summary fields win over a parameter that happens to share a name.
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            results = gs.cv_results_
            params = results['params']

            # ``cv`` may be a splitter object or an iterable rather than an int,
            # so prefer the split count recorded by the fitted search and only
            # fall back to ``cv`` when that attribute is unavailable.
            n_splits = getattr(gs, 'n_splits_', None)
            if n_splits is None:
                n_splits = gs.cv

            # Shape (n_candidates, n_splits): one row of split scores per candidate.
            all_scores = np.column_stack(
                [np.asarray(results["split{}_test_score".format(i)]) for i in range(n_splits)]
            )
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [2]:
from __future__ import print_function
import pickle
import os
import scipy.io
from scipy import stats

import pandas as pd
from numpy import *

import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
import pandas as pd
import argparse
from sklearn.model_selection import GridSearchCV, cross_val_score,cross_val_predict,StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

## Read the data and clean it
def argumentparser():
    """Build the command-line parser; it exposes a single integer ``--dataset`` option (default 1)."""
    cli = argparse.ArgumentParser(description='PyTorch Connectome CNN')
    # hyper-parameters
    cli.add_argument(
        '--dataset',
        default=1,
        type=int,
        help='select a dataset (1:connectome, 2: connectome + morphometry)',
    )
    return cli
def data_fetch_clean(file, type, first_feature_col=11):
    """Load a CSV and return z-scored features together with one label column.

    Args:
        file: path of a CSV file whose first row is the header.
        type: zero-based index of the column used as the label vector ``y``
            (the name shadows the builtin but is kept for existing callers).
            Per the original notes: 5: ad-smi / 6: mci-smi / 7: adonly-smi /
            8: ad-mci / 9: adonly-mci / 10: adonly - adwithsmallvv.
        first_feature_col: zero-based index of the first feature column; every
            column from here to the last one is treated as a feature.
            Defaults to 11, the value previously hard-coded.

    Returns:
        ``(X, y)`` where rows whose label is NaN have been dropped from both,
        ``X`` holds column-wise z-scores, and any NaN remaining in ``X`` has
        been replaced by the median of the non-NaN entries of ``X``.
    """
    dd = pd.read_csv(file, header=0)
    print(dd.shape)

    # NOTE(review): assumes every column is numeric, otherwise this array is
    # object-typed and the NaN checks below will fail - confirm for new files.
    data = np.array(dd)

    idx_IN_columns = np.arange(first_feature_col, data.shape[1])
    print(idx_IN_columns)

    # The z-score is taken over all rows, i.e. before rows with a missing label
    # are removed; this order is kept so existing results do not change.
    X = stats.zscore(data[:, idx_IN_columns])
    y = data[:, type]

    # Keep only the rows that have a label for the selected comparison.
    labelled = ~np.isnan(y)
    X = X[labelled, :]
    y = y[labelled]

    # Replace feature values that are still NaN after z-scoring with the
    # median over all non-NaN entries of X (a single global value).
    missing = np.isnan(X)
    if missing.any():
        X[missing] = np.median(X[~missing])
    return X, y

#classification and cross validation 

In [3]:
from sklearn.svm import SVC
from sklearn import linear_model

In [4]:
# Human-readable names of the six pairwise comparisons; entry k belongs to
# label column 5 + k of the CSV.
save_name=["AD vs SMC","MCI vs SMC","ADonly vs SMC","AD vs MCI","ADonly vs MCI","ADonly vs ADwithsmallvv"]
 #5: ad-smi / 6:mci-smi / 7:adonly-smi / 8:ad-mci / 9:adonly-mci / 10:adonly - adwithsmallvv

filename='data_3_all.csv'
cwd=os.getcwd()
os.chdir('../braindata')

# Label column analysed in this cell (5 = "AD vs SMC", see the list above).
LABEL_COLUMN = 5
# Fixed seed so the grid-search scores are reproducible from run to run.
RANDOM_STATE = 0

X,y=data_fetch_clean(filename,LABEL_COLUMN)

# Candidate estimators, keyed by the label shown in the summary table.
models1 = { 
    
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVC': SVC(random_state=RANDOM_STATE),
    'linear_model.LogisticRegression':linear_model.LogisticRegression(random_state=RANDOM_STATE)
    
}

# Parameter grids, keyed like models1. A list of dicts means the grids are
# searched independently rather than as one combined product.
params1 = { 
            'RandomForestClassifier': [{ 'n_estimators': np.arange(10, 500, 50) },
                                       {'min_samples_leaf': np.arange(1, 51, 5)},
                                      ],
    'SVC': [
        {'kernel': ['linear'], 'C': [0.001,0.01,0.1,1, 10]},
    ],
    'linear_model.LogisticRegression':{'C':[0.001, 0.01, 0.1, 1, 10]}
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X,y, scoring='accuracy', n_jobs=-1)
helper1.score_summary(sort_by='mean_score')

(208, 34657)
[   11    12    13 ... 34654 34655 34656]


  return (a - mns) / sstd


Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.5s


Running GridSearchCV for SVC.
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   10.9s finished


Running GridSearchCV for linear_model.LogisticRegression.
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    5.9s finished


RandomForestClassifier
SVC
linear_model.LogisticRegression


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    7.6s finished
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,kernel,min_samples_leaf,n_estimators
0,RandomForestClassifier,0.959184,0.986395,1.0,0.019241,,,,10.0
28,linear_model.LogisticRegression,0.979592,0.986395,1.0,0.0096205,1.0,,,
27,linear_model.LogisticRegression,0.979592,0.986395,1.0,0.0096205,0.1,,,
26,linear_model.LogisticRegression,0.979592,0.986395,1.0,0.0096205,0.01,,,
25,linear_model.LogisticRegression,0.979592,0.986395,1.0,0.0096205,0.001,,,
24,SVC,0.959184,0.986395,1.0,0.019241,10.0,linear,,
23,SVC,0.959184,0.986395,1.0,0.019241,1.0,linear,,
22,SVC,0.959184,0.986395,1.0,0.019241,0.1,linear,,
21,SVC,0.959184,0.986395,1.0,0.019241,0.01,linear,,
20,SVC,0.959184,0.986395,1.0,0.019241,0.001,linear,,


In [5]:
#helper1.score_summary(sort_by='mean_score')

In [6]:
# Print the integers 5 through 10 (inclusive), one per line.
for i in range(5, 10 + 1):
    print(i)

5
6
7
8
9
10
