In [13]:
from sklearn.preprocessing import FunctionTransformer

from reskit.norms import binar_norm, wbysqdist
from reskit.norms import spectral_norm

from reskit.features import degrees,  pagerank

from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier 
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import os
import pandas as pd
import numpy as np

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

def orig(x):
    return x

In [2]:
from reskit.core import Transformer, Pipeliner

### Функция считывания данных

In [3]:
def get_autism(path_to_read='Data/dti/', distances=True):
    def get_autism_distances(loc_name):
        with open(loc_name, 'r') as f:
            read_data = f.readlines()

        read_data = pd.DataFrame(
            np.array([np.array(item[:-1].split()).astype(int) for item in read_data]))

        return read_data

    def get_distance_matrix(coords):
        if type(coords) == pd.core.frame.DataFrame:
            coords = coords.values
        elif type(coords) != np.ndarray:
            print('Provide either pandas df or numpy array!')
            return -1

        shape = len(coords)
        dist_matrix = np.zeros((shape, shape))
        del shape
        for i in range(len(coords)):
            for j in range(i + 1, len(coords)):
                dist_matrix[i, j] = np.linalg.norm(coords[i, :] - coords[j, :])
                dist_matrix[j, i] = dist_matrix[i, j]
        return dist_matrix

    target_vector = []  # this will be a target vector (diagnosis)
    matrices = []  # this will be a list of connectomes
    all_files = sorted(os.listdir(path_to_read))
    matrix_files = [
        item for item in all_files if 'DTI_connectivity' in item and 'All' not in item]
    distance_files = [
        item for item in all_files if 'DTI_region_xyz_centers' in item and 'All' not in item]

    # for each file in a sorted (!) list of files:
    for filename in matrix_files:

        A_dataframe = pd.read_csv(
            path_to_read + filename, sep='   ', header=None, engine='python')
        A = A_dataframe.values  # we will use a list of numpy arrays, NOT pandas dataframes
        matrices.append(A)# append a matrix to our list
        if "ASD" in filename:
            target_vector.append(1)
        elif "TD" in filename:
            target_vector.append(0)
    asd_dict = {}
    asd_dict['X'] = np.array(matrices)
    asd_dict['y'] = np.array(target_vector)
    if distances:
        dist_matrix_list = []
        for item in distance_files:
            # print(item)
            cur_coord = get_autism_distances(path_to_read + item)
            cur_dist_mtx = get_distance_matrix(cur_coord)
            dist_matrix_list += [cur_dist_mtx]

        asd_dict['dist'] = np.array(dist_matrix_list)

    return asd_dict


### Функция понижения ранга матрицы

In [None]:
def matrix_eig(data, k = 0):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        curs, vecs = np.linalg.eig(data['X'][i])
        indeces_del = range(curs.size)[(curs.size - k):]
        new_data['X'][i] = np.delete(vecs*curs, indeces_del, axis=1).astype('float')
    return new_data

## Сделаем один пайплайн

In [None]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

In [None]:
data = 'Data/dti/'
data = Transformer(get_autism).fit_transform(data)

In [None]:
data = Transformer(matrix_eig, {'data': data, 'k': 40}).fit_transform(data)
data = Transformer(degrees, collect=['degrees']).fit_transform(data)

In [None]:
print data

In [None]:
X, y = data 
print X.shape, y.shape

In [None]:
steps = [('selector', VarianceThreshold()), ('scaler', MinMaxScaler()), ('classifier', LogisticRegression())] 
pipeline = Pipeline(steps)

In [None]:
param_grid = dict(classifier__penalty=['l1', 'l2'])
scoring = 'roc_auc'
grid_clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scoring, n_jobs=-1, cv=grid_cv)
grid_clf.fit(X, y)

In [None]:
steps[-1] = steps[-1][0], grid_clf.best_estimator_
pipeline = Pipeline(steps)
scores = cross_val_score(pipeline, X, y, scoring=scoring, cv=eval_cv, n_jobs=-1)
np.mean(scores), np.std(scores)

### Приведу некоторые результаты для различных К

|       scores       |         std        |  k |
|      :------:      |        :---:       | :-:|
| 0.59233333333333338| 0.12568081264324588|  0 |
| 0.59233333333333338| 0.12568081264324588|  1 | 
| 0.59233333333333338| 0.12568081264324588|  2 | 
| 0.59433333333333338| 0.13016271867679058| 10 |
| 0.61133333333333328| 0.16110865898517063| 20 |
| 0.63233333333333341| 0.16096272860510288| 30 |
| 0.63233333333333341| 0.16096272860510288| 40 |
| 0.63233333333333341| 0.16096272860510288| 45 |
| 0.54400000000000004| 0.27115309328864384| 50 |
| 0.21695468036742913| 0.22945079356294823| 75 |
| 0.49466666666666664| 0.21695468036742913| 100|

## Попробуем сделать это, используя класс Papiliner

Здесь возникают проблемы, из-за того, что работая с один пайплайном мы явно можем задать парамеры функции. Тут же сделать это сложнее.

Есть несколько решений:  
1. Задавать k по дефолту в функции  
2. Задавать параметр data функции matrix_eig через стороннюю переменную, расчитанную ранее

На мой взгляд, лучши решением будет первый вариант. Поэтому на нем я и остановился

In [4]:
def orig_vec(data):
    matrices = []
    for i in  data['X']:
        matrices.append(np.hstack(i))
    data['X_vec'] = matrices
    return data

In [5]:
def matrix_eig_k(data, k = 30):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        curs, vecs = np.linalg.eig(data['X'][i])
        indeces_del = range(curs.size)[(curs.size - k):]
        new_data['X'][i] = np.delete(vecs, indeces_del, axis=1).astype('float')
    return orig_vec(new_data)

In [7]:
def matrix_svd_50(data, k = 50):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        A, B, C = np.linalg.svd(data['X'][i])
        indeces_del = range(B.size)[(B.size - k):]
        A_new = np.delete(A, indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=0)
        C_new = np.delete(C, indeces_del, axis=1)
        new_data['X'][i] = A_new.dot(B_new).dot(C_new).astype('float')
    return orig_vec(new_data)

def matrix_svd_100(data, k = 100):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        A, B, C = np.linalg.svd(data['X'][i])
        indeces_del = range(B.size)[(B.size - k):]
        A_new = np.delete(A, indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=0)
        C_new = np.delete(C, indeces_del, axis=1)
        new_data['X'][i] = A_new.dot(B_new).dot(C_new).astype('float')
    return orig_vec(new_data)

def matrix_svd_150(data, k = 150):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        A, B, C = np.linalg.svd(data['X'][i])
        indeces_del = range(B.size)[(B.size - k):]
        A_new = np.delete(A, indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=0)
        C_new = np.delete(C, indeces_del, axis=1)
        new_data['X'][i] = A_new.dot(B_new).dot(C_new).astype('float')
    return orig_vec(new_data)

def matrix_svd_200(data, k = 200):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        A, B, C = np.linalg.svd(data['X'][i])
        indeces_del = range(B.size)[(B.size - k):]
        A_new = np.delete(A, indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=0)
        C_new = np.delete(C, indeces_del, axis=1)
        new_data['X'][i] = A_new.dot(B_new).dot(C_new).astype('float')
    return orig_vec(new_data)

def matrix_svd_250(data, k = 250):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        A, B, C = np.linalg.svd(data['X'][i])
        indeces_del = range(B.size)[(B.size - k):]
        A_new = np.delete(A, indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=1)
        B_new = np.delete(np.diag(B), indeces_del, axis=0)
        C_new = np.delete(C, indeces_del, axis=1)
        new_data['X'][i] = A_new.dot(B_new).dot(C_new).astype('float')
    return orig_vec(new_data)

In [44]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

#Only low_rank
weighters = [('origW', Transformer(orig)),
             #('binar', Transformer(binar_norm)),
             #('wbysqdist', Transformer(wbysqdist)),
            ]


normalizers = [#('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [('origF', Transformer(orig_vec, collect=['X_vec'])),
               ('svd_50', Transformer(matrix_svd_50, collect=['X_vec'])),
               ('svd_100', Transformer(matrix_svd_100, collect=['X_vec'])),
               ('svd_150', Transformer(matrix_svd_150, collect=['X_vec'])),
               ('svd_200', Transformer(matrix_svd_200, collect=['X_vec'])),
               ('svd_250', Transformer(matrix_svd_250, collect=['X_vec'])),
               #('degrees', Transformer(degrees, collect=['degrees'])),
               #('degrees_eig', Transformer(degrees_eig, collect=['degrees'])),
               ('low_rank', Transformer(matrix_eig_k, collect=['X_vec']))
               ]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [('LR', LogisticRegression()),
               #('RF', RandomForestClassifier()),
               #('SVC', SVC()),
               #('XGB', XGBClassifier(nthread=1)),
               #('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [#('UCLAsource', 'origN'),
                 #('UCLAsource', 'origF'),
                 ('UCLAbaseline', 'degrees'),
                 ('UCLAbaseline', 'binar'),
                 ('UCLAbaseline', 'wbysqdist'),
                 ('UCLAbaseline', 'spectral'),
                 ('UCLAbaseline', 'low_rank'),
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax')]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree=[0.01] + [0.05*i for i in range(1,21)],
        learning_rate=[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth=[i for i in range(1,12)],
        n_estimators=[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,origW,spectral,origF,var_threshold,minmax,LR
1,UCLAsource,origW,spectral,svd_50,var_threshold,minmax,LR
2,UCLAsource,origW,spectral,svd_100,var_threshold,minmax,LR
3,UCLAsource,origW,spectral,svd_150,var_threshold,minmax,LR
4,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,LR
5,UCLAsource,origW,spectral,svd_250,var_threshold,minmax,LR
6,UCLAsource,origW,spectral,low_rank,var_threshold,minmax,LR


In [45]:
pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Line: 1/7
Line: 2/7
Line: 3/7
Line: 4/7
Line: 5/7
Line: 6/7
Line: 7/7


Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,origW,spectral,origF,var_threshold,minmax,LR,0.579433,0.187257,"{'penalty': 'l2', 'C': 0.1, 'max_iter': 50}",0.557667,0.175297,[ 0.56666667 0.56 0.6 0.75 ...
1,UCLAsource,origW,spectral,svd_50,var_threshold,minmax,LR,0.531915,0.173642,"{'penalty': 'l1', 'C': 0.35000000000000003, 'm...",0.575,0.162311,[ 0.6 0.76 0.64 0.85 0.75 0.45 0.35 0....
2,UCLAsource,origW,spectral,svd_100,var_threshold,minmax,LR,0.577128,0.1531,"{'penalty': 'l2', 'C': 0.25, 'max_iter': 50}",0.571667,0.224154,[ 0.76666667 0.76 0.24 0.75 ...
3,UCLAsource,origW,spectral,svd_150,var_threshold,minmax,LR,0.679433,0.168034,"{'penalty': 'l1', 'C': 0.25, 'max_iter': 50}",0.558333,0.206184,[ 0.43333333 0.76 0.64 0.75 ...
4,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,LR,0.707801,0.142738,"{'penalty': 'l1', 'C': 0.65, 'max_iter': 50}",0.61,0.183793,[ 0.7 0.52 0.28 0.9 0.7 0.65 0.75 0....
5,UCLAsource,origW,spectral,svd_250,var_threshold,minmax,LR,0.562411,0.195565,"{'penalty': 'l1', 'C': 1.0, 'max_iter': 500}",0.525,0.185755,[ 0.4 0.72 0.48 0.35 0.45 0.95 0.65 0....
6,UCLAsource,origW,spectral,low_rank,var_threshold,minmax,LR,0.503369,0.170741,"{'penalty': 'l2', 'C': 0.01, 'max_iter': 50}",0.525333,0.0929301,[ 0.63333333 0.64 0.48 0.5 ...


### Посмотрим, как влияет разные способы взвешивания и нормализации

In [50]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

#Only low_rank
weighters = [('origW', Transformer(orig)),
             ('binar', Transformer(binar_norm)),
             ('wbysqdist', Transformer(wbysqdist)),
            ]


normalizers = [('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [('origF', Transformer(orig_vec, collect=['X_vec'])),
               #('svd_150', Transformer(matrix_svd_150, collect=['X_vec'])),
               ('svd_200', Transformer(matrix_svd_200, collect=['X_vec'])),
               ]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [('LR', LogisticRegression()),
               #('RF', RandomForestClassifier()),
               #('SVC', SVC()),
               #('XGB', XGBClassifier(nthread=1)),
               #('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [#('UCLAsource', 'origN'),
                 #('UCLAsource', 'origF'),
                 ('UCLAbaseline', 'degrees'),
                 ('UCLAbaseline', 'binar'),
                 ('UCLAbaseline', 'wbysqdist'),
                 ('UCLAbaseline', 'spectral'),
                 ('UCLAbaseline', 'low_rank'),
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax')]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree=[0.01] + [0.05*i for i in range(1,21)],
        learning_rate=[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth=[i for i in range(1,12)],
        n_estimators=[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,origW,origN,origF,var_threshold,minmax,LR
1,UCLAsource,origW,origN,svd_200,var_threshold,minmax,LR
2,UCLAsource,origW,spectral,origF,var_threshold,minmax,LR
3,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,LR
4,UCLAsource,binar,origN,origF,var_threshold,minmax,LR
5,UCLAsource,binar,origN,svd_200,var_threshold,minmax,LR
6,UCLAsource,binar,spectral,origF,var_threshold,minmax,LR
7,UCLAsource,binar,spectral,svd_200,var_threshold,minmax,LR
8,UCLAsource,wbysqdist,origN,origF,var_threshold,minmax,LR
9,UCLAsource,wbysqdist,origN,svd_200,var_threshold,minmax,LR


In [51]:
pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Line: 1/12
Line: 2/12
Line: 3/12
Line: 4/12
Line: 5/12
Line: 6/12
Line: 7/12
Line: 8/12
Line: 9/12
Line: 10/12
Line: 11/12
Line: 12/12


Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,origW,origN,origF,var_threshold,minmax,LR,0.607801,0.157715,"{'penalty': 'l2', 'C': 0.1, 'max_iter': 50}",0.593333,0.159332,[ 0.63333333 0.56 0.64 0.85 ...
1,UCLAsource,origW,origN,svd_200,var_threshold,minmax,LR,0.630142,0.210373,"{'penalty': 'l2', 'C': 0.05, 'max_iter': 50}",0.628667,0.179005,[ 0.96666667 0.56 0.36 0.8 ...
2,UCLAsource,origW,spectral,origF,var_threshold,minmax,LR,0.579433,0.187257,"{'penalty': 'l2', 'C': 0.1, 'max_iter': 50}",0.557667,0.175297,[ 0.56666667 0.56 0.6 0.75 ...
3,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,LR,0.703901,0.141526,"{'penalty': 'l1', 'C': 0.65, 'max_iter': 50}",0.602,0.206485,[ 0.7 0.52 0.2 0.95 0.7 0.65 0.75 0....
4,UCLAsource,binar,origN,origF,var_threshold,minmax,LR,0.557801,0.117517,"{'penalty': 'l1', 'C': 0.55, 'max_iter': 50}",0.511667,0.126405,[ 0.56666667 0.64 0.36 0.5 ...
5,UCLAsource,binar,origN,svd_200,var_threshold,minmax,LR,0.647872,0.145959,"{'penalty': 'l2', 'C': 0.01, 'max_iter': 50}",0.522,0.143583,[ 0.4 0.84 0.48 0.45 0.4 0.6 0.6 0....
6,UCLAsource,binar,spectral,origF,var_threshold,minmax,LR,0.594149,0.174925,"{'penalty': 'l2', 'C': 0.30000000000000004, 'm...",0.544333,0.127184,[ 0.53333333 0.68 0.48 0.75 ...
7,UCLAsource,binar,spectral,svd_200,var_threshold,minmax,LR,0.716667,0.100271,"{'penalty': 'l1', 'C': 0.35000000000000003, 'm...",0.617,0.173496,[ 0.7 0.52 0.4 0.6 0.7 0.6 0.65 0....
8,UCLAsource,wbysqdist,origN,origF,var_threshold,minmax,LR,0.607801,0.157715,"{'penalty': 'l2', 'C': 0.1, 'max_iter': 50}",0.593333,0.159332,[ 0.63333333 0.56 0.64 0.85 ...
9,UCLAsource,wbysqdist,origN,svd_200,var_threshold,minmax,LR,0.601596,0.157347,"{'penalty': 'l1', 'C': 0.65, 'max_iter': 500}",0.605333,0.148513,[ 0.53333333 0.8 0.52 0.65 ...


### Используем более мощные классификаторы

In [21]:
def matrix_eig_30(data, k = 30):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        curs, vecs = np.linalg.eig(data['X'][i])
        indeces_del = range(curs.size)[(curs.size - k):]
        new_data['X'][i] = np.delete(vecs, indeces_del, axis=1).astype('float')
    return orig_vec(new_data)


In [23]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

#Only low_rank
weighters = [('origW', Transformer(orig)),
             #('binar', Transformer(binar_norm)),
             #('wbysqdist', Transformer(wbysqdist)),
            ]


normalizers = [#('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [#('origF', Transformer(orig_vec, collect=['X_vec'])),
               #('svd_150', Transformer(matrix_svd_150, collect=['X_vec'])),
               ('svd_200', Transformer(matrix_svd_200, collect=['X_vec'])),
               ('low_rank_30', Transformer(matrix_eig_40, collect=['X_vec'])),
               ]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [('LR', LogisticRegression()),
               #('RF', RandomForestClassifier()),
               ('SVC', SVC()),
               #('XGB', XGBClassifier(nthread=1)),
               ('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [#('UCLAsource', 'origN'),
                 #('UCLAsource', 'origF'),
                 ('UCLAbaseline', 'degrees'),
                 ('UCLAbaseline', 'binar'),
                 ('UCLAbaseline', 'wbysqdist'),
                 ('UCLAbaseline', 'spectral'),
                 ('UCLAbaseline', 'low_rank'),
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax')]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree=[0.01] + [0.05*i for i in range(1,21)],
        learning_rate=[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth=[i for i in range(1,12)],
        n_estimators=[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,LR
1,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,SVC
2,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,SGD
3,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,LR
4,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SVC
5,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SGD


In [19]:
pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Removed previous results file -- results.csv.
Line: 1/6
Line: 2/6
Line: 3/6
Line: 4/6
Line: 5/6
Line: 6/6


Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,LR,0.703369,0.154357,"{'penalty': 'l1', 'C': 0.6000000000000001, 'ma...",0.612,0.210371,[ 0.8 0.48 0.24 0.9 0.7 0.7 0.65 0....
1,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,SVC,0.562766,0.208227,"{'kernel': 'poly', 'C': 0.4, 'max_iter': 50, '...",0.583333,0.120416,[ 0.73333333 0.8 0.4 0.55 ...
2,UCLAsource,origW,spectral,svd_200,var_threshold,minmax,SGD,0.696277,0.125825,"{'penalty': 'elasticnet', 'loss': 'modified_hu...",0.634,0.17408,[ 0.7 0.6 0.24 0.95 0.6 0.7 0.75 0....
3,UCLAsource,origW,spectral,low_rank_40,var_threshold,minmax,LR,0.49539,0.162425,"{'penalty': 'l2', 'C': 0.30000000000000004, 'm...",0.543667,0.0928493,[ 0.66666667 0.64 0.48 0.5 ...
4,UCLAsource,origW,spectral,low_rank_40,var_threshold,minmax,SVC,0.530674,0.172475,"{'kernel': 'rbf', 'C': 0.001, 'max_iter': 50, ...",0.541,0.107559,[ 0.7 0.64 0.52 0.6 0.4 0.4 0.5 0....
5,UCLAsource,origW,spectral,low_rank_40,var_threshold,minmax,SGD,0.564184,0.199671,"{'penalty': 'elasticnet', 'loss': 'hinge', 'al...",0.551667,0.116068,[ 0.76666667 0.68 0.52 0.35 ...


In [5]:
def matrix_eig_30(data, k = 30):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        curs, vecs = np.linalg.eig(data['X'][i])
        indeces_del = range(curs.size)[(curs.size - k):]
        new_data['X'][i] = np.delete(vecs, indeces_del, axis=1).astype('float')
    return orig_vec(new_data)

def matrix_eig_curs_30(data, k = 30):
    new_data = {}
    new_data['y'] = data['y']
    new_data['dist'] = data['dist']
    new_data['X'] = np.zeros(shape = (data['X'].shape[0], data['X'].shape[1], data['X'].shape[1] - k))
    for i in np.arange(data['X'].shape[0]):
        curs, vecs = np.linalg.eig(data['X'][i])
        indeces_del = range(curs.size)[(curs.size - k):]
        vecs = np.delete(vecs, indeces_del, axis=1)
        curs = np.delete(np.diag(curs), indeces_del, axis=1)
        curs = np.delete(np.diag(curs), indeces_del, axis=0)
        new_data['X'][i] = vecs.dot(np.diag(curs)).astype('float')
    return orig_vec(new_data)

In [36]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

#Only low_rank
weighters = [('origW', Transformer(orig)),
             ('binar', Transformer(binar_norm))
            ]


normalizers = [('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [('origF', Transformer(orig_vec, collect=['X_vec'])),
               ('low_rank_30', Transformer(matrix_eig_30, collect=['X_vec'])),
               ('low_rank_30_curs', Transformer(matrix_eig_curs_30, collect=['X_vec']))]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [('LR', LogisticRegression()),
               #('RF', RandomForestClassifier()),
               ('SVC', SVC()),
               #('XGB', XGBClassifier(nthread=1)),
               ('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax'),
                 ('origW','origN'),
                 ('binar','spectral')
                 ]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree=[0.01] + [0.05*i for i in range(1,21)],
        learning_rate=[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth=[i for i in range(1,12)],
        n_estimators=[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,origW,spectral,origF,var_threshold,minmax,LR
1,UCLAsource,origW,spectral,origF,var_threshold,minmax,SVC
2,UCLAsource,origW,spectral,origF,var_threshold,minmax,SGD
3,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,LR
4,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SVC
5,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SGD
6,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,LR
7,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SVC
8,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SGD
9,UCLAsource,binar,origN,origF,var_threshold,minmax,LR


In [37]:
result = pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Removed previous results file -- results.csv.
Line: 1/18
Line: 2/18
Line: 3/18
Line: 4/18
Line: 5/18
Line: 6/18
Line: 7/18
Line: 8/18
Line: 9/18
Line: 10/18
Line: 11/18
Line: 12/18
Line: 13/18
Line: 14/18
Line: 15/18
Line: 16/18
Line: 17/18
Line: 18/18


In [38]:
result

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,origW,spectral,origF,var_threshold,minmax,LR,0.579433,0.187257,"{'penalty': 'l2', 'C': 0.1, 'max_iter': 50}",0.557667,0.175297,[ 0.56666667 0.56 0.6 0.75 ...
1,UCLAsource,origW,spectral,origF,var_threshold,minmax,SVC,0.594858,0.189911,"{'kernel': 'linear', 'C': 0.005, 'max_iter': 5...",0.567667,0.189602,[ 0.66666667 0.52 0.64 0.8 ...
2,UCLAsource,origW,spectral,origF,var_threshold,minmax,SGD,0.644326,0.137462,"{'penalty': 'elasticnet', 'loss': 'hinge', 'al...",0.609667,0.121413,[ 0.66666667 0.68 0.6 0.85 ...
3,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,LR,0.503369,0.170741,"{'penalty': 'l2', 'C': 0.01, 'max_iter': 50}",0.525333,0.0929301,[ 0.63333333 0.64 0.48 0.5 ...
4,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SVC,0.530674,0.172475,"{'kernel': 'rbf', 'C': 0.0005, 'max_iter': 50,...",0.541,0.107559,[ 0.7 0.64 0.52 0.6 0.4 0.4 0.5 0....
5,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SGD,0.589894,0.200422,"{'penalty': 'elasticnet', 'loss': 'hinge', 'al...",0.551667,0.0936572,[ 0.76666667 0.52 0.48 0.45 ...
6,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,LR,0.614007,0.155101,"{'penalty': 'l1', 'C': 0.8, 'max_iter': 500}",0.582667,0.211133,[ 0.46666667 0.84 0.32 0.55 ...
7,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SVC,0.565248,0.212934,"{'kernel': 'poly', 'C': 0.15000000000000002, '...",0.571,0.0746257,[ 0.7 0.6 0.56 0.55 0.5 0.45 0.55 0....
8,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SGD,0.618262,0.236569,"{'penalty': 'elasticnet', 'loss': 'hinge', 'al...",0.579333,0.160761,[ 0.33333333 0.6 0.56 0.75 ...
9,UCLAsource,binar,origN,origF,var_threshold,minmax,LR,0.553014,0.110393,"{'penalty': 'l1', 'C': 0.5, 'max_iter': 50}",0.495,0.132834,[ 0.5 0.64 0.36 0.5 0.5 0.35 0.75 0....


In [7]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]


weighters = [('origW', Transformer(orig)),
             ('binar', Transformer(binar_norm))
            ]


normalizers = [('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [#('origF', Transformer(orig_vec, collect=['X_vec'])),
               ('low_rank_30', Transformer(matrix_eig_30, collect=['X_vec'])),
               ('low_rank_30_curs', Transformer(matrix_eig_curs_30, collect=['X_vec']))]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [('LR', LogisticRegression()),
               #('RF', RandomForestClassifier()),
               ('SVC', SVC()),
               #('XGB', XGBClassifier(nthread=1)),
               ('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax'),
                 ('origW','origN'),
                 ('binar','spectral')
                 ]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree=[0.01] + [0.05*i for i in range(1,21)],
        learning_rate=[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth=[i for i in range(1,12)],
        n_estimators=[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,LR
1,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SVC
2,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SGD
3,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,LR
4,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SVC
5,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SGD
6,UCLAsource,binar,origN,low_rank_30,var_threshold,minmax,LR
7,UCLAsource,binar,origN,low_rank_30,var_threshold,minmax,SVC
8,UCLAsource,binar,origN,low_rank_30,var_threshold,minmax,SGD
9,UCLAsource,binar,origN,low_rank_30_curs,var_threshold,minmax,LR


In [8]:
result = pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Removed previous results file -- results.csv.
Line: 1/12
Line: 2/12
Line: 3/12
Line: 4/12
Line: 5/12
Line: 6/12
Line: 7/12
Line: 8/12
Line: 9/12
Line: 10/12
Line: 11/12
Line: 12/12


In [9]:
result

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,LR,0.503369,0.170741,"{'penalty': 'l2', 'C': 0.01, 'max_iter': 50}",0.525333,0.0929301,[ 0.63333333 0.64 0.48 0.5 ...
1,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SVC,0.530674,0.172475,"{'kernel': 'rbf', 'C': 0.0005, 'max_iter': 50,...",0.541,0.107559,[ 0.7 0.64 0.52 0.6 0.4 0.4 0.5 0....
2,UCLAsource,origW,spectral,low_rank_30,var_threshold,minmax,SGD,0.583865,0.229359,"{'penalty': 'elasticnet', 'loss': 'hinge', 'al...",0.459667,0.0628835,[ 0.46666667 0.44 0.44 0.55 ...
3,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,LR,0.616844,0.142622,"{'penalty': 'l1', 'C': 0.8500000000000001, 'ma...",0.598,0.192863,[ 0.5 0.72 0.36 0.65 1. 0.3 0.5 0....
4,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SVC,0.565248,0.212934,"{'kernel': 'poly', 'C': 0.15000000000000002, '...",0.571,0.0746257,[ 0.7 0.6 0.56 0.55 0.5 0.45 0.55 0....
5,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,minmax,SGD,0.594149,0.19614,"{'penalty': 'elasticnet', 'loss': 'hinge', 'al...",0.594333,0.0740878,[ 0.63333333 0.56 0.6 0.65 ...
6,UCLAsource,binar,origN,low_rank_30,var_threshold,minmax,LR,0.729965,0.125421,"{'penalty': 'l1', 'C': 0.35000000000000003, 'm...",0.647667,0.111426,[ 0.46666667 0.6 0.56 0.65 ...
7,UCLAsource,binar,origN,low_rank_30,var_threshold,minmax,SVC,0.606738,0.214896,"{'kernel': 'poly', 'C': 0.0005, 'max_iter': 50...",0.629333,0.160678,[ 0.43333333 0.52 0.84 0.8 ...
8,UCLAsource,binar,origN,low_rank_30,var_threshold,minmax,SGD,0.669858,0.147394,"{'penalty': 'elasticnet', 'loss': 'log', 'alph...",0.579667,0.223057,[ 0.76666667 0.68 0.6 0.5 ...
9,UCLAsource,binar,origN,low_rank_30_curs,var_threshold,minmax,LR,0.764716,0.131253,"{'penalty': 'l1', 'C': 1.0, 'max_iter': 100}",0.638667,0.171219,[ 0.56666667 0.48 0.84 0.85 ...


In [10]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

#Only low_rank
weighters = [('origW', Transformer(orig)),
             ('binar', Transformer(binar_norm))
            ]


normalizers = [('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [('origF', Transformer(orig_vec, collect=['X_vec'])),
               ('low_rank_30', Transformer(matrix_eig_30, collect=['X_vec'])),
               ('low_rank_30_curs', Transformer(matrix_eig_curs_30, collect=['X_vec']))]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [#('LR', LogisticRegression()),
               ('RF', RandomForestClassifier()),
               #('SVC', SVC()),
               #('XGB', XGBClassifier(nthread=1)),
               #('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax'),
                 ('origW','origN'),
                 ('binar','spectral')
                 ]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree=[0.01] + [0.05*i for i in range(1,21)],
        learning_rate=[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth=[i for i in range(1,12)],
        n_estimators=[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,origW,spectral,origF,var_threshold,origS,RF
1,UCLAsource,origW,spectral,low_rank_30,var_threshold,origS,RF
2,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,origS,RF
3,UCLAsource,binar,origN,origF,var_threshold,origS,RF
4,UCLAsource,binar,origN,low_rank_30,var_threshold,origS,RF
5,UCLAsource,binar,origN,low_rank_30_curs,var_threshold,origS,RF


In [11]:
result = pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Removed previous results file -- results.csv.
Line: 1/6
Line: 2/6
Line: 3/6
Line: 4/6
Line: 5/6
Line: 6/6


In [12]:
result

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,origW,spectral,origF,var_threshold,origS,RF,0.691401,0.131227,"{'n_estimators': 50, 'max_features': 0.1, 'cri...",0.539333,0.143314,[ 0.53333333 0.64 0.42 0.425 ...
1,UCLAsource,origW,spectral,low_rank_30,var_threshold,origS,RF,0.658333,0.199841,"{'n_estimators': 10, 'max_features': 0.25, 'cr...",0.451167,0.201219,[ 0.61666667 0.44 0.28 0.55 ...
2,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,origS,RF,0.690514,0.18179,"{'n_estimators': 10, 'max_features': 'log2', '...",0.486333,0.137741,[ 0.58333333 0.28 0.4 0.525 ...
3,UCLAsource,binar,origN,origF,var_threshold,origS,RF,0.702216,0.135674,"{'n_estimators': 50, 'max_features': 1.0, 'cri...",0.5595,0.225593,[ 0.9 0.66 0.16 0.55 0.325 0.55 0....
4,UCLAsource,binar,origN,low_rank_30,var_threshold,origS,RF,0.653014,0.202752,"{'n_estimators': 100, 'max_features': 0.001, '...",0.433,0.159251,[ 0.7 0.32 0.56 0.35 0.45 0.35 0.45 0....
5,UCLAsource,binar,origN,low_rank_30_curs,var_threshold,origS,RF,0.67695,0.186559,"{'n_estimators': 50, 'max_features': 0.01, 'cr...",0.565333,0.202096,[ 0.53333333 0.08 0.44 0.55 ...


In [17]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

#Only low_rank
weighters = [('origW', Transformer(orig)),
             #('binar', Transformer(binar_norm))
            ]


normalizers = [('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [('origF', Transformer(orig_vec, collect=['X_vec'])),
               ('low_rank_30', Transformer(matrix_eig_30, collect=['X_vec'])),
               ('low_rank_30_curs', Transformer(matrix_eig_curs_30, collect=['X_vec']))]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [#('LR', LogisticRegression()),
               #('RF', RandomForestClassifier()),
               #('SVC', SVC()),
               ('XGB', XGBClassifier(nthread=1)),
               #('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax'),
                 ('origW','origN'),
                 ('binar','spectral')
                 ]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree= [0.1],
        #[0.01] + [0.05*i for i in range(1,21)],
        learning_rate= [0.1],
        #[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth= [3, 7, 10],
        #[i for i in range(1,12)],
        n_estimators=[10, 100],
        #[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,origW,spectral,origF,var_threshold,origS,XGB
1,UCLAsource,origW,spectral,low_rank_30,var_threshold,origS,XGB
2,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,origS,XGB


In [18]:
result = pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Removed previous results file -- results.csv.
Line: 1/3
Line: 2/3
Line: 3/3


In [19]:
result

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,origW,spectral,origF,var_threshold,origS,XGB,0.715071,0.120388,"{'reg_alpha': 1, 'colsample_bytree': 0.1, 'lea...",0.583667,0.139144,[ 0.76666667 0.48 0.64 0.75 ...
1,UCLAsource,origW,spectral,low_rank_30,var_threshold,origS,XGB,0.578369,0.182775,"{'reg_alpha': 0, 'colsample_bytree': 0.1, 'lea...",0.587667,0.231881,[ 0.86666667 0.4 0.76 0.85 ...
2,UCLAsource,origW,spectral,low_rank_30_curs,var_threshold,origS,XGB,0.630319,0.180899,"{'reg_alpha': 1, 'colsample_bytree': 0.1, 'lea...",0.636333,0.160024,[ 0.73333333 0.32 0.76 0.65 ...


In [20]:
grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

#Only low_rank
weighters = [#('origW', Transformer(orig)),
             ('binar', Transformer(binar_norm))
            ]


normalizers = [('origN', Transformer(orig)),
               ('spectral', Transformer(spectral_norm))
              ]

featurizers = [('origF', Transformer(orig_vec, collect=['X_vec'])),
               ('low_rank_30', Transformer(matrix_eig_30, collect=['X_vec'])),
               ('low_rank_30_curs', Transformer(matrix_eig_curs_30, collect=['X_vec']))]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler()),
           ('origS', FunctionTransformer(orig))]

#For tests, don`t use XGB, it needs a lot of time
classifiers = [#('LR', LogisticRegression()),
               #('RF', RandomForestClassifier()),
               #('SVC', SVC()),
               ('XGB', XGBClassifier(nthread=1)),
               #('SGD', SGDClassifier())
              ]

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

banned_combos = [
                 ('LR', 'origS'),
                 ('SVC', 'origS'),
                 ('SGD', 'origS'),
                 ('RF', 'minmax'),
                 ('XGB', 'minmax'),
                 ('origW','origN'),
                 ('binar','spectral')
                 ]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    ),
    SGD=dict(
        alpha=[0.001, 0.01, 0.1, 0.5, 1.0],
        l1_ratio=[0, 0.2, 0.4, 0.6, 0.8, 1],
        loss=['hinge', 'log', 'modified_huber'],
        n_iter=[50, 100, 200],
        penalty=['elasticnet']
    ),
    SVC=dict(
        C=[0.0005, 0.001, 0.005, 0.01] + [i*0.05 for i in range(1,11)],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100, 150],
    ),
    RF=dict(
        criterion=['entropy', 'gini'],
        max_depth=[3, 5, 7, 10, 20],
        max_features=['log2', 'sqrt'] + [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0],
        n_estimators=[10, 50, 100, 200, 500]
    ),
    XGB=dict(
        colsample_bytree= [0.1],
        #[0.01] + [0.05*i for i in range(1,21)],
        learning_rate= [0.1],
        #[0.01*i for i in range(1,6)] + [0.05*i for i in range(2,11)],
        max_depth= [3, 7, 10],
        #[i for i in range(1,12)],
        n_estimators=[10, 100],
        #[10, 50, 100, 200, 500],
        nthread=[1],
        reg_alpha=[0, 1],
        reg_lambda=[0, 1],
        subsample=[0.5, 0.7, 1]
    )
)

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv, param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,origS,XGB
1,UCLAsource,binar,origN,low_rank_30,var_threshold,origS,XGB
2,UCLAsource,binar,origN,low_rank_30_curs,var_threshold,origS,XGB


In [None]:
result = pipe.get_results('Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'], scoring=['roc_auc'])

Removed previous results file -- results.csv.
Line: 1/3


In [23]:
result

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,binar,origN,origF,var_threshold,origS,XGB,0.64805,0.144437,"{'reg_alpha': 1, 'colsample_bytree': 0.1, 'lea...",0.534,0.104518,[ 0.7 0.52 0.52 0.45 0.5 0.55 0.45 0....
1,UCLAsource,binar,origN,low_rank_30,var_threshold,origS,XGB,0.608511,0.2671,"{'reg_alpha': 1, 'colsample_bytree': 0.1, 'lea...",0.509,0.141524,[ 0.4 0.44 0.4 0.4 0.5 0.8 0.75 0....
2,UCLAsource,binar,origN,low_rank_30_curs,var_threshold,origS,XGB,0.801418,0.129786,"{'reg_alpha': 1, 'colsample_bytree': 0.1, 'lea...",0.672333,0.129572,[ 0.63333333 0.6 0.64 0.75 ...
