In [1]:
from sklearn.preprocessing import FunctionTransformer

from reskit.norms import binar_norm, wbysqdist
from reskit.norms import spectral_norm

#from reskit.features import degrees,  pagerank

from reskit.core import Transformer, Pipeliner

from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier 
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import os
import pandas as pd
import numpy as np

def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below in place of ``warnings.warn`` so that calls made through
    that attribute become no-ops.
    """
    return None
import warnings
warnings.warn = warn

def orig(x):
    """Identity function: return the argument unchanged."""
    return x

import copy



In [2]:
def get_autism(path_to_read='../Data/dti/', distances=True):
    """Load DTI connectivity matrices, diagnosis labels and region distances.

    Parameters
    ----------
    path_to_read : str
        Directory containing the ``*DTI_connectivity*`` and
        ``*DTI_region_xyz_centers*`` files. A trailing path separator is
        optional. Files whose name contains ``'All'`` are ignored.
    distances : bool
        When True, also build one pairwise Euclidean distance matrix per
        ``*DTI_region_xyz_centers*`` file.

    Returns
    -------
    dict
        ``'X'``: array of connectivity matrices (one per connectivity file,
        in sorted file-name order); ``'y'``: array of labels, 1 when the file
        name contains "ASD" and 0 when it contains "TD"; ``'dist'`` (only when
        ``distances`` is True): array of distance matrices.

    Raises
    ------
    ValueError
        If a connectivity file name contains neither "ASD" nor "TD". Without
        this check the matrix would be kept without a label and ``X`` / ``y``
        would silently go out of step.
    """
    def get_autism_distances(loc_name):
        # Parse whitespace-separated integer coordinates, one row per line.
        # `split()` already discards the line terminator, so the last line is
        # read correctly even when the file does not end with a newline.
        with open(loc_name, 'r') as f:
            rows = [np.array(line.split()).astype(int)
                    for line in f if line.strip()]
        return pd.DataFrame(np.array(rows))

    def get_distance_matrix(coords):
        # Pairwise Euclidean distances between the rows of `coords`
        # (DataFrame or array), computed by broadcasting instead of a
        # Python double loop.
        coords = np.asarray(coords, dtype=float)
        diff = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
        return np.sqrt((diff ** 2).sum(axis=-1))

    # Sorting keeps connectivity and coordinate files in the same order.
    all_files = sorted(os.listdir(path_to_read))
    matrix_files = [
        item for item in all_files
        if 'DTI_connectivity' in item and 'All' not in item]
    distance_files = [
        item for item in all_files
        if 'DTI_region_xyz_centers' in item and 'All' not in item]

    target_vector = []  # diagnosis labels
    matrices = []  # connectomes, as numpy arrays (not DataFrames)
    for filename in matrix_files:
        if "ASD" in filename:
            label = 1
        elif "TD" in filename:
            label = 0
        else:
            raise ValueError(
                'Cannot infer diagnosis (ASD/TD) from file name: ' + filename)

        A_dataframe = pd.read_csv(
            os.path.join(path_to_read, filename),
            sep='   ', header=None, engine='python')
        matrices.append(A_dataframe.values)
        target_vector.append(label)

    asd_dict = {}
    asd_dict['X'] = np.array(matrices)
    asd_dict['y'] = np.array(target_vector)
    if distances:
        dist_matrix_list = [
            get_distance_matrix(
                get_autism_distances(os.path.join(path_to_read, item)))
            for item in distance_files]
        asd_dict['dist'] = np.array(dist_matrix_list)

    return asd_dict

In [3]:
import matrix_eig as me

## Best parameters

In [4]:
# Grid-search setup for LogisticRegression on the `me.matrix_eig_*` featurizers.

# CV splitters handed to Pipeliner as grid_cv / eval_cv (different seeds).
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

data = [('UCLAsource', Transformer(get_autism))]
weighters = [('binar', Transformer(binar_norm))]
normalizers = [('origN', Transformer(orig))]

# 'origF' plus one 'low<k>' entry per me.matrix_eig_<k>, k = 0, 20, ..., 260.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec']))] + [
    ('low{}'.format(level),
     Transformer(getattr(me, 'matrix_eig_{}'.format(level)), collect=['X_vec']))
    for level in range(0, 261, 20)]

selectors = [('var_threshold', VarianceThreshold())]
scalers = [('minmax', MinMaxScaler())]

# The grid below includes penalty='l1'; liblinear supports both l1 and l2.
# It was the default solver in older scikit-learn releases, so pinning it
# keeps this cell working where the default has since changed.
classifiers = [('LR', LogisticRegression(solver='liblinear'))]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    )
)

banned_combos = []

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv,
                 param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table


Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR
1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR
2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR
3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR
4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR
5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR
6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR
7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR
8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR
9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR


In [5]:
# Run the planned pipelines on the DTI data, scoring with ROC AUC;
# the results table is written to LR/lr.csv.
result = pipe.get_results(
    '../Data/dti/',
    caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
    scoring=['roc_auc'],
    results_file="LR/lr.csv")

Removed previous results file -- LR/lr.csv.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15


In [7]:
# Reload the results table from LR/lr.csv and display it.
result = pd.read_csv("LR/lr.csv")
result

Unnamed: 0.1,Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR,0.557801,0.111239,"{'penalty': 'l1', 'C': 0.5, 'max_iter': 500}",0.505,0.129016,[ 0.5 0.64 0.36 0.5 0.55 0.4 0.75 0....
1,1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
2,2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR,0.528723,0.154005,"{'penalty': 'l1', 'C': 0.30000000000000004, 'm...",0.506,0.127765,[ 0.5 0.4 0.56 0.45 0.5 0.25 0.7 0....
3,3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR,0.560106,0.171505,"{'penalty': 'l1', 'C': 0.25, 'max_iter': 50}",0.550333,0.173477,[ 0.33333333 0.44 0.68 0.45 ...
4,4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR,0.664184,0.1873,"{'penalty': 'l1', 'C': 0.9, 'max_iter': 500}",0.632333,0.186321,[ 0.63333333 0.48 0.96 0.55 ...
5,5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR,0.655142,0.187667,"{'penalty': 'l1', 'C': 0.9, 'max_iter': 100}",0.576,0.194484,[ 0.6 0.52 0.84 0.5 0.15 0.4 0.8 0....
6,6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR,0.657092,0.155268,"{'penalty': 'l1', 'C': 0.9500000000000001, 'ma...",0.674333,0.140484,[ 0.63333333 0.64 0.92 0.6 ...
7,7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR,0.657624,0.108058,"{'penalty': 'l1', 'C': 0.9500000000000001, 'ma...",0.612667,0.111851,[ 0.56666667 0.6 0.76 0.6 ...
8,8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR,0.656206,0.104037,"{'penalty': 'l1', 'C': 0.9500000000000001, 'ma...",0.679667,0.161723,[ 0.56666667 0.68 1. 0.85 ...
9,9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR,0.63883,0.098162,"{'penalty': 'l1', 'C': 0.8500000000000001, 'ma...",0.658,0.151578,[ 0.5 0.6 0.88 0.7 0.4 0.6 0.9 0....


In [6]:
# Grid-search setup for LogisticRegression on the `me.matrix_vec_*` featurizers.

# CV splitters handed to Pipeliner as grid_cv / eval_cv (different seeds).
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

data = [('UCLAsource', Transformer(get_autism))]
weighters = [('binar', Transformer(binar_norm))]
normalizers = [('origN', Transformer(orig))]

# 'origF' plus one 'low<k>' entry per me.matrix_vec_<k>, k = 0, 20, ..., 260.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec']))] + [
    ('low{}'.format(level),
     Transformer(getattr(me, 'matrix_vec_{}'.format(level)), collect=['X_vec']))
    for level in range(0, 261, 20)]

selectors = [('var_threshold', VarianceThreshold())]
scalers = [('minmax', MinMaxScaler())]

# The grid below includes penalty='l1'; liblinear supports both l1 and l2.
# It was the default solver in older scikit-learn releases, so pinning it
# keeps this cell working where the default has since changed.
classifiers = [('LR', LogisticRegression(solver='liblinear'))]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    )
)

banned_combos = []

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv,
                 param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR
1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR
2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR
3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR
4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR
5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR
6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR
7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR
8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR
9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR


In [7]:
# Run the planned pipelines on the DTI data, scoring with ROC AUC;
# the results table is written to LR/lr_vec_0.csv.
result = pipe.get_results(
    '../Data/dti/',
    caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
    scoring=['roc_auc'],
    results_file="LR/lr_vec_0.csv")

Removed previous results file -- LR/lr_vec_0.csv.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15


In [4]:
# Reload the results table from LR/lr_vec_0.csv and display it.
result = pd.read_csv("LR/lr_vec_0.csv")
result

Unnamed: 0.1,Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR,0.553014,0.108203,"{'penalty': 'l1', 'C': 0.5, 'max_iter': 50}",0.495,0.132834,[ 0.5 0.64 0.36 0.5 0.5 0.35 0.75 0....
1,1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR,0.522872,0.195081,"{'penalty': 'l1', 'C': 0.9500000000000001, 'ma...",0.587667,0.15791,[ 0.46666667 0.4 0.76 0.5 ...
2,2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR,0.583333,0.155437,"{'penalty': 'l1', 'C': 0.45, 'max_iter': 100}",0.436333,0.174588,[ 0.43333333 0.28 0.6 0.2 ...
3,3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR,0.509574,0.19579,"{'penalty': 'l1', 'C': 1.0, 'max_iter': 50}",0.340333,0.159481,[ 0.33333333 0.44 0.48 0.05 ...
4,4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
5,5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR,0.608688,0.167712,"{'penalty': 'l1', 'C': 0.7000000000000001, 'ma...",0.470333,0.211788,[ 0.43333333 0.4 0.52 0.1 ...
6,6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
7,7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR,0.662766,0.132796,"{'penalty': 'l1', 'C': 0.30000000000000004, 'm...",0.666333,0.254652,[ 0.73333333 0.84 0.24 0.85 ...
8,8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
9,9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR,0.511702,0.165932,"{'penalty': 'l1', 'C': 0.2, 'max_iter': 50}",0.423333,0.189244,[ 0.53333333 0.32 0.28 0.4 ...


In [9]:
# Grid-search setup for LogisticRegression on the `me.matrix_old_*` featurizers.

# CV splitters handed to Pipeliner as grid_cv / eval_cv (different seeds).
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

data = [('UCLAsource', Transformer(get_autism))]
weighters = [('binar', Transformer(binar_norm))]
normalizers = [('origN', Transformer(orig))]

# 'origF' plus one 'low<k>' entry per me.matrix_old_<k>, k = 0, 20, ..., 260.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec']))] + [
    ('low{}'.format(level),
     Transformer(getattr(me, 'matrix_old_{}'.format(level)), collect=['X_vec']))
    for level in range(0, 261, 20)]

selectors = [('var_threshold', VarianceThreshold())]
scalers = [('minmax', MinMaxScaler())]

# The grid below includes penalty='l1'; liblinear supports both l1 and l2.
# It was the default solver in older scikit-learn releases, so pinning it
# keeps this cell working where the default has since changed.
classifiers = [('LR', LogisticRegression(solver='liblinear'))]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1', 'l2']
    )
)

banned_combos = []

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv,
                 param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR
1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR
2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR
3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR
4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR
5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR
6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR
7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR
8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR
9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR


In [10]:
# Run the planned pipelines on the DTI data, scoring with ROC AUC;
# the results table is written to LR/lr_old.csv.
result = pipe.get_results(
    '../Data/dti/',
    caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
    scoring=['roc_auc'],
    results_file="LR/lr_old.csv")

Removed previous results file -- LR/lr_old.csv.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15


In [11]:
# Reload the results table from LR/lr_old.csv and display it.
result = pd.read_csv("LR/lr_old.csv")
result

Unnamed: 0.1,Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR,0.557801,0.111239,"{'penalty': 'l1', 'C': 0.5, 'max_iter': 50}",0.505,0.129016,[ 0.5 0.64 0.36 0.5 0.55 0.4 0.75 0....
1,1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
2,2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
3,3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
4,4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
5,5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
6,6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
7,7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
8,8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
9,9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...


In [15]:
# Grid-search setup for LogisticRegression on the `me.matrix_vec_*` featurizers,
# this time with grid_cv seeded at 100 and an l1-only penalty grid.

# CV splitters handed to Pipeliner as grid_cv / eval_cv (different seeds).
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

data = [('UCLAsource', Transformer(get_autism))]
weighters = [('binar', Transformer(binar_norm))]
normalizers = [('origN', Transformer(orig))]

# 'origF' plus one 'low<k>' entry per me.matrix_vec_<k>, k = 0, 20, ..., 260.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec']))] + [
    ('low{}'.format(level),
     Transformer(getattr(me, 'matrix_vec_{}'.format(level)), collect=['X_vec']))
    for level in range(0, 261, 20)]

selectors = [('var_threshold', VarianceThreshold())]
scalers = [('minmax', MinMaxScaler())]

# The grid below uses penalty='l1', which liblinear supports. It was the
# default solver in older scikit-learn releases, so pinning it keeps this
# cell working where the default has since changed.
classifiers = [('LR', LogisticRegression(solver='liblinear'))]

param_grid = dict(
    LR=dict(
        C=[0.01, 0.05, 0.1] + [0.05*i for i in range(3, 21)],
        max_iter=[50, 100, 500],
        penalty=['l1']
    )
)

banned_combos = []

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv,
                 param_grid=param_grid, banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR
1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR
2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR
3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR
4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR
5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR
6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR
7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR
8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR
9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR


In [16]:
# Run the planned pipelines on the DTI data, scoring with ROC AUC;
# the results table is written to LR/lr_100.csv.
result = pipe.get_results(
    '../Data/dti/',
    caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
    scoring=['roc_auc'],
    results_file="LR/lr_100.csv")

No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15


In [23]:
# Reload the results table from LR/lr_100.csv and display it.
result = pd.read_csv("LR/lr_100.csv")
result

Unnamed: 0.1,Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR,0.642199,0.138808,"{'penalty': 'l1', 'C': 0.30000000000000004, 'm...",0.503333,0.142782,[ 0.63333333 0.56 0.24 0.55 ...
1,1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR,0.558333,0.255184,"{'penalty': 'l1', 'C': 1.0, 'max_iter': 50}",0.580333,0.144041,[ 0.53333333 0.4 0.72 0.6 ...
2,2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR,0.532979,0.210566,"{'penalty': 'l1', 'C': 0.9500000000000001, 'ma...",0.509,0.153392,[ 0.4 0.44 0.6 0.4 0.25 0.8 0.45 0....
3,3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR,0.511702,0.149365,"{'penalty': 'l1', 'C': 0.8500000000000001, 'ma...",0.360667,0.189993,[ 0.46666667 0.48 0.56 0.05 ...
4,4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
5,5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR,0.501418,0.213084,"{'penalty': 'l1', 'C': 0.9500000000000001, 'ma...",0.483333,0.195789,[ 0.53333333 0.4 0.6 0.1 ...
6,6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
7,7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR,0.574113,0.174913,"{'penalty': 'l1', 'C': 0.45, 'max_iter': 100}",0.652667,0.212691,[ 0.86666667 0.72 0.24 0.7 ...
8,8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR,0.55461,0.226214,"{'penalty': 'l1', 'C': 0.55, 'max_iter': 50}",0.406333,0.193979,[ 0.53333333 0.6 0.48 0.35 ...
9,9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR,0.5,0.0,"{'penalty': 'l1', 'C': 0.01, 'max_iter': 50}",0.5,0.0,[ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...


## Train

In [30]:
# Training setup: one dedicated LogisticRegression entry per featurizer, each
# with a single fixed hyper-parameter combination.

# CV splitters handed to Pipeliner as grid_cv / eval_cv (different seeds).
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

data = [('UCLAsource', Transformer(get_autism))]
weighters = [('binar', Transformer(binar_norm))]
normalizers = [('origN', Transformer(orig))]

# 'origF' plus one 'low<k>' entry per me.matrix_eig_<k>, k = 0, 20, ..., 260.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec']))] + [
    ('low{}'.format(level),
     Transformer(getattr(me, 'matrix_eig_{}'.format(level)), collect=['X_vec']))
    for level in range(0, 261, 20)]

selectors = [('var_threshold', VarianceThreshold())]
scalers = [('minmax', MinMaxScaler())]

# 'LR_orig' plus 'LR_<k>' for the same k values; every entry gets its own
# estimator instance. liblinear is pinned because the parameters below use
# penalty='l1' (and 'l2'), both of which it supports; it was the default
# solver in older scikit-learn releases.
classifiers = [('LR_orig', LogisticRegression(solver='liblinear'))] + [
    ('LR_{}'.format(level), LogisticRegression(solver='liblinear'))
    for level in range(0, 261, 20)]

# NOTE(review): these values are presumably copied by hand from the
# grid_roc_auc_best_params column of an earlier search -- verify them against
# the results file they are meant to come from.
param_grid = dict(
    LR_orig = dict(penalty= ['l1'], C= [0.5],  max_iter= [50]),
    LR_0    = dict(penalty= ['l1'], C= [0.3],  max_iter= [500]),
    LR_20   = dict(penalty= ['l1'], C= [0.3],  max_iter= [500]),
    LR_40   = dict(penalty= ['l1'], C= [0.25], max_iter= [50]),
    LR_60   = dict(penalty= ['l1'], C= [0.2],  max_iter= [50]),
    LR_80   = dict(penalty= ['l1'], C= [0.9],  max_iter= [500]),
    LR_100  = dict(penalty= ['l1'], C= [0.9],  max_iter= [100]),
    LR_120  = dict(penalty= ['l1'], C= [0.95], max_iter= [50]),
    LR_140  = dict(penalty= ['l1'], C= [0.95], max_iter= [500]),
    LR_160  = dict(penalty= ['l1'], C= [0.85], max_iter= [100]),
    LR_180  = dict(penalty= ['l1'], C= [0.9],  max_iter= [100]),
    LR_200  = dict(penalty= ['l1'], C= [0.4],  max_iter= [100]),
    LR_220  = dict(penalty= ['l1'], C= [0.8],  max_iter= [100]),
    LR_240  = dict(penalty= ['l1'], C= [0.8],  max_iter= [100]),
    LR_260  = dict(penalty= ['l2'], C= [0.3],  max_iter= [50]),
    )

steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

In [5]:
# Start from an empty list of banned (featurizer, classifier) pairs.
banned_combos = list()

In [6]:
# Ban every (featurizer, classifier) pair except the matching ones, so that
# each featurizer is only combined with its own 'LR_*' entry.
# Note: this appends to `banned_combos`; re-running the cell adds duplicates.
k = [20*i for i in range(0, 14)]

# Featurizer names from the loaded results table. `.values` replaces
# DataFrame.as_matrix, which newer pandas releases no longer provide.
col = result[["Featurizers"]].values
col_f = np.delete(np.hstack(col), 0)  # drop the first row ('origF' in the tables above)

# The n-th remaining featurizer keeps only 'LR_<k[n]>'.
for n, fet in enumerate(col_f):
    banned_combos.extend((fet, 'LR_' + str(i)) for i in k if i != k[n])

# 'origF' is never paired with a numbered classifier ...
banned_combos.extend(('origF', 'LR_' + str(i)) for i in k)

# ... and the other featurizers are never paired with 'LR_orig'.
banned_combos.extend((fet, 'LR_orig') for fet in col_f)

In [33]:
# Build the Pipeliner with the ban list applied and display its plan table.
pipe = Pipeliner(
    steps,
    eval_cv=eval_cv,
    grid_cv=grid_cv,
    param_grid=param_grid,
    banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR_orig
1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR_0
2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR_20
3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR_40
4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR_60
5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR_80
6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR_100
7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR_120
8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR_140
9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR_160


In [None]:
# Repeat the evaluation with a different eval_cv seed per epoch; each epoch
# writes its own results file under LR/Ver1.0/.
num_epoch = 100

# NOTE(review): the loop starts at 11 -- presumably earlier epochs were
# produced by a previous run; confirm before re-running from scratch.
for epoch in range(11, num_epoch):

    grid_cv = StratifiedKFold(n_splits=10,
                              shuffle=True,
                              random_state=0)

    # Only the evaluation split changes from epoch to epoch.
    eval_cv = StratifiedKFold(n_splits=10,
                              shuffle=True,
                              random_state=epoch)

    pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv,
                     param_grid=param_grid, banned_combos=banned_combos)

    # Function-call form of print: valid on both Python 2 and Python 3.
    print('#{}'.format(epoch))
    pipe.get_results('../Data/dti/', caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
                 scoring=['roc_auc'], results_file = "LR/Ver1.0/result_" + str(epoch) + ".csv")

#11
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#12
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#13
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#14
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#15
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Li

In [7]:
# Experiment definition for the Pipeliner: CV splitters, the candidate objects
# for every pipeline step, and the per-classifier hyper-parameter grids.

grid_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=0)

eval_cv = StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=1)

# Numeric suffixes shared by the `me.matrix_vec_<t>` featurizers ('low<t>')
# and the classifiers named after them ('LR_<t>'): 0, 20, ..., 260.
_thresholds = list(range(0, 261, 20))

data = [('UCLAsource', Transformer(get_autism))]

weighters = [('binar', Transformer(binar_norm))]

normalizers = [('origN', Transformer(orig))]

# 'origF' plus one 'low<t>' featurizer per threshold, in ascending order.
# Generator expressions are used so the loop variable does not leak into the
# notebook namespace on Python 2.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec']))]
featurizers.extend(
    ('low{}'.format(t),
     Transformer(getattr(me, 'matrix_vec_{}'.format(t)), collect=['X_vec']))
    for t in _thresholds)

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler())]

# One independent LogisticRegression instance per featurizer. The solver is
# pinned to 'liblinear' because param_grid below requests penalty='l1';
# scikit-learn releases whose default solver is 'lbfgs' reject that penalty.
classifiers = [('LR_orig', LogisticRegression(solver='liblinear'))]
classifiers.extend(
    ('LR_{}'.format(t), LogisticRegression(solver='liblinear'))
    for t in _thresholds)

# Single-point grids: each classifier is evaluated with exactly one setting.
param_grid = dict(
    LR_orig = dict(penalty=['l1'], C=[0.5],  max_iter=[50]),
    LR_0    = dict(penalty=['l1'], C=[1.0],  max_iter=[50]),
    LR_20   = dict(penalty=['l1'], C=[0.95], max_iter=[500]),
    LR_40   = dict(penalty=['l1'], C=[0.85], max_iter=[100]),
    LR_60   = dict(penalty=['l1'], C=[0.2],  max_iter=[50]),
    LR_80   = dict(penalty=['l1'], C=[0.95], max_iter=[500]),
    LR_100  = dict(penalty=['l1'], C=[0.9],  max_iter=[100]),
    LR_120  = dict(penalty=['l1'], C=[0.4],  max_iter=[50]),
    LR_140  = dict(penalty=['l1'], C=[0.5],  max_iter=[50]),
    LR_160  = dict(penalty=['l1'], C=[0.2],  max_iter=[50]),
    LR_180  = dict(penalty=['l1'], C=[0.2],  max_iter=[50]),
    LR_200  = dict(penalty=['l1'], C=[0.4],  max_iter=[50]),
    LR_220  = dict(penalty=['l1'], C=[0.4],  max_iter=[50]),
    LR_240  = dict(penalty=['l1'], C=[0.4],  max_iter=[50]),
    LR_260  = dict(penalty=['l2'], C=[0.3],  max_iter=[50]),
    )

# Ordered pipeline steps handed to Pipeliner (defined once; the original cell
# assigned this identical list twice).
steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

In [8]:
# Build the Pipeliner from the step lists and grids defined in the previous
# cell, then display its plan table as the cell output.
pipe = Pipeliner(
    steps,
    eval_cv=eval_cv,
    grid_cv=grid_cv,
    param_grid=param_grid,
    banned_combos=banned_combos
)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,LR_orig
1,UCLAsource,binar,origN,low0,var_threshold,minmax,LR_0
2,UCLAsource,binar,origN,low20,var_threshold,minmax,LR_20
3,UCLAsource,binar,origN,low40,var_threshold,minmax,LR_40
4,UCLAsource,binar,origN,low60,var_threshold,minmax,LR_60
5,UCLAsource,binar,origN,low80,var_threshold,minmax,LR_80
6,UCLAsource,binar,origN,low100,var_threshold,minmax,LR_100
7,UCLAsource,binar,origN,low120,var_threshold,minmax,LR_120
8,UCLAsource,binar,origN,low140,var_threshold,minmax,LR_140
9,UCLAsource,binar,origN,low160,var_threshold,minmax,LR_160


In [None]:
# Re-run the whole plan once per epoch in [10, num_epoch). The epoch number is
# used both as the random seed of eval_cv and as the suffix of the results file.
num_epoch = 50

for epoch in range(10, num_epoch):

    # grid_cv is rebuilt with the same seed (0) on every epoch.
    grid_cv = StratifiedKFold(n_splits=10,
                              shuffle=True,
                              random_state=0)

    # eval_cv is reshuffled per epoch: its seed is the epoch number.
    eval_cv = StratifiedKFold(n_splits=10,
                              shuffle=True,
                              random_state=epoch)

    pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv,
                     param_grid=param_grid, banned_combos=banned_combos)

    # Call form of print: same output on Python 2, and valid syntax on Python 3
    # (matches the print(...) usage in the notebook's first cell).
    print('#{}'.format(epoch))
    pipe.get_results('../Data/dti/',
                     caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
                     scoring=['roc_auc'],
                     results_file="LR/Ver1.1/result_" + str(epoch) + ".csv")

#10
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#11
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#12
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#13
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Line: 15/15
#14
No previous results found.
Line: 1/15
Line: 2/15
Line: 3/15
Line: 4/15
Line: 5/15
Line: 6/15
Line: 7/15
Line: 8/15
Line: 9/15
Line: 10/15
Line: 11/15
Line: 12/15
Line: 13/15
Line: 14/15
Li

In [None]:
# Continue the per-epoch runs for epochs in [50, num_epoch). The epoch number
# is used both as the random seed of eval_cv and as the suffix of the results
# file.
num_epoch = 70

for epoch in range(50, num_epoch):

    # grid_cv is rebuilt with the same seed (0) on every epoch.
    grid_cv = StratifiedKFold(n_splits=10,
                              shuffle=True,
                              random_state=0)

    # eval_cv is reshuffled per epoch: its seed is the epoch number.
    eval_cv = StratifiedKFold(n_splits=10,
                              shuffle=True,
                              random_state=epoch)

    pipe = Pipeliner(steps, eval_cv=eval_cv, grid_cv=grid_cv,
                     param_grid=param_grid, banned_combos=banned_combos)

    # Call form of print: same output on Python 2, and valid syntax on Python 3
    # (matches the print(...) usage in the notebook's first cell).
    print('#{}'.format(epoch))
    pipe.get_results('../Data/dti/',
                     caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
                     scoring=['roc_auc'],
                     results_file="LR/Ver1.1/result_" + str(epoch) + ".csv")