# Libraries & Functions

In [1]:
from sklearn.preprocessing import FunctionTransformer

from reskit.norms import binar_norm, wbysqdist
from reskit.norms import spectral_norm

from reskit.features import degrees,  pagerank

from reskit.core import Transformer, Pipeliner

from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier 
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import os
import pandas as pd
import numpy as np

def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``.

    Accepts any positional and keyword arguments and ignores them all, so
    that installing it in place of ``warnings.warn`` silences every call.
    """
    return None
import warnings
warnings.warn = warn

def orig(x):
    """Identity function: hand the argument back untouched.

    Used as a pass-through step (e.g. the 'origN' normalizer) so that a
    pipeline stage can be present without modifying the data.
    """
    return x

import copy

In [2]:
def get_autism(path_to_read='Data/dti/', distances=True):
    """Load connectivity matrices, diagnosis labels and optional distance matrices.

    Parameters
    ----------
    path_to_read : str
        Directory containing the ``*DTI_connectivity*`` matrix files and the
        ``*DTI_region_xyz_centers*`` coordinate files. Files whose name
        contains ``'All'`` are ignored. A trailing path separator is optional.
    distances : bool
        If true, a Euclidean distance matrix is computed from every
        coordinate file and returned under the ``'dist'`` key.

    Returns
    -------
    dict
        ``'X'``    -- array with one connectivity matrix per matrix file,
        ``'y'``    -- array of labels, 1 when the file name contains "ASD",
                      0 when it contains "TD",
        ``'dist'`` -- (only if ``distances``) array of distance matrices.
        Files are processed in sorted name order.

    Raises
    ------
    ValueError
        If a matrix file name contains neither "ASD" nor "TD"; previously such
        a file was loaded without a label, leaving ``X`` and ``y`` misaligned.
    TypeError
        If the coordinates given to the distance helper are neither a pandas
        DataFrame nor a numpy array.
    """
    def get_autism_distances(loc_name):
        # One row of whitespace-separated integer coordinates per line.
        with open(loc_name, 'r') as f:
            rows = [line.split() for line in f]
        # str.split() already discards the line terminator, so nothing has to
        # be sliced off (slicing truncated a last line without a newline).
        # Blank lines yield empty rows and are dropped.
        coords = np.array([row for row in rows if row]).astype(int)
        return pd.DataFrame(coords)

    def get_distance_matrix(coords):
        if isinstance(coords, pd.DataFrame):
            coords = coords.values
        elif not isinstance(coords, np.ndarray):
            raise TypeError('Provide either pandas df or numpy array!')

        # Broadcast to all pairwise differences, then take the norm over the
        # coordinate axis: symmetric result with a zero diagonal.
        deltas = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
        return np.linalg.norm(deltas, axis=-1)

    all_files = sorted(os.listdir(path_to_read))
    matrix_files = [
        item for item in all_files
        if 'DTI_connectivity' in item and 'All' not in item]
    distance_files = [
        item for item in all_files
        if 'DTI_region_xyz_centers' in item and 'All' not in item]

    target_vector = []  # diagnosis labels
    matrices = []  # connectomes, as plain numpy arrays (not DataFrames)

    # Iterate over the sorted (!) list so matrices and labels stay in step.
    for filename in matrix_files:
        # Resolve the label first so that a matrix is never stored without one.
        if "ASD" in filename:
            label = 1
        elif "TD" in filename:
            label = 0
        else:
            raise ValueError(
                'Cannot infer diagnosis from file name: ' + filename)

        A_dataframe = pd.read_csv(
            os.path.join(path_to_read, filename),
            sep='   ', header=None, engine='python')
        matrices.append(A_dataframe.values)
        target_vector.append(label)

    asd_dict = {}
    asd_dict['X'] = np.array(matrices)
    asd_dict['y'] = np.array(target_vector)

    if distances:
        dist_matrix_list = []
        for item in distance_files:
            cur_coord = get_autism_distances(os.path.join(path_to_read, item))
            dist_matrix_list.append(get_distance_matrix(cur_coord))

        asd_dict['dist'] = np.array(dist_matrix_list)

    return asd_dict

import matrix_eig as me

# Train monsters
## SVC
### Choose the best model for each k

In [4]:
# Two separately seeded, shuffled 10-fold splitters, handed to Pipeliner as
# grid_cv (hyper-parameter search) and eval_cv (evaluation).
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

weighters = [('binar', Transformer(binar_norm))]

normalizers = [('origN', Transformer(orig))]

# Suffixes of the me.matrix_eig_<k> functions to try: 0, 5, 10, 20, ..., 260, 263.
# NOTE(review): the meaning of k is defined in matrix_eig, not in this notebook.
eig_ks = [0, 5] + list(range(10, 261, 10)) + [263]

# 'origF' plus one 'low<k>' featurizer per suffix, generated instead of being
# spelled out line by line; names and order match the hand-written list.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec']))]
featurizers += [
    ('low' + str(rank),
     Transformer(getattr(me, 'matrix_eig_' + str(rank)), collect=['X_vec']))
    for rank in eig_ks
]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler())]

classifiers = [('SVC', SVC())]

# Hyper-parameter grid, keyed by the classifier name used in `classifiers`.
param_grid = dict(
    SVC=dict(
        C=[0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
        degree=[2, 3, 4],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        max_iter=[50, 100],
    )
)

banned_combos = []

# Pipeline stages in execution order (defined once).
steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

pipe = Pipeliner(steps,
                 eval_cv=eval_cv,
                 grid_cv=grid_cv,
                 param_grid=param_grid,
                 banned_combos=banned_combos)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,SVC
1,UCLAsource,binar,origN,low0,var_threshold,minmax,SVC
2,UCLAsource,binar,origN,low5,var_threshold,minmax,SVC
3,UCLAsource,binar,origN,low10,var_threshold,minmax,SVC
4,UCLAsource,binar,origN,low20,var_threshold,minmax,SVC
5,UCLAsource,binar,origN,low30,var_threshold,minmax,SVC
6,UCLAsource,binar,origN,low40,var_threshold,minmax,SVC
7,UCLAsource,binar,origN,low50,var_threshold,minmax,SVC
8,UCLAsource,binar,origN,low60,var_threshold,minmax,SVC
9,UCLAsource,binar,origN,low70,var_threshold,minmax,SVC


In [None]:
# Evaluate every row of the plan and keep the returned table; the outputs of
# the four listed steps are cached, and scores are written to the CSV below.
result = pipe.get_results(
    'Data/dti/',
    caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
    scoring=['roc_auc'],
    results_file="Results/Ver3.2/result_1.csv"
)

No previous results found.
Line: 1/30


In [6]:
# Display the table returned by pipe.get_results (one row per planned pipeline).
result

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers,grid_roc_auc_mean,grid_roc_auc_std,grid_roc_auc_best_params,eval_roc_auc_mean,eval_roc_auc_std,eval_roc_auc_scores
0,UCLAsource,binar,origN,origF,var_threshold,minmax,SVC,0.570745,0.180868,"{'kernel': 'linear', 'C': 0.001, 'max_iter': 1...",0.536,0.109471,[ 0.4 0.68 0.48 0.75 0.45 0.55 0.5 0....
1,UCLAsource,binar,origN,low0,var_threshold,minmax,SVC,0.515248,0.195799,"{'kernel': 'poly', 'C': 0.001, 'max_iter': 50,...",0.538333,0.205276,[ 0.33333333 0.72 0.68 0.5 ...
2,UCLAsource,binar,origN,low5,var_threshold,minmax,SVC,0.506206,0.201152,"{'kernel': 'poly', 'C': 0.001, 'max_iter': 50,...",0.512333,0.202177,[ 0.33333333 0.76 0.68 0.5 ...
3,UCLAsource,binar,origN,low10,var_threshold,minmax,SVC,0.510638,0.212116,"{'kernel': 'poly', 'C': 0.001, 'max_iter': 50,...",0.537333,0.190746,[ 0.33333333 0.76 0.68 0.5 ...
4,UCLAsource,binar,origN,low20,var_threshold,minmax,SVC,0.49734,0.192764,"{'kernel': 'poly', 'C': 0.001, 'max_iter': 50,...",0.531333,0.19338,[ 0.33333333 0.76 0.72 0.5 ...
5,UCLAsource,binar,origN,low30,var_threshold,minmax,SVC,0.488121,0.196966,"{'kernel': 'poly', 'C': 0.001, 'max_iter': 50,...",0.539667,0.220668,[ 0.36666667 0.8 0.68 0.5 ...
6,UCLAsource,binar,origN,low40,var_threshold,minmax,SVC,0.497163,0.199595,"{'kernel': 'poly', 'C': 0.1, 'max_iter': 50, '...",0.511333,0.197102,[ 0.33333333 0.72 0.56 0.45 ...
7,UCLAsource,binar,origN,low50,var_threshold,minmax,SVC,0.470922,0.21075,"{'kernel': 'poly', 'C': 0.001, 'max_iter': 50,...",0.488667,0.196187,[ 0.36666667 0.72 0.6 0.4 ...
8,UCLAsource,binar,origN,low60,var_threshold,minmax,SVC,0.469858,0.209563,"{'kernel': 'poly', 'C': 0.001, 'max_iter': 50,...",0.552,0.204049,[ 0.4 0.8 0.72 0.55 0.35 0.75 0.25 0....
9,UCLAsource,binar,origN,low70,var_threshold,minmax,SVC,0.485638,0.219189,"{'kernel': 'poly', 'C': 0.3, 'max_iter': 50, '...",0.528333,0.211672,[ 0.33333333 0.72 0.68 0.5 ...


In [8]:
# Same splitters and early stages as the grid-search cell, rebuilt so this
# cell can be run on its own.
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

data = [('UCLAsource', Transformer(get_autism))]

weighters = [('binar', Transformer(binar_norm))]

normalizers = [('origN', Transformer(orig))]

# Only the featurizers that are evaluated repeatedly further below.
featurizers = [('origF', Transformer(me.orig_vec, collect=['X_vec'])),
               ('low250', Transformer(me.matrix_eig_250, collect=['X_vec'])),
               ('low260', Transformer(me.matrix_eig_260, collect=['X_vec'])),
               ('low263', Transformer(me.matrix_eig_263, collect=['X_vec']))
              ]

selectors = [('var_threshold', VarianceThreshold())]

scalers = [('minmax', MinMaxScaler())]

# One separately named SVC per featurizer, so each can carry its own
# single-point parameter grid.
classifiers = [('SVC_orig', SVC()),
               ('SVC_250', SVC()),
               ('SVC_260', SVC()),
               ('SVC_263', SVC()),
              ]

# Fixed hyper-parameters per classifier (each list has exactly one value).
# NOTE(review): presumably the best parameters found by the grid search above
# -- confirm against the saved results file.
param_grid = dict(
    SVC_orig = dict(kernel= ['linear'], C= [0.001], max_iter= [100], degree= [2]),
    SVC_250  = dict(kernel= ['poly'], C= [0.1], max_iter= [50], degree= [4]),
    SVC_260  = dict(kernel= ['linear'], C= [0.3], max_iter= [100], degree= [2]),
    SVC_263  = dict(kernel= ['poly'], C= [0.1], max_iter= [50], degree= [3])
    )

banned_combos = []

# Pipeline stages in execution order (defined once; the original cell built
# this identical list twice).
steps = [('Data', data),
         ('Weighters', weighters),
         ('Normalizers', normalizers),
         ('Featurizers', featurizers),
         ('Selectors', selectors),
         ('Scalers', scalers),
         ('Classifiers', classifiers)]

In [9]:
# Build the list of forbidden (featurizer, classifier) pairs so that only
# 'origF' + 'SVC_orig' and 'low<k>' + 'SVC_<k>' remain allowed.
#
# The featurizer names are derived from `k` itself rather than read back from
# the `result` table: this drops the dependency on the long-running
# grid-search cell and on `DataFrame.as_matrix` (removed in pandas 1.0), and
# the list is rebuilt from scratch so re-running the cell does not keep
# appending duplicates.
k = [0, 5] + [10 * i for i in range(1, 27)] + [263]
col_f = ['low' + str(i) for i in k]

banned_combos = []

# Each 'low<k>' featurizer may not be combined with an SVC tuned for another k.
for fet, own_k in zip(col_f, k):
    for i in k:
        if i != own_k:
            banned_combos.append((fet, 'SVC_' + str(i)))

# The untouched features may not be combined with any per-k SVC ...
for i in k:
    banned_combos.append(('origF', 'SVC_' + str(i)))

# ... and no 'low<k>' featurizer may use the SVC tuned for the original features.
for fet in col_f:
    banned_combos.append((fet, 'SVC_orig'))

In [10]:
# Rebuild the pipeliner with the banned combinations in place and show the
# resulting plan.
pipe = Pipeliner(
    steps,
    eval_cv=eval_cv,
    grid_cv=grid_cv,
    param_grid=param_grid,
    banned_combos=banned_combos
)
pipe.plan_table

Unnamed: 0,Data,Weighters,Normalizers,Featurizers,Selectors,Scalers,Classifiers
0,UCLAsource,binar,origN,origF,var_threshold,minmax,SVC_orig
1,UCLAsource,binar,origN,low250,var_threshold,minmax,SVC_250
2,UCLAsource,binar,origN,low260,var_threshold,minmax,SVC_260
3,UCLAsource,binar,origN,low263,var_threshold,minmax,SVC_263


In [11]:
num_epoch = 30

# Repeat the evaluation with a different eval_cv seed per epoch, one results
# file per epoch. The range starts at 2 because eval seed 1 and result_1.csv
# are already used by the grid-search run above; the last epoch is
# num_epoch - 1.
for epoch in range(2, num_epoch):
    # The grid-search splitter keeps seed 0; only the evaluation split varies.
    grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    eval_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=epoch)

    pipe = Pipeliner(
        steps,
        eval_cv=eval_cv,
        grid_cv=grid_cv,
        param_grid=param_grid,
        banned_combos=banned_combos
    )

    pipe.get_results(
        'Data/dti/',
        caching_steps=['Data', 'Weighters', 'Normalizers', 'Featurizers'],
        scoring=['roc_auc'],
        results_file="Results/Ver3.2/result_" + str(epoch) + ".csv"
    )

No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: 4/4
No previous results found.
Line: 1/4
Line: 2/4
Line: 3/4
Line: