In [1]:
import numpy as np
import pandas as pd
from copy import copy
from pathlib import Path
from itertools import chain
from tqdm.notebook import tqdm

import data_tools.graphs as gt
from hetnet_ml.extractor import MatrixFormattedGraph, piecewise_extraction

  from tqdm.autonotebook import tqdm


In [2]:
# Load the node and edge tables written by the 11a holdout-set prep step; every column is read as a string.
nodes = pd.read_csv('../2_pipeline/11a_Model_Prep_Holdout_Set/out/nodes.csv', dtype=str)
edges = pd.read_csv('../2_pipeline/11a_Model_Prep_Holdout_Set/out/edges.csv', dtype=str)

# Build the matrix-formatted graph used for all DWPC extraction below.
# NOTE(review): presumably 'ChemicalSubstance' / 'Disease' are the start / end node kinds, max_length the
# longest metapath considered and w the damping exponent -- confirm against hetnet_ml.
mg = MatrixFormattedGraph(nodes, edges, 'ChemicalSubstance', 'Disease', max_length=4, w=0.4, n_jobs=30)

Processing node and edge data...
Initializing metagraph...
Generating adjacency matrices...


100%|██████████| 96/96 [02:02<00:00,  1.28s/it]



Determining degrees for each node and metaedge


100%|██████████| 96/96 [00:30<00:00,  3.18it/s]



Weighting matrices by degree with dampening factor 0.4...


100%|██████████| 96/96 [00:02<00:00, 42.02it/s]


## Strange behavior

For whatever reason, the imports below result in the above initialization of `mg` not finishing, so we've loaded `mg` first and will then continue with the imports.

In [3]:
from scipy.sparse import issparse, csc_matrix, csr_matrix

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel

from xgboost import XGBClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Data Transformer

We use an inverse hyperbolic sine transformation to transform the features.

The transformation is thus:

$$\sinh^{-1} \left( \frac{X_{mp}}{\sigma_{mp}} \right)$$

where $X_{mp}$ is the column in the DWPC feature matrix $X$ corresponding to metapath $mp$ and $\sigma_{mp}$ is the mean of that column.

In [4]:
class MeanScaledArcsinhTransformer(TransformerMixin):
    """Scale every column by its mean, then apply the inverse hyperbolic sine.

    ``fit`` learns the per-column mean of ``X`` (sparse matrix, ndarray or
    DataFrame) and stores it in ``initial_mean_``; columns whose mean is exactly
    zero get a divisor of 1 so they pass through unchanged instead of producing
    a division by zero. ``transform`` returns ``arcsinh(X / initial_mean_)``.
    """

    def fit(self, X, y=None):
        """Learn the column means of ``X``; ``y`` is ignored. Returns ``self``."""
        if issparse(X):
            # The sparse mean comes back 2-D (1 x n_features); flatten it to 1-D.
            col_means = np.asarray(X.tocoo().tocsc().mean(axis=0)).ravel()
        else:
            # np.asarray also unwraps the Series produced by a DataFrame input.
            col_means = np.asarray(X.mean(axis=0))

        # Work on a private copy so the zero replacement below can never write
        # into an array that is shared with (or owned read-only by) the input.
        col_means = np.array(col_means)

        # If the initial mean == 0, likely all values were zero;
        # dividing by 1 instead prevents issues later.
        col_means[col_means == 0.0] = 1

        self.initial_mean_ = col_means
        return self

    def transform(self, X, y=None):
        """Return ``arcsinh(X / initial_mean_)``; sparse input stays sparse."""
        if issparse(X):
            # Multiply by the reciprocal: element-wise division would densify.
            return np.arcsinh(X.tocoo().tocsc().multiply(1.0 / self.initial_mean_))
        return np.arcsinh(X / self.initial_mean_)


# Homegrown Feature Selector

Runs 6 analyses on the training data to select features.

1. Correlation to the output
2. Chi-squared test
3. Recursive Feature Elimination on an L2-regularized (ridge) logistic regression
4. Embedded Feature Selection from an L1-regularized (lasso) logistic regression
5. Embedded Feature Selection from a Random Forest Classifier
6. Embedded Feature Selection from a Gradient Boosting Classifier

Each analysis will select the `num_feats` best features. The selected features will then be chosen via a voting method, with `min_selections` out of the 6 analyses required for a feature to be kept.


We have also added an option for `always_keep`:  This allows for domain expertise to be factored into the feature selection process.  In our case, we know some metapaths are specifically mechanistic, so we want to include those wherever possible

In [5]:
def cor_selector(X, y, feature_names, num_feats):
    """Vote for the features most strongly (Pearson-)correlated with ``y``.

    Parameters
    ----------
    X : ndarray or scipy sparse matrix, indexable as ``X[:, i]``
    y : array-like target, one value per row of ``X``
    feature_names : sequence with one name per column of ``X``
    num_feats : number of top features (by absolute correlation) to select

    Returns
    -------
    list of bool, one per entry of ``feature_names``; True when selected.
    Columns with an undefined correlation (e.g. constant columns) score 0.
    """
    # A non-positive request selects nothing; without this guard the slice
    # ``[-0:]`` below would silently select *every* feature.
    if num_feats <= 0:
        return [False] * len(feature_names)

    sparse_input = issparse(X)
    n_rows = len(y)
    correlations = []
    # Constant columns have zero variance, which makes corrcoef divide by zero;
    # the resulting NaN is handled below, so the warnings are only noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(X.shape[1]):
            if sparse_input:
                column = X[:, i].toarray().reshape(n_rows)
            else:
                column = X[:, i]
            correlations.append(np.corrcoef(column, y)[0, 1])

    # replace NaN with 0
    correlations = np.array(correlations, dtype=float)
    correlations = np.where(np.isnan(correlations), 0, correlations)

    # Names of the num_feats features with the largest absolute correlation
    top_idx = np.argsort(np.abs(correlations))[-num_feats:].tolist()
    top_names = set(np.array(feature_names)[top_idx].tolist())

    # Boolean support mask in feature_names order (set lookup keeps this linear)
    return [name in top_names for name in feature_names]

def chi2_selector(X, y, num_feats):
    """Return the boolean support mask of the ``num_feats`` best features by chi-squared score."""
    return SelectKBest(score_func=chi2, k=num_feats).fit(X, y).get_support()

def rfe_selector(X, y, num_feats, random_state=None):
    """Return the support mask from recursive feature elimination.

    A liblinear ``LogisticRegression`` (C=0.1) is refit repeatedly, with
    ``step=.2`` controlling how many features are dropped per round, until
    ``num_feats`` features remain. ``verbose=5`` prints progress per round.
    """
    base_model = LogisticRegression(C=.1, solver='liblinear', random_state=random_state)
    eliminator = RFE(estimator=base_model, n_features_to_select=num_feats, step=.2, verbose=5)
    return eliminator.fit(X, y).get_support()

def embeded_lr_selector(X, y, num_feats, random_state=None):
    """Return the support mask chosen by an L1-penalised logistic regression.

    ``SelectFromModel`` is capped at ``max_features=num_feats``.
    """
    l1_model = LogisticRegression(penalty="l1", solver='liblinear', random_state=random_state)
    picker = SelectFromModel(l1_model, max_features=num_feats)
    return picker.fit(X, y).get_support()

def embeded_rf_selector(X, y, num_feats, n_jobs, random_state=None):
    """Return the support mask chosen from a random forest's feature importances.

    The forest uses 100 trees of depth <= 50; ``SelectFromModel`` is capped at
    ``max_features=num_feats``.
    """
    forest = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=n_jobs, random_state=random_state)
    picker = SelectFromModel(forest, max_features=num_feats)
    return picker.fit(X, y).get_support()

def embeded_xgb_selector(X, y, num_feats, n_jobs=1, random_state=None):
    """Return the support mask chosen from a gradient-boosting model's importances.

    ``SelectFromModel`` is capped at ``max_features=num_feats``.
    """
    # XGBoost takes 0 as its default random state, so map None onto it
    seed = 0 if random_state is None else random_state
    # Parameters optimized for speed rather than accuracy (5 other estimators also provide votes)
    booster = XGBClassifier(max_depth=5, n_estimators=200, learning_rate=.16, min_child_weight=1,
                            colsample_bytree=.8, n_jobs=n_jobs, random_state=seed)
    picker = SelectFromModel(booster, max_features=num_feats)
    return picker.fit(X, y).get_support()


class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select features by majority vote across six selection methods.

    Each method (Pearson correlation, chi-squared, RFE, L1 logistic regression,
    random forest, XGBoost) nominates features; a feature is kept when at least
    ``min_selections`` methods nominated it, or when it is listed in
    ``always_keep``.

    Parameters
    ----------
    num_features : number of features each method is asked to nominate
    min_selections : minimum number of votes needed to keep a feature
    n_jobs : parallelism passed to the random-forest and XGBoost selectors
    feature_names : one name per column of ``X`` (required by ``fit``)
    always_keep : optional iterable of feature names kept regardless of votes
    random_state : seed forwarded to the model-based selectors

    Attributes set by ``fit``
    -------------------------
    feature_selection_df_ : one row per feature, one boolean column per method,
        plus ``total`` (the vote count)
    keep_features_ : kept feature names, in ``feature_names`` order
    """

    def __init__(self, num_features=100, min_selections=4, n_jobs=1, feature_names=None, always_keep=None,
                 random_state=None):
        self.num_features = num_features
        self.min_selections = min_selections
        self.n_jobs = n_jobs
        self.feature_names = feature_names
        self.always_keep = always_keep
        self.random_state = random_state

    def fit(self, X, y=None):
        """Run the six selectors on ``(X, y)`` and record which features to keep."""
        if self.feature_names is None:
            raise ValueError('feature_names must be provided: one name per column of X')
        feature_names = list(self.feature_names)
        if len(feature_names) != X.shape[1]:
            raise ValueError('feature_names has {} entries but X has {} columns'.format(
                len(feature_names), X.shape[1]))

        # chi2 / RFE / L1-LR are run on max-abs scaled data; the correlation
        # and tree-based selectors are run on the data as given.
        X_norm = MaxAbsScaler().fit_transform(X)
        if issparse(X):
            # Column-oriented storage for the per-column access done by the selectors
            X = X.tocsc()
            X_norm = X_norm.tocsc()

        print('Running Cor')
        cor_support = cor_selector(X, y, feature_names, self.num_features)
        print('Running Chi2')
        chi_support = chi2_selector(X_norm, y, self.num_features)
        print('Running RFE')
        rfe_support = rfe_selector(X_norm, y, self.num_features, self.random_state)
        print('Running LR')
        embeded_lr_support = embeded_lr_selector(X_norm, y, self.num_features, self.random_state)
        print('Running RF')
        embeded_rf_support = embeded_rf_selector(X, y, self.num_features,
                                                 n_jobs=self.n_jobs, random_state=self.random_state)
        print('Running XG')
        embeded_xgb_support = embeded_xgb_selector(X, y, self.num_features,
                                                   n_jobs=self.n_jobs, random_state=self.random_state)

        feature_selection_df = pd.DataFrame({'feature': feature_names, 'pearson': cor_support,
                                             'chi_2': chi_support, 'rfe': rfe_support,
                                             'logistics': embeded_lr_support,
                                             'random_forest': embeded_rf_support,
                                             'xgboost': embeded_xgb_support})

        # Count votes over the boolean columns only: summing across the string
        # 'feature' column relies on pandas silently dropping it, which recent
        # pandas versions no longer do.
        vote_columns = [col for col in feature_selection_df.columns if col != 'feature']
        feature_selection_df['total'] = feature_selection_df[vote_columns].sum(axis=1)
        self.feature_selection_df_ = feature_selection_df

        enough_votes = feature_selection_df['total'] >= self.min_selections
        keep_features = set(feature_selection_df.loc[enough_votes, 'feature'])

        # Keep the features that we always want (e.g. domain expertise)
        if self.always_keep is not None:
            keep_features.update(self.always_keep)

        # Preserve the original column order; names unknown to feature_names are ignored
        self.keep_features_ = [f for f in feature_names if f in keep_features]

        return self

    def transform(self, X, y=None):
        """Return only the columns of ``X`` whose names are in ``keep_features_``."""
        if issparse(X) and not isinstance(X, csc_matrix):
            X = X.tocsc()
        kept = set(self.keep_features_)
        return X[:, [i for i, f in enumerate(self.feature_names) if f in kept]]

In [6]:
def sparse_std(data, axis=1, ddof=1):
    """Take the standard deviation of a sparse matrix, one vector at a time.

    Parameters
    ----------
    data : scipy sparse matrix
    axis : 1 (default) returns one value per *column*; 0 returns one value per
        *row*. Note this is the axis that is iterated over, which is the
        opposite of numpy's ``std(axis=...)`` convention.
    ddof : delta degrees of freedom passed to ``ndarray.std`` (default 1, the
        sample standard deviation)

    Returns
    -------
    1-D ndarray of standard deviations.

    Raises
    ------
    ValueError if ``axis`` is not 0 or 1.
    """
    if axis not in (0, 1):
        raise ValueError('axis must be 0 or 1, got {!r}'.format(axis))

    # Use the storage format that makes slicing along the iterated axis cheap
    if axis == 1:
        data = data.tocsc()
        vectors = (data[:, i] for i in range(data.shape[1]))
    else:
        data = data.tocsr()
        vectors = (data[i, :] for i in range(data.shape[0]))

    # Only one vector is densified at a time, so memory stays bounded
    return np.array([vec.toarray().std(ddof=ddof) for vec in vectors])

In [7]:
def get_model_coefs(model, X, f_names):
    """Helper function to quickly return the model coefs and corresponding feature names.

    Parameters
    ----------
    model : fitted linear model exposing ``coef_`` and ``intercept_``
    X : feature matrix the coefficients refer to (DataFrame, ndarray or sparse)
    f_names : one name per feature / coefficient

    Returns
    -------
    DataFrame with columns ``feature``, ``coef`` and ``zcoef``; the first row is
    the intercept. ``zcoef`` is the coefficient multiplied by the feature's
    standard deviation, and the z-intercept is the intercept plus the sum of
    ``coef * feature mean``.
    """
    # Ensure we have a numpy array for the features
    if isinstance(X, pd.DataFrame):
        X = X.values

    # Grab the coefficients. Some models return a 2-D array, others a 1-D one;
    # for 2-D only the first row is used.
    coef = np.asarray(model.coef_)
    if coef.ndim > 1:
        coef = coef[0]

    # Insert the intercept. The names are assembled as Python objects: np.insert
    # would cast 'intercept' to the fixed-width dtype of the name array and
    # truncate it whenever the feature names are shorter than nine characters.
    coef = np.insert(coef, 0, model.intercept_)
    names = np.array(['intercept'] + list(f_names), dtype=object)

    # Calculate z-score scaled coefficients based on the features
    if issparse(X):
        X = X.tocsc()
        feat_mean = np.asarray(X.mean(axis=0)).ravel()
        # NOTE(review): sparse_std defaults to the sample std (ddof=1) while the
        # dense branch below uses ndarray.std (ddof=0) -- confirm which is intended.
        feat_std = sparse_std(X, axis=1)
    else:
        feat_mean = X.mean(axis=0)
        feat_std = X.std(axis=0)

    z_intercept = coef[0] + sum(coef[1:] * feat_mean)
    z_coef = np.insert(coef[1:] * feat_std, 0, z_intercept)

    # Return
    return pd.DataFrame([names, coef, z_coef]).T.rename(columns={0:'feature', 1:'coef', 2:'zcoef'})

In [8]:
float_size = 32 # bits
bits_per_gb = 8 * 1024**3

def print_mem_info(n_comp, n_dis, n_mps):
    """Print the size of a dense compound-disease x metapath matrix of 32-bit floats.

    Reports the number of compound-disease pairs, the total number of matrix
    values, and the memory (in GB) for the whole matrix and for one metapath.
    """
    n_pairs = n_comp * n_dis
    n_values = n_pairs * n_mps
    total_gb = n_values * float_size / bits_per_gb
    per_mp_gb = n_pairs * float_size / bits_per_gb

    print("{:,} Compounds * {:,} Diseases = {:,} C-D Pairs".format(n_comp, n_dis, n_pairs))
    print("{:,} C-D Pairs * {:,} Metapaths = {:,} Matrix Values".format(n_pairs, n_mps, n_values))
    print('{:1,.1f} GB of matrix values'.format(total_gb))
    print('{:1,.3f} GB per metapath'.format(per_mp_gb))

In [9]:
# Fraction of the treating compounds to keep, and the base random seed for this notebook
train_frac = 0.15
rs = 20200123


# Compounds that are the start of at least one treats_CtD edge
treat_comps = set(edges.query('type == "treats_CtD"')['start_id'])
# Non-treating ChemicalSubstance nodes: keep a much smaller random fraction (train_frac * 0.01)
keep_comps = set(nodes.query('id not in @treat_comps and label == "ChemicalSubstance"')
                      .sample(frac=train_frac*.01, random_state=rs)['id'])
# Treating compounds: keep a random train_frac of them and add them to the kept set
keep_comps = keep_comps | set(nodes.query('id in @treat_comps')
                                   .sample(frac=train_frac, random_state=rs+1)['id'])

# Diseases that are the end of at least one treats_CtD edge
treat_dis = set(edges.query('type == "treats_CtD"')['end_id'])
# Diseases with no treats_CtD edge: keep a small random fraction (train_frac * 0.01)
keep_dis = set(nodes.query('label == "Disease" and id not in @treat_dis')
                    .sample(frac=train_frac*.01, random_state=rs+2)['id'])
# Add every disease treated by one of the kept compounds
keep_dis = keep_dis | set(edges.query('type == "treats_CtD" and start_id in @keep_comps')['end_id'])

In [10]:
# Per-metapath pair counts written by the 11b metapath-membership analysis step
all_mp_counts = pd.read_csv('../2_pipeline/11b_Model_Prep_Metapath_Membership_Analysis/out/all_mp_counts.csv')
all_mp_counts.head(2)

Unnamed: 0,mp,pair_count,subset,frac,sim_mp
0,CaAawD,19950,all_pairs,4e-05,False
1,CaAsoD,0,all_pairs,0.0,False


In [11]:
# Metapaths not flagged as sim_mp
non_sim_names = all_mp_counts.query('sim_mp == False')['mp'].unique().tolist()
# Of those, keep the rows of the "all_pairs" subset that have at least one pair
mp_qr = all_mp_counts.query('subset == "all_pairs" and mp in @non_sim_names and pair_count > 0')
# Names of the metapaths that will be extracted
good_mps = mp_qr['mp'].tolist()

In [12]:
# Size of the dense feature matrix for the selected compounds, diseases and metapaths
print_mem_info(len(keep_comps), len(keep_dis), len(good_mps))

1,292 Compounds * 2,903 Diseases = 3,750,676 C-D Pairs
3,750,676 C-D Pairs * 7,303 Metapaths = 27,391,186,828 Matrix Values
102.0 GB of matrix values
0.014 GB per metapath


In [13]:
# Count the treats_CtD edges whose start compound was kept
print('{:,} Positive training examples in subset'.format(len(edges.query('start_id in @keep_comps and type == "treats_CtD"'))))

8,305 Positive training examples in subset


### Bring in the known metapaths found in DrugMechDB

In [14]:
# Manually curated metapath list: one name per line, no header
mech_mps = pd.read_csv('../0_data/manual/mech_mps.txt', header=None)[0].values
# Keep only those that are present in the "all_pairs" subset with pairs and are not sim_mp
dmdb_feat = set(all_mp_counts.query('mp in @mech_mps and subset == "all_pairs" and pair_count > 0 and sim_mp == False')['mp'])

len(dmdb_feat)

34

# Extract the features one time

Use the pair counts to sort the metapath extraction as a naive load balancer.

In [15]:
def sort_mps_for_pw_extraction(n_big_calcs, big_per_block, mp_list, frac_info):
    """Order metapaths so the most expensive ones are spread evenly across blocks.

    The ``n_big_calcs`` metapaths of ``mp_list`` with the largest ``frac`` are
    treated as "big". The output interleaves groups of ``big_per_block`` big
    metapaths with equally sized runs of the remaining metapaths; anything left
    over (big or not) is appended at the end, so the result is always a
    permutation of the unique entries of ``mp_list``. The ordering is
    deterministic and follows ``mp_list``.

    Parameters
    ----------
    n_big_calcs : number of highest-``frac`` metapaths to treat as big
    big_per_block : number of big metapaths placed at the head of each block
    mp_list : metapath names to order
    frac_info : DataFrame with columns ``mp`` and ``frac``

    Returns
    -------
    (ordered_metapaths, block_size) where ``block_size`` is the number of
    non-big metapaths per block (at least 1).

    Raises
    ------
    ValueError if ``big_per_block`` is less than 1.
    """
    if big_per_block < 1:
        raise ValueError('big_per_block must be at least 1, got {!r}'.format(big_per_block))

    # De-duplicate while keeping the caller's order (sets would make the
    # output order vary from run to run).
    mp_list = list(dict.fromkeys(mp_list))
    wanted = set(mp_list)

    # Largest-frac metapaths first, restricted to the ones actually requested
    ranked = frac_info.sort_values('frac', ascending=False)['mp'].tolist()
    big_mp = [mp for mp in dict.fromkeys(ranked) if mp in wanted][:max(n_big_calcs, 0)]
    big_set = set(big_mp)
    other_mp = [mp for mp in mp_list if mp not in big_set]

    # One block per full group of big metapaths (at least one block, so a
    # short big list no longer divides by zero).
    n_blocks = max(1, len(big_mp) // big_per_block)
    block_size = max(1, len(other_mp) // n_blocks)

    out = []
    for i in range(n_blocks):
        out += big_mp[i * big_per_block:(i + 1) * big_per_block]
        out += other_mp[i * block_size:(i + 1) * block_size]

    # Leftovers that did not fill a complete block
    out += big_mp[n_blocks * big_per_block:]
    out += other_mp[n_blocks * block_size:]

    return out, block_size

In [16]:
# Timing notes for (n_big_calcs, big_per_block):
# 30, 5 finishes < 30 min.
# 40, 8 finishes 18min 46s
# Order the metapaths for extraction; block_size is reused as the piecewise-extraction block size below
to_xtract, block_size = sort_mps_for_pw_extraction(40, 8, good_mps, mp_qr)

In [17]:
# Number of metapaths to extract and the block size chosen above
len(to_xtract), block_size

(7303, 1452)

In [18]:
%%time
## Extract the metapaths to do some prep work
# Result: `pairs` (one row per compound-disease pair), `feats` (feature names) and the sparse
# feature matrix `test_dwpc`, computed with the w that mg was initialised with.
# NOTE(review): presumably piecewise_extraction runs mg.extract_dwpc on chunks of `block_size`
# metapaths and joins the chunks along axis 1 -- confirm against hetnet_ml.
(pairs, feats), test_dwpc = piecewise_extraction(function=mg.extract_dwpc, 
                                 to_split='metapaths', block_size=block_size,
                                 axis=1,
                                 metapaths=to_xtract, 
                                 start_nodes=list(keep_comps), 
                                 end_nodes=list(keep_dis),
                                 return_sparse=True,
                                 sparse_df=False,
                                 n_jobs=30)

100%|██████████| 6/6 [19:57<00:00, 199.58s/it]


CPU times: user 5min 21s, sys: 1min 3s, total: 6min 25s
Wall time: 19min 59s


# Training Targets for feature selection

In [19]:
# (compound, disease) tuples for every known treats_CtD edge
pos_tups = edges.query('type == "treats_CtD"')[['start_id', 'end_id']].apply(tuple, axis=1).tolist()
pos_tups[:2]

[('CHEBI:100147', 'DOID:0050400'), ('CHEBI:100147', 'DOID:13148')]

In [20]:
# Convert to a set so the membership tests in the labelling step are constant-time
pos_tups = set(pos_tups)

In [21]:
# Label every extracted compound-disease pair: 1 when it is a known treats_CtD edge, 0 otherwise.
# A direct membership test replaces building and intersecting a one-element set per row.
y = np.array([
    1 if (row.chemicalsubstance_id, row.disease_id) in pos_tups else 0
    for row in tqdm(pairs.itertuples(), total=len(pairs))
])

# Sanity check: one label per pair, plus the number of positives
print(len(pairs), len(y), y.sum())

HBox(children=(FloatProgress(value=0.0, max=3750676.0), HTML(value='')))


3750676 3750676 8305


In [22]:
# Attach the labels to the pair table (this adds a column to `pairs` in place)
pairs['status'] = y

# Subsample the potential training examples

Our classes are so imbalanced that, to get a sizeable number of positive training examples, we end up with many orders of magnitude more negative examples. Many of those examples will have no connections from the compound to the disease of interest, thus providing a zero row in the matrix. We will not waste time training on those values, and instead focus on the ones that distinguish the positive from the negative training examples.

As this is just hyperparameter tuning, to speed things up we will also limit the negative examples to a small portion of those available: 100x (2 orders of magnitude) the number of positives.

In [23]:
# Get the rows that are Non-zero
nz_index = pairs[test_dwpc.getnnz(1)>0].index

# have the number of postivies to get 100x this for the negatives.
n_pos = pairs['status'].sum()

# Sample the nonzero negative examples at a rate of 100x the positive samples
neg_index = pairs.loc[nz_index].query('status == 0').sample(n=100*n_pos, random_state=rs+10).sort_index().index

# and of course take the training postivies
pos_index = pairs.query('status == 1').index

# Union the two
train_index = pos_index.union(neg_index)

In [24]:
# Boolean mask of the feature columns with at least one non-zero value, and their names
feats = np.array(feats)
feat_index = test_dwpc.getnnz(0) > 0
nz_feats = feats[feat_index]

In [25]:
# Get our compounds as ndarrays for easy indexing with sklearns StratifiedKFold
keep_comps = np.array(list(keep_comps))
# Need to know how to properly stratify the split
is_treat_comp = np.array([1 if c in treat_comps else 0 for c in keep_comps])

## Select the features

This is a time-consuming and costly step. We will do it once with the initial DWPC for parameter tuning, and again at the end with the selected parameters.

In [26]:
%%time
msat = MeanScaledArcsinhTransformer()
trans_dwpc = msat.fit_transform(test_dwpc[train_index][:, feat_index])

CPU times: user 43 s, sys: 10.9 s, total: 53.9 s
Wall time: 22.2 s


In [27]:
%%time
fsel = FeatureSelector(num_features=500, min_selections=4, n_jobs=30, 
                       feature_names=nz_feats.tolist(), always_keep=dmdb_feat, random_state=rs+5)
sel_dwpc = fsel.fit_transform(trans_dwpc, y[train_index])

Running Cor


  c /= stddev[:, None]
  c /= stddev[None, :]


Running Chi2


  chisq /= f_exp


Running RFE
Fitting estimator with 5607 features.
Fitting estimator with 4486 features.
Fitting estimator with 3365 features.
Fitting estimator with 2244 features.
Fitting estimator with 1123 features.
Running LR
Running RF
Running XG
CPU times: user 2h 47min 42s, sys: 3min 10s, total: 2h 50min 53s
Wall time: 15min 45s


In [28]:
%%time
e_net = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=500, 
                           **{'l1_ratio': 0.10455936193818496, 'C': 0.2}, random_state=rs+6)
e_net.fit(sel_dwpc, y[train_index])

CPU times: user 59.4 s, sys: 348 ms, total: 59.8 s
Wall time: 52.9 s


# Prepare output location

In [29]:
# Name of this pipeline step; it determines the output folder
this_name = '11c_Model_Prep_Hyperparam_tuning'

# <pipeline>/<this_name>/out for results and a sibling tmp folder for per-trial score files
out_dir = Path('../2_pipeline').joinpath(this_name, 'out').resolve()
tmp_dir = out_dir.parent.joinpath('tmp')

# Create both folders (and any missing parents); existing folders are left alone
out_dir.mkdir(parents=True, exist_ok=True)
tmp_dir.mkdir(parents=True, exist_ok=True)

In [34]:
# Optional cache of extracted matrices, keyed by damping value w
all_dwpc = dict()

def get_dwpc(w, cache=False):
    """Return the DWPC feature matrix (CSR) for damping value ``w``.

    Updates ``mg`` to the requested ``w`` and re-extracts the features for the
    same metapaths, compounds and diseases as the initial extraction.

    Parameters
    ----------
    w : damping value passed to ``mg.update_w``
    cache : when True, the result is stored in ``all_dwpc`` and reused on later
        calls with the same ``w``. Defaults to False (nothing is stored, every
        call re-extracts), which matches the previous behavior and avoids
        holding one large matrix in memory per distinct ``w``.
    """
    # Reuse a previously stored matrix if there is one
    if w in all_dwpc:
        return all_dwpc[w]

    mg.update_w(w)
    # The pair / feature labels are identical to the initial extraction, so only the matrix is kept
    (_, _), dwpc = piecewise_extraction(function=mg.extract_dwpc,
                             to_split='metapaths', block_size=block_size,
                             axis=1,
                             metapaths=to_xtract,
                             start_nodes=list(keep_comps),
                             end_nodes=list(keep_dis),
                             return_sparse=True,
                             sparse_df=False,
                             n_jobs=30)
    # next step is a row split, so need sparse rows
    dwpc = dwpc.tocoo().tocsr()
    if cache:
        all_dwpc[w] = dwpc
    return dwpc

In [31]:
import pickle
from time import time

def hyperopt(param_space, y, feats, num_eval):
    """Tune the DWPC damping value and elastic-net parameters with TPE.

    Each trial extracts the DWPC features for the sampled ``w``, restricts them
    to the training rows and pre-selected features, and scores an
    arcsinh-transform -> max-abs-scale -> elastic-net pipeline with 5-fold
    cross-validation. The loss is the negative mean average precision.

    Relies on notebook-level names: ``get_dwpc``, ``train_index``,
    ``feat_index``, ``fsel``, ``rs`` and ``tmp_dir``.

    Parameters
    ----------
    param_space : hyperopt search space whose keys are prefixed ``dwpc__`` or
        ``enet__`` (must provide ``dwpc__w``, ``enet__C``, ``enet__l1_ratio``)
    y : labels aligned with ``train_index``
    feats : unused; kept so existing calls keep working
    num_eval : number of trials to run

    Returns
    -------
    (trials, best_param): the hyperopt ``Trials`` object and the best
    parameters found by ``fmin``.
    """

    def objective_function(params):
        # Route 'dwpc__w'-style keys to the component named by their prefix
        dwpc_params, enet_params = {}, {}
        for key, value in params.items():
            prefix, _, name = key.partition('__')
            if prefix == 'dwpc':
                dwpc_params[name] = value
            elif prefix == 'enet':
                enet_params[name] = value

        this_w = dwpc_params['w']

        # Set up the post feature extraction pipeline
        post_extraction_pipeline = Pipeline(
            [('transformer', MeanScaledArcsinhTransformer()),
             ('maxabs_scale', MaxAbsScaler()),
             ('e_net', LogisticRegression(penalty='elasticnet', solver='saga', max_iter=100, **enet_params,
                                          random_state=rs+6))], verbose=True)

        ## Get the dwpc information for the current pairs
        dwpc = get_dwpc(this_w)
        this_dwpc = dwpc[train_index][:, feat_index]
        this_dwpc = fsel.transform(this_dwpc)

        # Only the scores are used, so the fitted estimators are not retained
        cv = cross_validate(post_extraction_pipeline, this_dwpc, y, cv=5,
                            scoring=['average_precision', 'roc_auc'])

        # Write out scores for each run (one line per metric, one value per fold)
        with open(tmp_dir.joinpath('scores_w_{0:1.4f}_C_{1:1.5f}_l1_{2:1.4f}.txt'.format(
                                   this_w, enet_params['C'], enet_params['l1_ratio'])), 'w') as f_out:

            f_out.write(', '.join([str(s) for s in cv['test_average_precision']]))
            f_out.write('\n')
            f_out.write(', '.join([str(s) for s in cv['test_roc_auc']]))
            f_out.write('\n')

        score = cv['test_average_precision'].mean()
        print('Mean Score: {:1.4f}'.format(score))

        # hyperopt minimises, so negate the score
        return {'loss': -1*score, 'status': STATUS_OK}

    start = time()

    trials = Trials()
    # NOTE(review): newer hyperopt releases may expect a np.random.Generator for rstate --
    # confirm against the installed version.
    best_param = fmin(objective_function,
                      param_space,
                      algo=tpe.suggest,
                      max_evals=num_eval,
                      trials=trials,
                      rstate= np.random.RandomState(1))

    # Total wall-clock time in seconds
    print(time() - start)
    return trials, best_param
    

In [32]:
# Previous best before adding new keep_features: 
# 'l1_ratio': 0.10455936193818496, 'C': 0.000556880900960339, 'w': 0.2640929485381926
# Search space; the key prefix ('dwpc__' / 'enet__') says which component consumes the parameter.
param_hyperopt = {
    'dwpc__w': hp.uniform('w', 0.01, 1),
    'enet__C': hp.loguniform('C', np.log(0.0001), np.log(.2)),
    'enet__l1_ratio': hp.uniform('l1_ratio', .01, .99),
}

In [None]:
# Run 50 trials; the third argument (to_xtract) is accepted by hyperopt() but not used by it
trials, best_param = hyperopt(param_hyperopt, y[train_index], to_xtract, 50)


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
[A                                                   Changing w from 0.7056185425199153 to 0.2471551211320533. Please wait...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A

  0%|          | 0/96 [00:00<?, ?it/s]
[A[A
  2%|2         | 2/96 [00:00<00:11,  8.22it/s]
[A[A
 10%|#         | 10/96 [00:00<00:07, 11.25it/s]
[A[A
 18%|#7        | 17/96 [00:00<00:05, 14.88it/s]
[A[A
 27%|##7       | 26/96 [00:00<00:03, 19.46it/s]
[A[A
 35%|###5      | 34/96 [00:00<00:02, 25.06it/s]
[A[A
 56%|#####6    | 54/96 [00:00<00:01, 33.81it/s]
[A[A
 70%|######9   | 67/96 [00:00<00:00, 43.41it/s]
[A[A
 81%|########1 | 78/96 [00:01<00:00, 50.48it/s]
[A[A
 92%|#########1| 88/96 [00:01<00:00, 50.47it/s]
[A[A
100%|##########| 96/96 [00:01<00:00, 50.47it/s]
  0%|          | 0/6 [00:00<?, ?it/s]
[A[A
 17%|#6        | 1/6 [01:06<05:34, 66.95s/it]
[A[A
 33%|###3      | 2/6 [04:50<07:35, 113.89s/it]
[A[A
 50%|#####     | 3/6 [08:29<07:16, 145.45s/it]
[A[A
 67%|######6   | 4/6 [12:12<05:37, 168.85s/it]
[A[A
 83%|########3 | 5/6 [18:04<03:43, 223.65s/it]
[A[A
100%|##########| 6/6 [21:21<00:00, 215.60s/it]
[A[A
100%|##########| 6/6 [21:21<00:00, 213.54s/i


[A                                                   [Pipeline] ....... (step 1 of 3) Processing transformer, total=   2.8s

  0%|          | 0/50 [22:16<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   2.2s

  0%|          | 0/50 [22:19<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ............. (step 3 of 3) Processing e_net, total= 1.6min

  0%|          | 0/50 [23:57<?, ?trial/s, best loss=?][A





[A                                                   [Pipeline] ....... (step 1 of 3) Processing transformer, total=   2.7s

  0%|          | 0/50 [24:02<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   1.9s

  0%|          | 0/50 [24:03<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ............. (step 3 of 3) Processing e_net, total= 1.6min

  0%|          | 0/50 [25:41<?, ?trial/s, best loss=?][A





[A                                                   [Pipeline] ....... (step 1 of 3) Processing transformer, total=   3.0s

  0%|          | 0/50 [25:46<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   2.4s

  0%|          | 0/50 [25:49<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ............. (step 3 of 3) Processing e_net, total= 1.7min

  0%|          | 0/50 [27:30<?, ?trial/s, best loss=?][A





[A                                                   [Pipeline] ....... (step 1 of 3) Processing transformer, total=   3.3s

  0%|          | 0/50 [27:35<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   2.8s

  0%|          | 0/50 [27:37<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ............. (step 3 of 3) Processing e_net, total= 1.7min

  0%|          | 0/50 [29:22<?, ?trial/s, best loss=?][A





[A                                                   [Pipeline] ....... (step 1 of 3) Processing transformer, total=   3.0s

  0%|          | 0/50 [29:26<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   2.3s

  0%|          | 0/50 [29:28<?, ?trial/s, best loss=?][A
[A                                                   [Pipeline] ............. (step 3 of 3) Processing e_net, total= 1.7min

  0%|          | 0/50 [31:09<?, ?trial/s, best loss=?][A





[A                                                   Mean Score: 0.0831

  0%|          | 0/50 [31:10<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [31:10<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A
[A                                                                                    Changing w from 0.2471551211320533 to 0.7056185425199153. Please wait...

  2%|▏         | 1/50 [31:10<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A

  0%|          | 0/96 [00:00<?, ?it/s]
[A[A
  2%|2         | 2/96 [00:00<00:12,  7.75it/s]
[A[A
  5%|5         | 5/96 [00:00<00:09,  9.45it/s]
[A[A
 19%|#8        | 18/96 [00:00<00:06, 12.94it/s]
[A[A
 27%|##7       | 26/96 [00:00<00:04, 17.24it/s]
[A[A
 36%|###6      | 35/96 [00:00<00:02, 22.31it/s]
[A[A
 46%|####5     | 44/96 [00:00<00:01, 28.68it/s]
[A[A
 60%|######    | 58/96 [00:00<00:01, 37.62it/s]
[A[A
 73%|#######2  | 70/96 [00:01<00:00, 45.93it/s]
[A[A
 82%|########2 | 79/96 [00:01<00:00, 53.39it/s]
[A[A
 92%|#########1| 88/96 [00:01<00:00, 60.61it/s]
[A[A
100%|##########| 96/96 [00:02<00:00, 47.40it/s]
  0%|          | 0/6 [00:00<?, ?it/s]
[A[A
 17%|#6        | 1/6 [01:07<05:38, 67.61s/it]
[A[A
 33%|###3      | 2/6 [04:46<07:31, 112.88s/it]
[A[A
 50%|#####     | 3/6 [08:28<07:17, 145.71s/it]
[A[A
 67%|######6   | 4/6 [12:09<05:36, 168.30s/it]
[A[A
 83%|########3 | 5/6 [18:02<03:43, 223.78s/it]
[A[A
100%|##########| 6/6 [21:20<00:00, 216.04s/i


[A                                                                                    [Pipeline] ....... (step 1 of 3) Processing transformer, total=   2.8s

  2%|▏         | 1/50 [53:29<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A
[A                                                                                    [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   2.1s

  2%|▏         | 1/50 [53:31<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A
[A                                                                                    [Pipeline] ............. (step 3 of 3) Processing e_net, total= 1.7min

  2%|▏         | 1/50 [55:12<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A





[A                                                                                    [Pipeline] ....... (step 1 of 3) Processing transformer, total=   2.7s

  2%|▏         | 1/50 [55:17<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A
[A                                                                                    [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   2.0s

  2%|▏         | 1/50 [55:19<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A
[A                                                                                    [Pipeline] ............. (step 3 of 3) Processing e_net, total=  24.9s

  2%|▏         | 1/50 [55:43<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A
[A                                                                                    [Pipeline] ....... (step 1 of 3) Processing transformer, total=   3.0s

  2%|▏         | 1/50 [55:49<25:27:46, 1870.75s/trial, best loss: -0.08311038010471776][A

  0%|          | 0/96 [00:00<?, ?it/s]
[A[A
  1%|1         | 1/96 [00:00<00:17,  5.38it/s]
[A[A
  2%|2         | 2/96 [00:00<00:18,  4.99it/s]
[A[A
 11%|#1        | 11/96 [00:00<00:12,  6.96it/s]
[A[A
 19%|#8        | 18/96 [00:00<00:08,  9.32it/s]
[A[A
 26%|##6       | 25/96 [00:00<00:05, 12.42it/s]
[A[A
 35%|###5      | 34/96 [00:00<00:03, 16.49it/s]
[A[A
 48%|####7     | 46/96 [00:01<00:02, 22.23it/s]
[A[A
 64%|######3   | 61/96 [00:01<00:01, 29.73it/s]
[A[A
 76%|#######6  | 73/96 [00:01<00:00, 38.11it/s]
[A[A
 90%|########9 | 86/96 [00:01<00:00, 44.80it/s]
[A[A
100%|##########| 96/96 [00:02<00:00, 25.27it/s]
[A[A
100%|##########| 96/96 [00:02<00:00, 43.07it/s]
  0%|          | 0/6 [00:00<?, ?it/s]
[A[A
 17%|#6        | 1/6 [01:10<05:50, 70.19s/it]
[A[A
 33%|###3      | 2/6 [04:49<07:39, 114.93s/it]
[A[A
 50%|#####     | 3/6 [08:32<07:22, 147.36s/it]
[A[A
 67%|######6   | 4/6 [12:16<05:40, 170.32s/it]
[A[A
 83%|########3 | 5/6 [18:10<03:45, 225.41s/


[A                                                                                    [Pipeline] ....... (step 1 of 3) Processing transformer, total=   2.8s

  4%|▍         | 2/50 [1:20:41<23:56:58, 1796.22s/trial, best loss: -0.08311038010471776][A
[A                                                                                      [Pipeline] ...... (step 2 of 3) Processing maxabs_scale, total=   2.1s

  4%|▍         | 2/50 [1:20:43<23:56:58, 1796.22s/trial, best loss: -0.08311038010471776][A
[A                                                                                      [Pipeline] ............. (step 3 of 3) Processing e_net, total=   3.1s

  4%|▍         | 2/50 [1:20:46<23:56:58, 1796.22s/trial, best loss: -0.08311038010471776][A
[A                                                                                      [Pipeline] ....... (step 1 of 3) Processing transformer, total=   2.7s

  4%|▍         | 2/50 [1:20:51<23:56:58, 1796.22s/trial, best loss: -0.0831103

  0%|          | 0/96 [00:00<?, ?it/s]
[A[A
  1%|1         | 1/96 [00:00<00:17,  5.33it/s]
[A[A
  2%|2         | 2/96 [00:00<00:19,  4.92it/s]
[A[A
 15%|#4        | 14/96 [00:00<00:11,  6.88it/s]
[A[A
 26%|##6       | 25/96 [00:00<00:07,  9.55it/s]
[A[A
 43%|####2     | 41/96 [00:00<00:04, 13.26it/s]
[A[A
 54%|#####4    | 52/96 [00:00<00:02, 18.01it/s]
[A[A
 64%|######3   | 61/96 [00:00<00:01, 23.43it/s]
[A[A
 78%|#######8  | 75/96 [00:01<00:00, 31.03it/s]
[A[A
 91%|######### | 87/96 [00:01<00:00, 37.15it/s]
[A[A
100%|##########| 96/96 [00:02<00:00, 20.50it/s]
[A[A
100%|##########| 96/96 [00:02<00:00, 44.19it/s]
  0%|          | 0/6 [00:00<?, ?it/s]
[A[A
 17%|#6        | 1/6 [01:11<05:58, 71.64s/it]
[A[A
 33%|###3      | 2/6 [04:57<07:51, 117.76s/it]
[A[A
 50%|#####     | 3/6 [08:44<07:32, 150.76s/it]
[A[A
 67%|######6   | 4/6 [12:32<05:47, 173.82s/it]
[A[A
 83%|########3 | 5/6 [18:25<03:47, 227.49s/it]
[A[A


In [36]:
# Persist the selected hyperparameters and the trials object to out_dir.
# Context managers make sure each file is flushed and closed as soon as the
# dump finishes (a bare open(...) passed inline is never explicitly closed).
with open(out_dir.joinpath('best_param.pkl'), 'wb') as best_param_file:
    pickle.dump(best_param, best_param_file)
with open(out_dir.joinpath('trials.pkl'), 'wb') as trials_file:
    pickle.dump(trials, trials_file)

In [37]:
# Display the selected hyperparameter values.
best_param

{'C': 0.06781152637797157,
 'l1_ratio': 0.2420458478281276,
 'w': 0.5267957740707717}

# Use the selected params to do the feature selection


In [38]:
# Every entry of best_param except the 'w' key.
enet_params = {name: best_param[name] for name in best_param if name != 'w'}

# Two-step pipeline applied to the extracted features: the arcsinh-based
# transformer followed by the feature selector.
post_extraction_pipeline = Pipeline(
    steps=[
        ('transformer', MeanScaledArcsinhTransformer()),
        (
            'feature_selection',
            FeatureSelector(
                num_features=500,
                min_selections=4,
                n_jobs=30,
                feature_names=to_xtract,
                always_keep=dmdb_feat,
                random_state=4,
            ),
        ),
    ],
    verbose=True,
)

In [39]:
# Build the feature matrix using the selected value of `w` from best_param.
# NOTE(review): get_dwpc is defined outside this section; presumably it
# re-weights the graph and re-extracts the DWPC features — confirm against
# its definition.
dwpc = get_dwpc(best_param['w'])

Changing w from 0.7857592349063235 to 0.5267957740707717. Please wait...



  0%|          | 0/96 [00:00<?, ?it/s][A
  1%|          | 1/96 [00:00<00:21,  4.51it/s][A
  2%|▏         | 2/96 [00:00<00:20,  4.59it/s][A
 10%|█         | 10/96 [00:00<00:13,  6.39it/s][A
 16%|█▌        | 15/96 [00:00<00:09,  8.64it/s][A
 21%|██        | 20/96 [00:00<00:06, 11.25it/s][A
 34%|███▍      | 33/96 [00:00<00:04, 15.48it/s][A
 42%|████▏     | 40/96 [00:01<00:02, 18.98it/s][A
 59%|█████▉    | 57/96 [00:01<00:01, 25.67it/s][A
 69%|██████▉   | 66/96 [00:01<00:01, 28.74it/s][A
100%|██████████| 96/96 [00:02<00:00, 46.69it/s][A

  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [01:41<08:25, 101.18s/it][A
 33%|███▎      | 2/6 [06:04<09:59, 149.86s/it][A
 50%|█████     | 3/6 [10:19<09:03, 181.33s/it][A
 67%|██████▋   | 4/6 [14:44<06:53, 206.56s/it][A
 83%|████████▎ | 5/6 [20:50<04:14, 254.39s/it][A
100%|██████████| 6/6 [24:48<00:00, 248.15s/it][A


In [40]:
# Fit the transformer and the feature selector on the rows picked out by
# train_index.
# NOTE(review): train_index and y come from earlier cells — presumably the
# training-split indices and the labels; confirm.
post_extraction_pipeline.fit(dwpc[train_index], y[train_index])

[Pipeline] ....... (step 1 of 2) Processing transformer, total=  22.3s
Running Cor


  c /= stddev[:, None]
  c /= stddev[None, :]


Running Chi2


  chisq /= f_exp


Running RFE
Fitting estimator with 7303 features.
Fitting estimator with 5843 features.
Fitting estimator with 4383 features.
Fitting estimator with 2923 features.
Fitting estimator with 1463 features.
Running LR
Running RF
Running XG
[Pipeline] . (step 2 of 2) Processing feature_selection, total=14.7min


Pipeline(memory=None,
         steps=[('transformer',
                 <__main__.MeanScaledArcsinhTransformer object at 0x7f7c86e63ba8>),
                ('feature_selection',
                 FeatureSelector(always_keep={'CaAawD', 'CaBPawD', 'CaGeBPawD',
                                              'CaGiBPawD', 'CaGnrBPawD',
                                              'CaGpoAawD', 'CaGpoGeBPawD',
                                              'CaGpoGnrBPawD', 'CaGpoGprBPawD',
                                              'CaGpoPWiBPawD', 'CaGprBPawD',
                                              'CaGprBPpAsoD', 'CaGrBPawD',
                                              'CawPWawD', 'CiBPawD',
                                              'C...
                                                'ChoRXf>RXhiCdgD',
                                                'CinGeBPprGmD', 'ChiRXiBPrGawD',
                                                'CinGiBPrGtD', 'CaGnrBPnrGmD',
                

In [41]:
# Features retained by the fitted 'feature_selection' step of the pipeline.
pe_feats = post_extraction_pipeline.named_steps['feature_selection'].keep_features_

In [52]:
# Compare how many features this pipeline's selector kept with how many
# `fsel` kept.
# NOTE(review): `fsel` is not defined in this section, and the stored output
# shows three values for this two-element tuple, so the saved output looks
# stale — re-run from a fresh kernel to confirm.
len(pe_feats), len(fsel.keep_features_)

(196, 196, 189)

In [46]:
# Number of features kept by `fsel` that are not among pe_feats.
len(set(fsel.keep_features_).difference(pe_feats))

38

In [54]:
# Selection table produced by the fitted 'feature_selection' step.
feat_df = post_extraction_pipeline.named_steps['feature_selection'].feature_selection_df_

# Write the table to out_dir, ordered by the 'total' column, highest first.
(
    feat_df
    .sort_values(by='total', ascending=False)
    .to_csv(out_dir.joinpath('feature_selection_df.csv'), index=False)
)

In [57]:
# Persist the fitted feature selector (second pipeline step). The context
# manager makes sure the file is flushed and closed once the dump finishes
# (a bare open(...) passed inline is never explicitly closed).
with open(out_dir.joinpath('feature_selector.pkl'), 'wb') as selector_file:
    pickle.dump(post_extraction_pipeline[1], selector_file)

# Write the kept feature names, one per line, without the index column.
# NOTE(review): whether a header line is written depends on the pandas
# version's default for Series.to_csv(header=...); pass it explicitly once
# the expected file format is confirmed.
pd.Series(post_extraction_pipeline[1].keep_features_).to_csv(out_dir.joinpath('kept_features.txt'), index=False)