# Batch Integration Scheme ELN Tuning

## Outline

The **MLAging - batch integration and misc** workflow consists of the following sections:

`60 preprocessing_batch.R` Data preprocessing and preparation in Seurat.

`61 Batch Integration Scheme ELN Tuning` Scheme: batch effects within training or test sets. Elastic-net logistic regression (ELN) model tuning using highly variable genes (HVGs), with hyperparameter selection using `GridSearchCV` -- **this notebook**.

`62 Batch Integration Scheme ELN Result 10x` Run the best ELN model over 10 random seeds.

`63 HVG and Cell Type` Clustering and heatmap showing that HVGs are cell type-specific.

`64 AUROC Results` ELN 10x results shown in AUPRC metric.

`65 age_genes.R` Aging database queries.

In [1]:
from src.data_processing import *
from src.grid_search import *
from src.packages import *

In [2]:
def assign_target(input_df):
    """Load an expression table from CSV and attach animal/target labels.

    Parameters
    ----------
    input_df : str, path or file-like
        Anything ``pd.read_csv`` accepts. The first CSV column is used as
        the row index.

    Returns
    -------
    pandas.DataFrame
        The loaded table with two extra columns: ``animal`` (the last
        character of each row label) and ``target`` (1 when ``animal`` is
        ``'3'`` or ``'4'``, otherwise 0). When the CSV has no columns with
        these names they end up as the last two columns, in that order.
    """
    frame = pd.read_csv(input_df, index_col=0)
    # The animal identifier is taken from the final character of the row label.
    frame['animal'] = frame.index.str[-1]
    # Animals '3' and '4' make up the positive class.
    frame['target'] = frame['animal'].isin(['3', '4']).astype(int)
    return frame

In [3]:
def train_test_split(input_train, input_test, binarization=False):
    """Load the train and test CSVs and return features, labels and CV indices.

    Parameters
    ----------
    input_train, input_test : str or path
        CSV files handed to ``assign_target``.
    binarization : bool, default False
        When equal to ``True``, both feature matrices are passed through
        ``binarize_data`` before being returned.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y, custom_cv)``.

    Notes
    -----
    Only the test split is shuffled (fixed ``random_state=42``); the training
    rows keep their file order.
    NOTE(review): this name matches ``sklearn.model_selection.train_test_split``;
    if the wildcard imports bring that in, this definition shadows it -- confirm.
    """
    # --- test split -------------------------------------------------------
    held_out = assign_target(input_test)
    # ``[:, :-2]`` drops the trailing 'animal' and 'target' columns.
    test_X, test_y = shuffle(held_out.iloc[:, :-2], held_out.target,
                             random_state=42)

    # --- training split ---------------------------------------------------
    # reset_index() moves the row labels into column 0 of the frame.
    train = assign_target(input_train).reset_index()
    # Project helper; presumably builds fold indices from the frame with the
    # row labels still present -- verify in src.
    custom_cv = customized_cv_index(train)

    # ``[:, 1:-2]`` skips the former index column and the two label columns.
    train_X, train_y = train.iloc[:, 1:-2], train.target

    # Kept as an equality test (not plain truthiness) to match existing callers.
    if binarization == True:
        test_X, train_X = binarize_data(test_X), binarize_data(train_X)

    return train_X, train_y, test_X, test_y, custom_cv

In [4]:
# Scorer wrapping ``pr_auc_score``: higher is better, and the metric is fed
# predicted probabilities rather than hard class labels.
# NOTE(review): newer scikit-learn releases deprecate ``needs_proba`` in favour
# of ``response_method`` -- confirm against the pinned scikit-learn version.
pr_auc_scorer = make_scorer(
    pr_auc_score,
    needs_proba=True,
    greater_is_better=True,
)

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
# CSV inputs for the training and test splits, given relative to the
# notebook's working directory.
# NOTE(review): file names suggest batch 3, 2k HVGs, standardized and
# integrated data -- confirm against the preprocessing script.
input_train = '../../MLAging/data/batch3_train_hvg2k_std_integrated.csv'
input_test = '../../MLAging/data/batch3_test_hvg2k_std_integrated.csv'

In [7]:
# Load both splits without binarization (features are used as stored in the CSVs).
train_X, train_y, test_X, test_y, custom_cv = train_test_split(input_train, input_test, binarization=False)

In [8]:
# Preview the training feature matrix (pandas truncates the rendered table).
train_X

Unnamed: 0,Avp,Oxt,Cfap299,Ebf1,Dnah12,Pmch,Flt1,Ptgds,Rgs5,Meis2,...,Car8,Sorbs2,Pde2a,Fstl1,Slitrk6,Gm4221,Ldlrad3,Vcl,Tle4,Gm44148
0,-0.509259,-1.144728,-0.093097,-0.300372,-0.142043,-1.333603,-0.347391,3.115985,0.573340,-0.286096,...,-0.021733,-0.876367,-0.353644,-0.386274,-0.403955,-0.676768,-0.572067,-0.270372,-0.759363,-0.264692
1,-0.984340,-0.975036,-0.189144,-0.276042,-0.190146,1.247295,-0.134240,-0.635552,-0.027549,0.081236,...,-0.219408,-1.427486,-1.347843,3.166655,0.010950,-0.068659,-0.676424,-0.755985,2.434765,0.591937
2,-0.735684,-0.402665,-0.330109,-0.323690,-0.019649,-1.178136,-0.097551,-0.724117,-0.065361,-0.100687,...,0.024698,-0.608726,-0.221356,-0.717360,-0.172234,0.178438,-0.276511,3.187716,-0.493330,-0.171677
3,-0.654245,-0.462408,0.082990,-0.375390,-0.138171,-0.186893,-0.082912,-0.107719,-0.106549,-0.304148,...,-0.036293,0.751487,-0.265048,0.274962,3.496850,-0.211647,-0.078547,-0.516713,-0.273520,-0.216251
4,0.559377,-0.759862,-0.164504,2.722906,-0.102529,0.510836,-0.073999,0.035376,-0.110131,-0.336583,...,-0.096198,1.201303,-0.565642,-0.995166,-0.275093,-0.141257,0.045704,-0.294031,0.292480,1.135494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23767,0.966447,1.458445,-0.139043,-0.133842,-0.262506,-0.466301,0.055841,-0.591483,-0.086876,-0.233327,...,-0.183547,0.633111,-0.477145,-0.272631,-0.196409,-0.737191,-0.382640,2.233622,-0.984521,-0.280405
23768,-0.257837,-0.361048,-0.122587,-0.237509,-0.541361,-0.261518,-0.168396,-0.480647,-0.086876,-0.272947,...,-0.068029,-0.282153,-0.727645,-0.364229,-0.238802,-0.356710,1.181125,0.743433,0.002713,0.271057
23769,-0.501087,0.477007,-0.126518,-0.288144,-0.144790,0.189079,-0.082218,-0.689822,-0.086876,-0.296461,...,-0.183547,0.965016,-0.273025,-0.159759,-0.178128,-0.589021,0.513837,-0.603632,-0.639802,-0.341648
23770,-0.198076,-0.169164,-0.177356,-0.293392,-0.090842,0.491395,-0.032165,-0.355162,-0.086876,-0.251959,...,-0.205929,0.626818,-0.869974,-0.237824,-0.278426,-0.377226,0.548814,1.626080,1.234555,0.158486


In [11]:
# Elastic-net logistic regression fitted with the 'saga' solver and a very
# large iteration cap.
eln = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=10 ** 7,
)

# Search grid: 10 log-spaced values of C in [1e-2, 1e2] and 6 log-spaced
# values of l1_ratio in [1e-3, 1].
# The 'logisticregression__' prefix presumably targets a pipeline step built
# inside ML_pipeline_GridSearchCV -- verify in src.grid_search.
param_grid = {
    'logisticregression__C': np.logspace(-2, 2, 10),
    'logisticregression__l1_ratio': np.logspace(-3, 0, 6),
}

# One fitted search object is collected per repetition.
models_eln = []
for i in tqdm(range(3)):
    # ``i`` is forwarded positionally; presumably it is the random seed of the
    # run -- confirm against ML_pipeline_GridSearchCV's signature.
    grid, test_score = ML_pipeline_GridSearchCV(
        train_X, train_y,
        test_X, test_y,
        eln, param_grid, i, custom_cv, pr_auc_scorer,
    )

    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)
    models_eln.append(grid)

 33%|███▎      | 1/3 [21:16<42:33, 1276.94s/it]

{'logisticregression__C': 0.01, 'logisticregression__l1_ratio': 0.25118864315095796}
best CV score: 0.5433825458090982
test score: 0.7449220562396575


 67%|██████▋   | 2/3 [43:37<21:54, 1314.47s/it]

{'logisticregression__C': 0.01, 'logisticregression__l1_ratio': 0.25118864315095796}
best CV score: 0.5433826801244166
test score: 0.7449222397129762


100%|██████████| 3/3 [1:07:01<00:00, 1340.62s/it]

{'logisticregression__C': 0.01, 'logisticregression__l1_ratio': 0.25118864315095796}
best CV score: 0.5433826126045462
test score: 0.7449223287056226





In [12]:
# Reload both splits with binarization enabled. This rebinds train_X, train_y,
# test_X, test_y and custom_cv, replacing the values from the earlier load.
train_X, train_y, test_X, test_y, custom_cv = train_test_split(input_train, input_test, binarization=True)

In [13]:
# Preview the binarized training feature matrix (pandas truncates the rendered table).
train_X

Unnamed: 0,Avp,Oxt,Cfap299,Ebf1,Dnah12,Pmch,Flt1,Ptgds,Rgs5,Meis2,...,Car8,Sorbs2,Pde2a,Fstl1,Slitrk6,Gm4221,Ldlrad3,Vcl,Tle4,Gm44148
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23767,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
23769,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23770,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0


In [17]:
# Same elastic-net search as the earlier grid-search cell, now run on whatever
# train_X/test_X currently hold (the binarized matrices when cells run in order).
# NOTE(review): this rebinds ``eln``, ``param_grid`` and ``models_eln``, so the
# fitted searches collected by the earlier cell are no longer reachable by name.
eln = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=10 ** 7,
)

# Search grid: 10 log-spaced values of C in [1e-2, 1e2] and 6 log-spaced
# values of l1_ratio in [1e-3, 1].
param_grid = {
    'logisticregression__C': np.logspace(-2, 2, 10),
    'logisticregression__l1_ratio': np.logspace(-3, 0, 6),
}

# One fitted search object is collected per repetition.
models_eln = []
for i in tqdm(range(3)):
    # ``i`` is forwarded positionally; presumably it is the random seed of the
    # run -- confirm against ML_pipeline_GridSearchCV's signature.
    grid, test_score = ML_pipeline_GridSearchCV(
        train_X, train_y,
        test_X, test_y,
        eln, param_grid, i, custom_cv, pr_auc_scorer,
    )

    print(grid.best_params_)
    print('best CV score:', grid.best_score_)
    print('test score:', test_score)
    models_eln.append(grid)

 33%|███▎      | 1/3 [10:01:55<20:03:51, 36115.96s/it]

{'logisticregression__C': 4.6415888336127775, 'logisticregression__l1_ratio': 1.0}
best CV score: 0.8889922014410434
test score: 0.9694666704580776


 67%|██████▋   | 2/3 [20:12:43<10:07:08, 36428.48s/it]

{'logisticregression__C': 4.6415888336127775, 'logisticregression__l1_ratio': 1.0}
best CV score: 0.8890118662894425
test score: 0.969456935819036


100%|██████████| 3/3 [31:33:58<00:00, 37879.54s/it]   

{'logisticregression__C': 4.6415888336127775, 'logisticregression__l1_ratio': 1.0}
best CV score: 0.8889486321740794
test score: 0.96944681246606



