# The ElasticNet results with the best hyperparameters (cell type-specific models)

## Outline

The **MLAging - SVZ cell type** workflow consists of sections:

`30 SVZpreprocessing.R` Data preprocessing and preparation in Seurat.

`41 SVZ Cell Type ELN Tuning` ELN model tuning using *non-binarized* and *binarized* HVGs and hyperparameter selection using `GridSearchCV`.

`42 SVZ Cell Type ELN 10x` Run the best ELN model for both binarized and nonbinarized HVGs over 10 random seeds -- **this notebook**:
1. [Data Preparation](#1.-prep)
2. [Cell Types](#2.-celltypes)
    - [Microglia](#3.-neuron)
    - [`Astrocyte_qNSC`](#4.-Oligo)
    - [Endothelial](#5.-astro)
    - [Neuroblast](#6.-opc)
    - [`Oligodendro`](#7.-micro)

`43 SVZ Cell Type ELN Result Viz` Result visualization.

`44 SVZ Cell Type Stat` Stat test on whether exercise rejuvenates cells.

In [1]:
# Silence every warning for the whole session.
# NOTE(review): this blanket filter would also hide any warnings raised by the
# model fits further down — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Star imports from the project package. Names used later without an explicit
# import (`data_prep`, `shuffle`, `pr_auc_score`) presumably come from these
# two modules — TODO confirm and import them by name.
from src.data_processing import *
from src.grid_search import *
import os
import numpy as np
from sklearn.metrics import make_scorer
from tqdm import tqdm
import pickle
from sklearn.linear_model import LogisticRegression
from statistics import mean, stdev

# Not referenced by any cell visible in this section of the notebook.
data_type = 'float32'

## 1. Data Preparation <a name="1.-prep"></a>
### Load training, testing batch

In [2]:
# CSV inputs shared by every cell-type section below: the training file is
# batch 1 and the testing file is batch 2.
input_train, input_test = (
    '../data/svz_processed/svz_ctl_train_cell_sep3integ_batch1.csv',
    '../data/svz_processed/svz_ctl_test_cell_sep3integ_batch2.csv',
)

In [3]:
# Scorer wrapper around `pr_auc_score` that feeds it predicted probabilities;
# a larger value means a better model.
# NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in
# favour of `response_method` — check the pinned version before upgrading.
# The scorer is not used by any cell visible in this section of the notebook.
pr_auc_scorer = make_scorer(
    pr_auc_score,
    needs_proba=True,
    greater_is_better=True,
)

## 2. Cell-type specific models<a name="2.-celltypes"></a>

### a) Microglia <a name="3.-neuron"></a>

In [4]:
# Pick the cell type for this section and build its train/test inputs with
# binarization enabled.
# NOTE(review): the test path is passed before the train path, while the
# result is unpacked train-first — confirm against data_prep's signature.
cell_type = 'Microglia'

(train_X, train_y,
 test_X, test_y,
 custom_cv) = data_prep(input_test, input_train, cell_type, binarization=True)

Finished data prepration for Microglia


In [5]:
# Refit the tuned model ten times, one seed per repetition, and score every
# fit on the test split with AUPRC.
scores = []        # AUPRC of each repetition
final_test = []    # (X_test, y_test) pair each model was scored on
final_models = []  # fitted estimator of each repetition
for i in tqdm(range(10)):
    random_state = 42*i
    X_test, y_test = shuffle(test_X, test_y, random_state=random_state)
    X_train, y_train = shuffle(train_X, train_y, random_state=random_state)

    # l1_ratio=0 reduces the elastic-net penalty to pure L2. The `saga` solver
    # is stochastic, so it gets the repetition's seed too; without it the fit
    # is unseeded and the reported scores cannot be reproduced.
    eln = LogisticRegression(penalty='elasticnet', C=21.54434690031882, l1_ratio=0,
                             solver='saga', max_iter=10000000,
                             random_state=random_state)

    eln.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second class label.
    y_pred = eln.predict_proba(X_test)[:, 1]
    auprc = pr_auc_score(y_test, y_pred)

    final_test.append((X_test, y_test))
    final_models.append(eln)
    scores.append(auprc)
print(f'auprc: {mean(scores)} ± {stdev(scores)}' )

100%|██████████| 10/10 [19:59<00:00, 119.94s/it]

auprc: 0.871709368953998 ± 9.738769001168399e-05





In [7]:
# Persist the per-repetition scores, test sets and fitted models for the
# current cell type. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for suffix, payload in (('scores', scores),
                        ('sets', final_test),
                        ('models', final_models)):
    out_path = '../results/svz_int2/' + str(cell_type) + '_eln_model_test_' + suffix + '.save'
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

### b) `Astrocyte_qNSC` <a name="4.-Oligo"></a>

In [8]:
# Pick the cell type for this section and build its train/test inputs with
# binarization enabled.
# NOTE(review): the test path is passed before the train path, while the
# result is unpacked train-first — confirm against data_prep's signature.
cell_type = 'Astrocyte_qNSC'

(train_X, train_y,
 test_X, test_y,
 custom_cv) = data_prep(input_test, input_train, cell_type, binarization=True)

Finished data prepration for Astrocyte_qNSC


In [9]:
# Refit the tuned model ten times, one seed per repetition, and score every
# fit on the test split with AUPRC.
scores = []        # AUPRC of each repetition
final_test = []    # (X_test, y_test) pair each model was scored on
final_models = []  # fitted estimator of each repetition
for i in tqdm(range(10)):
    random_state = 42*i
    X_test, y_test = shuffle(test_X, test_y, random_state=random_state)
    X_train, y_train = shuffle(train_X, train_y, random_state=random_state)

    # l1_ratio=0 reduces the elastic-net penalty to pure L2. The `saga` solver
    # is stochastic, so it gets the repetition's seed too; without it the fit
    # is unseeded and the reported scores cannot be reproduced.
    eln = LogisticRegression(penalty='elasticnet', C=0.21544346900318823, l1_ratio=0,
                             solver='saga', max_iter=10000000,
                             random_state=random_state)

    eln.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second class label.
    y_pred = eln.predict_proba(X_test)[:, 1]
    auprc = pr_auc_score(y_test, y_pred)

    final_test.append((X_test, y_test))
    final_models.append(eln)
    scores.append(auprc)
print(f'auprc: {mean(scores)} ± {stdev(scores)}' )

100%|██████████| 10/10 [02:06<00:00, 12.65s/it]

auprc: 0.955511237032972 ± 3.5772894173143e-05





In [10]:
# Persist the per-repetition scores, test sets and fitted models for the
# current cell type. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for suffix, payload in (('scores', scores),
                        ('sets', final_test),
                        ('models', final_models)):
    out_path = '../results/svz_int2/' + str(cell_type) + '_eln_model_test_' + suffix + '.save'
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

### c) Endothelial <a name="5.-astro"></a>

In [11]:
# Pick the cell type for this section and build its train/test inputs with
# binarization enabled.
# NOTE(review): the test path is passed before the train path, while the
# result is unpacked train-first — confirm against data_prep's signature.
cell_type = 'Endothelial'

(train_X, train_y,
 test_X, test_y,
 custom_cv) = data_prep(input_test, input_train, cell_type, binarization=True)

Finished data prepration for Endothelial


In [12]:
# Refit the tuned model ten times, one seed per repetition, and score every
# fit on the test split with AUPRC.
scores = []        # AUPRC of each repetition
final_test = []    # (X_test, y_test) pair each model was scored on
final_models = []  # fitted estimator of each repetition
for i in tqdm(range(10)):
    random_state = 42*i
    X_test, y_test = shuffle(test_X, test_y, random_state=random_state)
    X_train, y_train = shuffle(train_X, train_y, random_state=random_state)

    # l1_ratio=1 reduces the elastic-net penalty to pure L1. The `saga` solver
    # is stochastic, so it gets the repetition's seed too; without it the fit
    # is unseeded and the reported scores cannot be reproduced.
    eln = LogisticRegression(penalty='elasticnet', C=1, l1_ratio=1,
                             solver='saga', max_iter=10000000,
                             random_state=random_state)

    eln.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second class label.
    y_pred = eln.predict_proba(X_test)[:, 1]
    auprc = pr_auc_score(y_test, y_pred)

    final_test.append((X_test, y_test))
    final_models.append(eln)
    scores.append(auprc)
print(f'auprc: {mean(scores)} ± {stdev(scores)}' )

100%|██████████| 10/10 [14:41<00:00, 88.12s/it]

auprc: 0.8321483239878452 ± 4.713190266420696e-05





In [13]:
# Persist the per-repetition scores, test sets and fitted models for the
# current cell type. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for suffix, payload in (('scores', scores),
                        ('sets', final_test),
                        ('models', final_models)):
    out_path = '../results/svz_int2/' + str(cell_type) + '_eln_model_test_' + suffix + '.save'
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

### d) Neuroblast <a name="6.-opc"></a>

In [14]:
# Pick the cell type for this section and build its train/test inputs with
# binarization enabled.
# NOTE(review): the test path is passed before the train path, while the
# result is unpacked train-first — confirm against data_prep's signature.
cell_type = 'Neuroblast'

(train_X, train_y,
 test_X, test_y,
 custom_cv) = data_prep(input_test, input_train, cell_type, binarization=True)

Finished data prepration for Neuroblast


In [15]:
# Refit the tuned model ten times, one seed per repetition, and score every
# fit on the test split with AUPRC.
scores = []        # AUPRC of each repetition
final_test = []    # (X_test, y_test) pair each model was scored on
final_models = []  # fitted estimator of each repetition
for i in tqdm(range(10)):
    random_state = 42*i
    X_test, y_test = shuffle(test_X, test_y, random_state=random_state)
    X_train, y_train = shuffle(train_X, train_y, random_state=random_state)

    # l1_ratio=0 reduces the elastic-net penalty to pure L2. The `saga` solver
    # is stochastic, so it gets the repetition's seed too; without it the fit
    # is unseeded and the reported scores cannot be reproduced.
    eln = LogisticRegression(penalty='elasticnet', C=0.01, l1_ratio=0,
                             solver='saga', max_iter=10000000,
                             random_state=random_state)

    eln.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second class label.
    y_pred = eln.predict_proba(X_test)[:, 1]
    auprc = pr_auc_score(y_test, y_pred)

    final_test.append((X_test, y_test))
    final_models.append(eln)
    scores.append(auprc)
print(f'auprc: {mean(scores)} ± {stdev(scores)}' )

100%|██████████| 10/10 [04:08<00:00, 24.85s/it]

auprc: 0.9144869901682415 ± 6.65160836601671e-05





In [16]:
# Persist the per-repetition scores, test sets and fitted models for the
# current cell type. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for suffix, payload in (('scores', scores),
                        ('sets', final_test),
                        ('models', final_models)):
    out_path = '../results/svz_int2/' + str(cell_type) + '_eln_model_test_' + suffix + '.save'
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

### e) `Oligodendro` <a name="7.-micro"></a>

In [17]:
# Pick the cell type for this section and build its train/test inputs with
# binarization enabled.
# NOTE(review): the test path is passed before the train path, while the
# result is unpacked train-first — confirm against data_prep's signature.
cell_type = 'Oligodendro'

(train_X, train_y,
 test_X, test_y,
 custom_cv) = data_prep(input_test, input_train, cell_type, binarization=True)

Finished data prepration for Oligodendro


In [18]:
# Refit the tuned model ten times, one seed per repetition, and score every
# fit on the test split with AUPRC.
scores = []        # AUPRC of each repetition
final_test = []    # (X_test, y_test) pair each model was scored on
final_models = []  # fitted estimator of each repetition
for i in tqdm(range(10)):
    random_state = 42*i
    X_test, y_test = shuffle(test_X, test_y, random_state=random_state)
    X_train, y_train = shuffle(train_X, train_y, random_state=random_state)

    # l1_ratio=0 reduces the elastic-net penalty to pure L2. The `saga` solver
    # is stochastic, so it gets the repetition's seed too; without it the fit
    # is unseeded and the reported scores cannot be reproduced.
    eln = LogisticRegression(penalty='elasticnet', C=0.046415888336127774, l1_ratio=0,
                             solver='saga', max_iter=10000000,
                             random_state=random_state)

    eln.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second class label.
    y_pred = eln.predict_proba(X_test)[:, 1]
    auprc = pr_auc_score(y_test, y_pred)

    final_test.append((X_test, y_test))
    final_models.append(eln)
    scores.append(auprc)
print(f'auprc: {mean(scores)} ± {stdev(scores)}' )

100%|██████████| 10/10 [00:55<00:00,  5.55s/it]

auprc: 0.9166542653542249 ± 7.4221804130394275e-06





In [19]:
# Persist the per-repetition scores, test sets and fitted models for the
# current cell type. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for suffix, payload in (('scores', scores),
                        ('sets', final_test),
                        ('models', final_models)):
    out_path = '../results/svz_int2/' + str(cell_type) + '_eln_model_test_' + suffix + '.save'
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

In [20]:
# Section for the aNSC_NPC cell type (it has no markdown heading of its own).
# Build its train/test inputs with binarization enabled.
# NOTE(review): the test path is passed before the train path, while the
# result is unpacked train-first — confirm against data_prep's signature.
cell_type = 'aNSC_NPC'

(train_X, train_y,
 test_X, test_y,
 custom_cv) = data_prep(input_test, input_train, cell_type, binarization=True)

Finished data prepration for aNSC_NPC


In [21]:
# Refit the tuned model ten times, one seed per repetition, and score every
# fit on the test split with AUPRC.
scores = []        # AUPRC of each repetition
final_test = []    # (X_test, y_test) pair each model was scored on
final_models = []  # fitted estimator of each repetition
for i in tqdm(range(10)):
    random_state = 42*i
    X_test, y_test = shuffle(test_X, test_y, random_state=random_state)
    X_train, y_train = shuffle(train_X, train_y, random_state=random_state)

    # l1_ratio=0 reduces the elastic-net penalty to pure L2. The `saga` solver
    # is stochastic, so it gets the repetition's seed too; without it the fit
    # is unseeded and the reported scores cannot be reproduced.
    eln = LogisticRegression(penalty='elasticnet', C=0.046415888336127774, l1_ratio=0,
                             solver='saga', max_iter=10000000,
                             random_state=random_state)

    eln.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second class label.
    y_pred = eln.predict_proba(X_test)[:, 1]
    auprc = pr_auc_score(y_test, y_pred)

    final_test.append((X_test, y_test))
    final_models.append(eln)
    scores.append(auprc)
print(f'auprc: {mean(scores)} ± {stdev(scores)}' )

100%|██████████| 10/10 [02:39<00:00, 15.99s/it]

auprc: 0.8519057704095009 ± 6.666566946159971e-05





In [22]:
# Persist the per-repetition scores, test sets and fitted models for the
# current cell type. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for suffix, payload in (('scores', scores),
                        ('sets', final_test),
                        ('models', final_models)):
    out_path = '../results/svz_int2/' + str(cell_type) + '_eln_model_test_' + suffix + '.save'
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)

In [23]:
# Section for the Mural cell type (it has no markdown heading of its own).
# Build its train/test inputs with binarization enabled.
# NOTE(review): the test path is passed before the train path, while the
# result is unpacked train-first — confirm against data_prep's signature.
cell_type = 'Mural'

(train_X, train_y,
 test_X, test_y,
 custom_cv) = data_prep(input_test, input_train, cell_type, binarization=True)

Finished data prepration for Mural


In [24]:
# Refit the tuned model ten times, one seed per repetition, and score every
# fit on the test split with AUPRC.
scores = []        # AUPRC of each repetition
final_test = []    # (X_test, y_test) pair each model was scored on
final_models = []  # fitted estimator of each repetition
for i in tqdm(range(10)):
    random_state = 42*i
    X_test, y_test = shuffle(test_X, test_y, random_state=random_state)
    X_train, y_train = shuffle(train_X, train_y, random_state=random_state)

    # l1_ratio=0 reduces the elastic-net penalty to pure L2. The `saga` solver
    # is stochastic, so it gets the repetition's seed too; without it the fit
    # is unseeded and the reported scores cannot be reproduced.
    eln = LogisticRegression(penalty='elasticnet', C=0.046415888336127774, l1_ratio=0,
                             solver='saga', max_iter=10000000,
                             random_state=random_state)

    eln.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second class label.
    y_pred = eln.predict_proba(X_test)[:, 1]
    auprc = pr_auc_score(y_test, y_pred)

    final_test.append((X_test, y_test))
    final_models.append(eln)
    scores.append(auprc)
print(f'auprc: {mean(scores)} ± {stdev(scores)}' )

100%|██████████| 10/10 [01:55<00:00, 11.60s/it]

auprc: 0.9639316251794376 ± 3.753241551391016e-06





In [25]:
# Persist the per-repetition scores, test sets and fitted models for the
# current cell type. The `with` block closes each file even when pickling
# raises, which the manual open()/close() pairs did not guarantee.
for suffix, payload in (('scores', scores),
                        ('sets', final_test),
                        ('models', final_models)):
    out_path = '../results/svz_int2/' + str(cell_type) + '_eln_model_test_' + suffix + '.save'
    with open(out_path, 'wb') as file:
        pickle.dump(payload, file)