In [1]:
import sys
sys.path.append("../scripts")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from clock_utils import make_groups
import shelve, os, dbm.dumb
from datetime import datetime
from sklearn.model_selection import LeavePGroupsOut
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Enable IPython's autoreload extension in mode 2 so that edits to local
# modules (e.g. clock_utils from ../scripts) are picked up without restarting.
%load_ext autoreload
%autoreload 2

# Directory prefix for every input table read with pd.read_csv in this notebook.
basepath = "../data/paper_data/"

In [2]:
# Load the preprocessed ATAC tables (values are already log scaled).
def _read_atac_table(filename):
    """Read one tab-separated table under `basepath`, first column as index."""
    return pd.read_csv(basepath + filename, sep="\t", index_col=0)


data_crct_none = _read_atac_table("data_atac_crct_none.tsv")
data_crct_ccomp = _read_atac_table("data_atac_crct_ccomp.tsv")
data_crct_ccomp_no_age = _read_atac_table("data_atac_crct_ccomp_no_age.tsv")
data_crct_age = _read_atac_table("data_atac_crct_age.tsv")

# Sample metadata, indexed by "Subject" (the column itself is kept as well).
meta = pd.read_csv(basepath + "meta_final.tsv", sep="\t").set_index("Subject", drop=False)

# Restrict every table to the samples present in the ccomp-corrected table.
csamps = data_crct_ccomp.index.values
data_crct_none, data_crct_ccomp, data_crct_ccomp_no_age, data_crct_age = (
    table.loc[csamps, :]
    for table in (data_crct_none, data_crct_ccomp, data_crct_ccomp_no_age, data_crct_age)
)

# Cell composition columns of the metadata for the common samples.
ccomp_columns = [
    'Monocytes', 'Granulocytes', 'Lymphocytes', 'B_Cells',
    'NK_Cells', 'T_Cells', 'CD4_T_Cells', 'CD8_T_Cells',
]
data_ccomp_only = meta.loc[csamps, ccomp_columns]

# Age of each common sample is the regression target.
labels = meta.loc[csamps, "Age"]

# Group assignment for cross-validation; make_groups comes from clock_utils
# (its exact semantics are not visible in this notebook).
groups = make_groups(11, labels)

In [3]:
pipes = []

# Model 1: standardise the features, then fit an elastic-net regressor.
scaler_step = ("scaler", StandardScaler())
regressor_step = ("regressor", ElasticNet(max_iter=5000, tol=0.0005))
pipe = Pipeline(steps=[scaler_step, regressor_step])

# Hyper-parameter grid, keyed by "<step name>__<parameter>".
# NOTE(review): the original carried a trailing "#40" after the alpha grid,
# presumably an earlier number of grid points — confirm.
param_grid = {
    "regressor__alpha": np.logspace(-4, 1, 30),
    "regressor__l1_ratio": [0.7, 0.8, 0.9, 0.95, 0.99, 1],
}

In [4]:
# Scoring metric name stored alongside each dataset.
scoring = "neg_median_absolute_error"

# Outer cross-validation: hold out `outer_groups_out` group(s) per fold.
outer_groups_out = 1
cv_outer = LeavePGroupsOut(n_groups=outer_groups_out)

# Materialise the (train, test) index pairs once; they can be applied to all
# datasets because every table was subset to the same samples.
outer_split = [fold for fold in cv_outer.split(X=data_crct_none, y=labels, groups=groups)]

### 1) Train on cell composition only ###

In [9]:
# Create a timestamped (minute resolution) output directory for this run.
date = datetime.now().strftime("%Y-%m-%d_%H-%M")
outpath = "../clocks/parallel/" + date + "_ccomp_only"
os.makedirs(outpath, exist_ok=True)

# Features for this run: the cell composition columns only.
data = data_ccomp_only
vars_to_save = ["data", "labels", "groups", "pipe", "param_grid", "outer_split", "scoring"]

# Persist the listed notebook globals into a dbm.dumb-backed shelf at
# <outpath>/dataset. Using the shelf as a context manager guarantees that it
# (and the dbm object it wraps) is closed even if storing a value raises
# something other than TypeError.
dumbdb = dbm.dumb.open(outpath + "/dataset")
with shelve.Shelf(dumbdb) as dataset:
    for key in vars_to_save:
        try:
            dataset[key] = globals()[key]
            print('Shelved {0}'.format(key))
        except TypeError:
            # Best effort: a value that raises TypeError while being stored
            # (e.g. an object pickle cannot serialize) is reported and skipped.
            print('ERROR shelving: {0}'.format(key))

Shelved data
Shelved labels
Shelved groups
Shelved pipe
Shelved param_grid
Shelved outer_split
Shelved scoring


### 2) Train on uncorrected data ###

In [10]:
# Create a timestamped (minute resolution) output directory for this run.
date = datetime.now().strftime("%Y-%m-%d_%H-%M")
outpath = "../clocks/parallel/" + date + "_crct_none_vs_ccomp"
os.makedirs(outpath, exist_ok=True)

# Features for this run: the uncorrected table.
data = data_crct_none
vars_to_save = ["data", "labels", "groups", "pipe", "param_grid", "outer_split", "scoring"]

# Persist the listed notebook globals into a dbm.dumb-backed shelf at
# <outpath>/dataset. Using the shelf as a context manager guarantees that it
# (and the dbm object it wraps) is closed even if storing a value raises
# something other than TypeError.
dumbdb = dbm.dumb.open(outpath + "/dataset")
with shelve.Shelf(dumbdb) as dataset:
    for key in vars_to_save:
        try:
            dataset[key] = globals()[key]
            print('Shelved {0}'.format(key))
        except TypeError:
            # Best effort: a value that raises TypeError while being stored
            # (e.g. an object pickle cannot serialize) is reported and skipped.
            print('ERROR shelving: {0}'.format(key))

Shelved data
Shelved labels
Shelved groups
Shelved pipe
Shelved param_grid
Shelved outer_split
Shelved scoring


### 3) Train on uncorrected data + cell composition ###

In [13]:
# Create a timestamped (minute resolution) output directory for this run.
date = datetime.now().strftime("%Y-%m-%d_%H-%M")
outpath = "../clocks/parallel/" + date + "_crct_none_plus_ccomp"
os.makedirs(outpath, exist_ok=True)

# Features for this run: cell composition columns joined (on the index) with
# the uncorrected table.
data = data_ccomp_only.join(data_crct_none)
vars_to_save = ["data", "labels", "groups", "pipe", "param_grid", "outer_split", "scoring"]

# Persist the listed notebook globals into a dbm.dumb-backed shelf at
# <outpath>/dataset. Using the shelf as a context manager guarantees that it
# (and the dbm object it wraps) is closed even if storing a value raises
# something other than TypeError.
dumbdb = dbm.dumb.open(outpath + "/dataset")
with shelve.Shelf(dumbdb) as dataset:
    for key in vars_to_save:
        try:
            dataset[key] = globals()[key]
            print('Shelved {0}'.format(key))
        except TypeError:
            # Best effort: a value that raises TypeError while being stored
            # (e.g. an object pickle cannot serialize) is reported and skipped.
            print('ERROR shelving: {0}'.format(key))

Shelved data
Shelved labels
Shelved groups
Shelved pipe
Shelved param_grid
Shelved outer_split
Shelved scoring


### 4) Train on data corrected for cell composition, knowing age ###

In [14]:
# Create a timestamped (minute resolution) output directory for this run.
date = datetime.now().strftime("%Y-%m-%d_%H-%M")
outpath = "../clocks/parallel/" + date + "_crct_ccomp_vs_none"
os.makedirs(outpath, exist_ok=True)

# Features for this run: the table corrected for cell composition (with age
# known during correction, per the section heading).
data = data_crct_ccomp
vars_to_save = ["data", "labels", "groups", "pipe", "param_grid", "outer_split", "scoring"]

# Persist the listed notebook globals into a dbm.dumb-backed shelf at
# <outpath>/dataset. Using the shelf as a context manager guarantees that it
# (and the dbm object it wraps) is closed even if storing a value raises
# something other than TypeError.
dumbdb = dbm.dumb.open(outpath + "/dataset")
with shelve.Shelf(dumbdb) as dataset:
    for key in vars_to_save:
        try:
            dataset[key] = globals()[key]
            print('Shelved {0}'.format(key))
        except TypeError:
            # Best effort: a value that raises TypeError while being stored
            # (e.g. an object pickle cannot serialize) is reported and skipped.
            print('ERROR shelving: {0}'.format(key))

Shelved data
Shelved labels
Shelved groups
Shelved pipe
Shelved param_grid
Shelved outer_split
Shelved scoring


### 5) Train on data corrected for cell composition, not knowing age ###

In [15]:
# Create a timestamped (minute resolution) output directory for this run.
date = datetime.now().strftime("%Y-%m-%d_%H-%M")
outpath = "../clocks/parallel/" + date + "_crct_ccomp_no_age"
os.makedirs(outpath, exist_ok=True)

# Features for this run: the table corrected for cell composition without
# knowledge of age (per the section heading).
data = data_crct_ccomp_no_age
vars_to_save = ["data", "labels", "groups", "pipe", "param_grid", "outer_split", "scoring"]

# Persist the listed notebook globals into a dbm.dumb-backed shelf at
# <outpath>/dataset. Using the shelf as a context manager guarantees that it
# (and the dbm object it wraps) is closed even if storing a value raises
# something other than TypeError.
dumbdb = dbm.dumb.open(outpath + "/dataset")
with shelve.Shelf(dumbdb) as dataset:
    for key in vars_to_save:
        try:
            dataset[key] = globals()[key]
            print('Shelved {0}'.format(key))
        except TypeError:
            # Best effort: a value that raises TypeError while being stored
            # (e.g. an object pickle cannot serialize) is reported and skipped.
            print('ERROR shelving: {0}'.format(key))

Shelved data
Shelved labels
Shelved groups
Shelved pipe
Shelved param_grid
Shelved outer_split
Shelved scoring


### 6) Train on data corrected for age, keeping cell composition effects ###

In [5]:
# Create a timestamped (minute resolution) output directory for this run.
date = datetime.now().strftime("%Y-%m-%d_%H-%M")
outpath = "../clocks/parallel/" + date + "_crct_age"
os.makedirs(outpath, exist_ok=True)

# Features for this run: the table corrected for age, keeping cell
# composition effects (per the section heading).
data = data_crct_age
vars_to_save = ["data", "labels", "groups", "pipe", "param_grid", "outer_split", "scoring"]

# Persist the listed notebook globals into a dbm.dumb-backed shelf at
# <outpath>/dataset. Using the shelf as a context manager guarantees that it
# (and the dbm object it wraps) is closed even if storing a value raises
# something other than TypeError.
dumbdb = dbm.dumb.open(outpath + "/dataset")
with shelve.Shelf(dumbdb) as dataset:
    for key in vars_to_save:
        try:
            dataset[key] = globals()[key]
            print('Shelved {0}'.format(key))
        except TypeError:
            # Best effort: a value that raises TypeError while being stored
            # (e.g. an object pickle cannot serialize) is reported and skipped.
            print('ERROR shelving: {0}'.format(key))

Shelved data
Shelved labels
Shelved groups
Shelved pipe
Shelved param_grid
Shelved outer_split
Shelved scoring


### 7) Correct for cell composition in ncv outer folds ###

In [17]:
# Create a timestamped (minute resolution) output directory for this run.
date = datetime.now().strftime("%Y-%m-%d_%H-%M")
outpath = "../clocks/parallel/" + date + "_ccomp_in_ncv"
os.makedirs(outpath, exist_ok=True)

# Features for this run: the uncorrected table. The cell composition table is
# shelved alongside it so the correction can happen inside the outer folds
# (per the section heading; the consuming code is not part of this notebook).
data = data_crct_none
# Index 0 selects "lm"; change the index to pick one of the other options.
ccomp_regressor = ["lm", "RANSAC", "Huber"][0]
vars_to_save = ["data", "labels", "groups", "pipe", "param_grid", "outer_split", "scoring",
               "data_ccomp_only", "ccomp_regressor"]

# Persist the listed notebook globals into a dbm.dumb-backed shelf at
# <outpath>/dataset. Using the shelf as a context manager guarantees that it
# (and the dbm object it wraps) is closed even if storing a value raises
# something other than TypeError.
dumbdb = dbm.dumb.open(outpath + "/dataset")
with shelve.Shelf(dumbdb) as dataset:
    for key in vars_to_save:
        try:
            dataset[key] = globals()[key]
            print('Shelved {0}'.format(key))
        except TypeError:
            # Best effort: a value that raises TypeError while being stored
            # (e.g. an object pickle cannot serialize) is reported and skipped.
            print('ERROR shelving: {0}'.format(key))

Shelved data
Shelved labels
Shelved groups
Shelved pipe
Shelved param_grid
Shelved outer_split
Shelved scoring
Shelved data_ccomp_only
Shelved ccomp_regressor
