In [None]:
## An example of generating the original EN, then removing 1% of non-clock CpGs and creating a new EN model.
 
#import the necessary Libraries for this program
import random

import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
#Read the raw Hannum methylation table from its pickle file.
#NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
hannum_raw = pd.read_pickle(
    'MethylAndAges/Hannum raw.pkl',
)

In [None]:
#Read the ages that belong to each GSM and relabel the first column as 'Age'.
#The rename is applied to the freshly loaded frame, so `ages` is bound only once.
ages = pd.read_pickle(
    'MethylAndAges/Hannum ages.pkl',
).pipe(lambda frame: frame.rename(columns={frame.columns[0]: 'Age'}))

In [None]:
#Transpose our data such that each row is a GSM. This is needed to allow Standardization with StandardScaler
#The column axis is labelled "CpG" because later cells look the names up through a "CpG" column.
#rename_axis returns a new frame instead of setting `.name` on the transposed frame's
#column Index, which may be the same object as hannum_raw's index.
hannum_test = hannum_raw.T.rename_axis(columns="CpG")

In [None]:
#Hold out 20% of the samples as a test subset; the fixed random_state keeps the split repeatable.
(
    methyl_raw_train,
    methyl_raw_test,
    age_train,
    age_test,
) = train_test_split(
    hannum_test,
    ages,
    random_state=42,
    test_size=0.2,
)

In [None]:
#Scale data such that the fit is to the training set
#The scaler is fitted on the training samples only and then applied to both subsets,
#so no statistics from the test samples enter the standardization.
scaler = StandardScaler().fit(methyl_raw_train)
methyl_train = scaler.transform(methyl_raw_train)
methyl_test = scaler.transform(methyl_raw_test)

In [None]:
#Configure the cross-validated Elastic Net used as the original clock model.
elastic_netCV_original = ElasticNetCV(
    l1_ratio=0.5,
    n_alphas=50,
    cv=10,
    n_jobs=1,
    random_state=42,
    max_iter=5000,
    tol=0.001,
    selection='cyclic',
)
#Fit the model on the standardized training methylation values against the training ages.
elastic_netCV_original.fit(methyl_train, age_train)

In [None]:
#Get the non-zero coefficients to get the significant cpgs.
#One row per feature; the row index is the positional index of the CpG column.
coeffs_original = pd.DataFrame(elastic_netCV_original.coef_)
#Keep only the rows whose coefficient is different from zero.
coeffs_original = coeffs_original[(coeffs_original.T != 0).any()]
coeffs_original = coeffs_original.rename(columns={coeffs_original.columns[0]: 'Magnitude'})

In [None]:
#Get significant CpGs and their indices
#colnames holds every CpG name in column order, so the positional index kept in
#coeffs_original selects the names of the CpGs with a non-zero coefficient.
colnames = pd.DataFrame(hannum_test.columns)
sig_cpgs_original = colnames.iloc[coeffs_original.index]

In [None]:
# Create a list of all non clock CpGs
# A CpG is "non clock" when its name is not one of the significant (clock) CpG names.
# isin compares whole names exactly in a single pass. The previous per-CpG
# str.contains filter treated each name as a regular expression / substring and left
# nonsig_cpgs_original undefined when there were no clock CpGs at all.
is_clock_cpg = colnames["CpG"].isin(sig_cpgs_original["CpG"])
nonsig_cpgs_original = colnames[~is_clock_cpg]

In [None]:
#Find number to be removed
#len_nonsig_original was never assigned before use; it is the number of non clock CpGs.
len_nonsig_original = len(nonsig_cpgs_original)
num_removed = int(len_nonsig_original*0.01)

#Generate CpGs to be removed
#A dedicated, seeded generator makes the selection repeatable on every run without
#touching the global random state (42 matches the random_state used elsewhere here).
rng = random.Random(42)
list_removed = rng.sample(range(0, len_nonsig_original), num_removed)
cpgs_removed_1 = nonsig_cpgs_original.iloc[list_removed]

In [None]:
#Build a copy of the data set that no longer contains the randomly selected CpG columns.
hannum_test_1 = hannum_test.drop(columns=cpgs_removed_1["CpG"].to_list())

In [None]:
#ensure that all original clock CpGs remain in the new set
#The overlap is computed inline: the intersection() helper is only defined further
#down, so calling it here fails when the cells are run top to bottom on a fresh kernel.
clock_cpgs = sig_cpgs_original["CpG"].to_list()
clock_cpg_lookup = set(clock_cpgs)  # set membership keeps the scan linear
common = [cpg for cpg in hannum_test_1.columns.to_list() if cpg in clock_cpg_lookup]

if not sorted(common) == sorted(clock_cpgs):
    raise ValueError('Some Significant CpGs lost!')

In [None]:
#Repeat the 80/20 split on the reduced data set, using the same random_state as before.
(
    methyl_raw_train,
    methyl_raw_test,
    age_train,
    age_test,
) = train_test_split(hannum_test_1, ages, random_state=42, test_size=0.2)

#Standardize both subsets with a scaler that is fitted on the training subset only.
scaler = StandardScaler()
scaler.fit(methyl_raw_train)
methyl_train, methyl_test = (
    scaler.transform(subset) for subset in (methyl_raw_train, methyl_raw_test)
)

In [None]:
#Create Elastic Net model
#Same settings as the original model apart from n_jobs, which only controls how many
#cross-validation fits run in parallel.
elastic_netCV_1 = ElasticNetCV(l1_ratio = 0.5, n_alphas = 50, cv = 10, n_jobs=11, random_state = 42, max_iter=5000, tol = 0.001, selection='cyclic')

#Train the model.
#The target is age_train, produced by the split above; Y_train was never defined.
elastic_netCV_1.fit(methyl_train, age_train)

In [None]:
#an example of the analysis which generates the data for part of figure 3A


#import libraries used in this file
import pandas as pd
from joblib import dump, load

In [None]:
#When given two lists as inputs this function finds all of the common items between the two lists and returns
#the results as a list.
def intersection(lst1, lst2):
    """Return the items of lst1 that also occur in lst2.

    The order and any duplicates of lst1 are preserved.

    Parameters:
        lst1: list whose items are tested.
        lst2: list the items are looked up in.

    Returns:
        A new list with every item of lst1 that is present in lst2.
    """
    try:
        # A set makes each membership test constant time instead of a scan of lst2.
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable items (e.g. lists) cannot go through a set; compare by scanning.
        return [value for value in lst1 if value in lst2]

#When given two lists as inputs this function finds all of the items in list1, which are not also in list two,
#and returns these items as a list.
def loss(lst1, lst2):
    """Return the items of lst1 that do not occur in lst2.

    The order and any duplicates of lst1 are preserved.

    Parameters:
        lst1: list whose items are tested.
        lst2: list the items are looked up in.

    Returns:
        A new list with every item of lst1 that is absent from lst2.
    """
    try:
        # A set makes each membership test constant time instead of a scan of lst2.
        lookup = set(lst2)
        return [value for value in lst1 if value not in lookup]
    except TypeError:
        # Unhashable items (e.g. lists) cannot go through a set; compare by scanning.
        return [value for value in lst1 if value not in lst2]

In [None]:
#Here we load models with 1-10% of Nonsig removed and find the desired quantities
#Every step below depends on the loop variable i, so all of them live in the body of
#one loop (a loop body cannot be spread over several notebook cells).
for i in range(1, 11):

    #Load the Elastic Net model and Dataset with the desired percent of nonsig CpGs removed
    enet = load('elastic_netCV_Hannum_' + str(i) + '_i5000.joblib')
    dataset = pd.read_pickle('hannum_' + str(i) + '% nonsig removed.pkl')

    #Get the non-zero coefficients to get the significant cpgs.
    coeffs_set = pd.DataFrame(enet.coef_)
    coeffs_set = coeffs_set[(coeffs_set.T != 0).any()]
    coeffs_set = coeffs_set.rename(columns={coeffs_set.columns[0]: 'Magnitude'})

    #Get significant CpGs and their indices
    #NOTE(review): the "CpG" lookups below assume dataset.columns is named "CpG" - confirm for the saved pickles.
    colnames_set = pd.DataFrame(dataset.columns)
    sig_cpgs_set = colnames_set.iloc[coeffs_set.index]

    #find clock CpGs similar between the original clock and the new dataset
    common = intersection(sig_cpgs_set["CpG"].to_list(), sig_cpgs_original["CpG"].to_list())

    print("The model with " + str(i) + " % of nonsig CpGs removed has " + str(len(sig_cpgs_set["CpG"].to_list())) + " Clock CpGs.")
    print("The model with " + str(i) + " % of nonsig CpGs removed has " + str(len(common)) + " Clock CpGs in common with the original EN")

    #find clock CpGs from the original Clock not included in the new set
    lost_cpg = loss(sig_cpgs_original["CpG"].to_list(), sig_cpgs_set["CpG"].to_list() )
    print("The model with " + str(i) + " % of nonsig CpGs removed does not use " + str(len(lost_cpg)) + " of the Clock CpGs in the original model")