In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%run ../../scripts/mfss_model_train.py
%run ../../scripts/utils/data_processing.py

## Load train and test data

In [None]:
# Base directory for all processed inputs; every file path in this notebook is
# built by appending to this string, so it must keep its trailing slash.
data_path = '../dnam-clocks/mFSS-clock/data/processed/'

In [None]:
# Training cohort: the Hannum dataset (GSE40279) and its accompanying metadata sheet.
hannum_pickle_path = data_path + 'GSE40279_hannum/hannum32.pkl'
hannum_meta_path = data_path + 'GSE40279_hannum/hannum_pmeta.xlsx'

# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
hannum = pd.read_pickle(hannum_pickle_path)
hannum_meta = pd.read_excel(hannum_meta_path)

In [None]:
# Test cohort: the Lehne dataset (GSE55763).
# The matrix is stored as a bare NumPy array, so its column labels (CpG names)
# are read from a separate file and attached when building the DataFrame.
lehne_cgs = list_reader(data_path + 'GSE55763_lehne/cgs')
lehne = pd.DataFrame(
    np.load(data_path + 'GSE55763_lehne/lehne32_reduced.npy'),
    columns=lehne_cgs,
)
lehne_meta = pd.read_excel(data_path + 'GSE55763_lehne/lehne_pmeta.xlsx')

### Get the chronological age correlation of every individual CpG, using the GSE40279 dataset as a reference (requires downloading the processed GSE40279 dataset, `hannum32.pkl`)

In [4]:
## Uncomment the two lines below to recompute the age correlations from the Hannum data
# age_corrs = get_age_corrs(hannum, hannum_meta)
# age_corrs = age_corrs.reset_index(drop=True)

### Or read in the age correlations if you already have 'Hannum_age_corrs.xlsx' stored in the 'cpg_sets' directory

In [5]:
# Load the previously stored age correlations instead of recomputing them.
age_corrs_path = data_path + 'cpg_sets/Hannum_age_corrs.xlsx'
age_corrs = pd.read_excel(age_corrs_path)

### Filter out SNP-associated CpGs using the 450K manifest

In [None]:
# Load the 450K manifest; low_memory=False makes pandas read the file in one pass
# rather than in chunks.
manifest = pd.read_csv(data_path + 'metadata/450K_manifest.csv', low_memory=False)

# A probe counts as SNP-associated when its Probe_SNPs entry is not null.
snps = set(manifest.loc[manifest.Probe_SNPs.notna(), 'Name'])

# Keep only the age-correlation rows whose CpG is not in the SNP-associated set.
no_snps = list(set(age_corrs['CpG']).difference(snps))
age_corrs = age_corrs[age_corrs['CpG'].isin(no_snps)]

### GSE55763 is missing some CpGs, so remove the rows for those CpGs from age_corrs

In [6]:
# CpGs that appear both as columns of the Lehne matrix and in the age-correlation table.
overlap = list(set(lehne.columns) & set(age_corrs['CpG']))

# Restrict the table to those shared CpGs and renumber the rows from 0.
# NOTE(review): reset_index() without drop=True keeps the previous row labels as an
# extra 'index' column — confirm whether anything downstream relies on it.
keep_rows = age_corrs['CpG'].isin(overlap)
filtered_age_corrs = age_corrs[keep_rows].reset_index()

## Train an mFSS OLS model on the top 10,000 CpGs by strength of chronological age correlation.

In [None]:
# Candidate features: the first 10,000 CpGs listed in filtered_age_corrs.
# NOTE(review): these are the "top" CpGs by age correlation only if the table is
# already ordered by correlation strength; no sort is visible in the cells above —
# confirm against get_age_corrs / the stored spreadsheet.
n_top_cpgs = 10000
top_cpgs = filtered_age_corrs['CpG'].tolist()[:n_top_cpgs]

# Fit on Hannum and evaluate on Lehne.
# NOTE(review): the meaning of the positional 150 and of flag=False is set by
# mfss_ols (presumably defined in scripts/mfss_model_train.py) — confirm there.
(model_cgs, best_iter, val_mse, val_r_val, test_mse, test_r_val) = mfss_ols(
    top_cpgs, hannum, hannum_meta, lehne, lehne_meta, 150, flag=False
)