In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%run ../../scripts/model_prediction_analyses.py

## Load the metadata for the reference `V7 combined` composite dataset (GSE42861, GSE125105, GSE72774, GSE106648) and split it into healthy and disease cohorts

In [3]:
# Read the V7 combined metadata once; both cohorts below are derived from this frame.
v7_meta = pd.read_excel('../../data/processed/metadata/V7_pmeta.xlsx')

# Healthy cohort: rows flagged disease == 0, re-indexed from 0 (old index discarded).
h_meta = v7_meta.loc[v7_meta['disease'] == 0].reset_index(drop=True)

# Disease cohort: rows flagged disease == 1, re-indexed from 0 (old index discarded).
d_meta = v7_meta.loc[v7_meta['disease'] == 1].reset_index(drop=True)

## Load the AdaptAge model and the beta values of the healthy cohort of the V7 combined dataset filtered on the AdaptAge model's CpG selection

In [4]:
# Load the AdaptAge model table from CSV (it is passed to prep_model before use)
adapt = pd.read_csv('../../data/processed/models/AdaptAge/AdaptAge.csv')

# Read in the healthy cohort of the V7 combined dataset filtered on the AdaptAge model's CpG selection
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so this file must
# come from a trusted source (here, the project's own processed-data directory).
h_adapt = pd.read_pickle('../../data/processed/models/AdaptAge/Healthy.pkl')

## Get the AdaptAge prediction residuals for the V7 healthy cohort, compute the summary statistics for the residuals and fit a normal distribution to the residuals. All of this data will later be used to generate Figure 2, Supplementary Figure 2 and Table 1

In [5]:
# Prep the AdaptAge model, giving it the right column names and CpG order matching that of the dataset.
# Per prep_model's docstring, the two columns are renamed to 'CpG' and 'Weight' and the rows are sorted by 'CpG'.
# NOTE(review): this rebinds `adapt`, so the raw model table is overwritten; re-running the cell
# presumably yields the same result, but confirm prep_model is safe to apply twice.
adapt=prep_model(adapt)

In [21]:
# Print prep_model's docstring for reference (introspection only; creates no variables used by the analysis)
help(prep_model)

Help on function prep_model in module __main__:

prep_model(model)
    Prepare a model for making predictions.
    
    This function expects a DataFrame with two columns:
    - The first column should contain the CpG sites for the model.
    - The second column should contain the corresponding weights or importance scores.
    
    The function renames the columns to 'CpG' and 'Weight' and sorts the DataFrame
    by the 'CpG' column.
    
    Parameters:
    data (pd.DataFrame): The input DataFrame containing the dataset.
    
    Returns:
    pd.DataFrame: The processed DataFrame with columns renamed and sorted by 'CpG'.



In [6]:
# Get the AdaptAge predictions from the healthy-cohort beta values (h_adapt) and the prepped model (adapt)
# NOTE(review): adapt_preds is not referenced by the following cells, which pass the data and model
# to get_residuals directly; confirm whether it is used further down or kept only for inspection.
adapt_preds = get_preds(h_adapt, adapt)

In [7]:
# Calculate the AdaptAge residuals for the healthy cohort from the beta values, the cohort metadata,
# the prepped model and the model's name label
# NOTE(review): the sign convention of the residuals is defined inside get_residuals, which is not
# shown here; confirm it before interpreting the sign of the mean residual.
adapt_resids = get_residuals(h_adapt, h_meta, adapt,'AdaptAge')

In [8]:
# Calculate the mean (mu) and standard deviation (std) of the residuals.
# Both values feed the cutoff calculation and the fitted distribution; mu is also reported in the summary statistics.
mu, std = get_mu_std(adapt_resids)

In [9]:
# Calculate the thresholds for the 95% interpercentile range (95-IPR) of a normal distribution fit to the residuals
# cutoffs is later read as cutoffs[0] and cutoffs[1] (presumably the lower and upper
# thresholds, in that order; TODO confirm against get_cutoffs).
cutoffs = get_cutoffs(mu, std)

In [10]:
# Generate the PDF for the AdaptAge residuals for the V7 healthy cohort,
# using the residuals together with their mean and standard deviation
h_adapt_dist = create_residual_distribution(adapt_resids, mu, std, 'AdaptAge')

In [23]:
# Compile the summary statistics for the AdaptAge residuals for the V7 healthy cohort:
# a one-element list holding a (model name, cutoffs[0], cutoffs[1], mean rounded to 2 decimals) tuple
# NOTE(review): presumably this mirrors the stats structure returned by model_errs_and_dist; confirm.
h_adapt_stats = [('AdaptAge',cutoffs[0], cutoffs[1], round(mu, 2))]

## The `model_errs_and_dist` function performs all of the above steps. Use it to generate the distribution and summary statistics for the other models

In [15]:
# Model tables for the remaining clocks. Horvath and Hannum ship as Excel workbooks;
# PhenoAge, DamAge and CausAge ship as CSV files.
horvath, hannum = (
    pd.read_excel('../../data/processed/models/Horvath/Horvath model.xlsx'),
    pd.read_excel('../../data/processed/models/Hannum/Hannum model.xlsx'),
)
pheno, dam, caus = (
    pd.read_csv('../../data/processed/models/PhenoAge/PhenoAge.csv'),
    pd.read_csv('../../data/processed/models/DamAge/DamAge.csv'),
    pd.read_csv('../../data/processed/models/CausAge/CausAge.csv'),
)

# Healthy cohort of the V7 combined dataset, one frame per model, each filtered on that
# model's CpG selection.
h_horvath, h_hannum, h_pheno, h_dam, h_caus = (
    pd.read_pickle('../../data/processed/models/Horvath/Healthy.pkl'),
    pd.read_pickle('../../data/processed/models/Hannum/Healthy.pkl'),
    pd.read_pickle('../../data/processed/models/PhenoAge/Healthy.pkl'),
    pd.read_pickle('../../data/processed/models/DamAge/Healthy.pkl'),
    pd.read_pickle('../../data/processed/models/CausAge/Healthy.pkl'),
)

In [18]:
# Patient (disease) cohort of the V7 combined dataset, one frame per model, each filtered
# on that model's CpG selection.
d_horvath, d_hannum, d_pheno = (
    pd.read_pickle('../../data/processed/models/Horvath/Disease.pkl'),
    pd.read_pickle('../../data/processed/models/Hannum/Disease.pkl'),
    pd.read_pickle('../../data/processed/models/PhenoAge/Disease.pkl'),
)
d_adapt, d_dam, d_caus = (
    pd.read_pickle('../../data/processed/models/AdaptAge/Disease.pkl'),
    pd.read_pickle('../../data/processed/models/DamAge/Disease.pkl'),
    pd.read_pickle('../../data/processed/models/CausAge/Disease.pkl'),
)

In [16]:
# Get the summary statistics and distributions for the healthy cohorts of the other models.
# Each call receives the model-specific healthy beta values, the healthy-cohort metadata,
# the model table and the model's name label, and returns a (stats, fit) pair.
# The name label must match the model being evaluated so that its results are labelled correctly.
h_horvath_stats, h_horvath_fit = model_errs_and_dist(h_horvath, h_meta, horvath, 'Horvath')
h_hannum_stats, h_hannum_fit = model_errs_and_dist(h_hannum, h_meta, hannum, 'Hannum')
h_pheno_stats, h_pheno_fit = model_errs_and_dist(h_pheno, h_meta, pheno, 'PhenoAge')
h_dam_stats, h_dam_fit = model_errs_and_dist(h_dam, h_meta, dam, 'DamAge')
h_caus_stats, h_caus_fit = model_errs_and_dist(h_caus, h_meta, caus, 'CausAge')

In [19]:
# Get the summary statistics and distributions for the patient cohorts of the other models.
# Each call receives the model-specific patient beta values, the disease-cohort metadata,
# the model table and the model's name label, and returns a (stats, fit) pair.
# The name label must match the model being evaluated so that its results are labelled correctly.
d_horvath_stats, d_horvath_fit = model_errs_and_dist(d_horvath, d_meta, horvath, 'Horvath')
# Keep the previously used (misspelled) names bound to the same objects so that any
# cell still referring to them continues to work.
d_horvatd_stats, d_horvatd_fit = d_horvath_stats, d_horvath_fit
d_hannum_stats, d_hannum_fit = model_errs_and_dist(d_hannum, d_meta, hannum, 'Hannum')
d_pheno_stats, d_pheno_fit = model_errs_and_dist(d_pheno, d_meta, pheno, 'PhenoAge')
d_dam_stats, d_dam_fit = model_errs_and_dist(d_dam, d_meta, dam, 'DamAge')
d_caus_stats, d_caus_fit = model_errs_and_dist(d_caus, d_meta, caus, 'CausAge')