In [None]:

import pandas as pd
import numpy as np
import os
import sys 
from tqdm import tqdm




Albumin 30600
Creatinine 30700
Glucose 30740
C-Reactive Protein 30710
Lymphocyte Percentage 30120
Mean Cell Volume 30040 (TODO: confirm this field ID; the UKB showcase names it "Mean corpuscular volume")
Red Cell Distribution Width 30070
Alkaline Phosphatase 30610
White Blood Cell Count 30000
Chronological Age 21003

In [None]:

# Maps each raw column name used from the extract to a readable label.
# Keys are kept in the order the columns are later selected in.
id_map = dict([
    ('30600-0.0', 'Albumin'),
    ('30700-0.0', 'Creatinine'),
    ('30740-0.0', 'Glucose'),
    ('30710-0.0', 'C-Reactive Protein'),
    ('30120-0.0', 'Lymphocyte Percentage'),
    ('30040-0.0', 'Mean Cell Volume'),
    ('30070-0.0', 'Red Cell Distribution Width'),
    ('30610-0.0', 'Alkaline Phosphatase'),
    ('30000-0.0', 'White Blood Cell Count'),
    ('21003-0.0', 'Chronological Age'),
    ('53-0.0', 'Date of Attendance'),
])

# Columns to pull from the extract: the participant id followed by every mapped field.
cols = ['eid'] + list(id_map)


In [None]:
# Path to the parquet extract. Left empty here; it must be set before running.
ukb_data_path = ''

# The ten numeric inputs (nine biomarkers + age) that must be present and numeric.
columns_to_convert = [
    'Albumin', 'Creatinine', 'Glucose', 'C-Reactive Protein',
    'Lymphocyte Percentage', 'Mean Cell Volume', 'Red Cell Distribution Width',
    'Alkaline Phosphatase', 'White Blood Cell Count', 'Chronological Age'
]

# Read only the needed columns instead of materialising the whole extract;
# the trailing [cols] keeps the column order explicit.
df = pd.read_parquet(ukb_data_path, columns=cols)[cols]

# rename() returns a new frame, so the assignments below never write into a
# slice of `df` (which is what triggered SettingWithCopyWarning before).
processed_df = df.rename(columns=id_map)

# Coerce to numeric BEFORE dropping missing rows: errors='coerce' turns any
# unparsable value into NaN, and those rows must be removed as well.
for column in columns_to_convert:
    processed_df[column] = pd.to_numeric(processed_df[column], errors='coerce')

# Keep only participants with all ten inputs available
# ('Date of Attendance' is deliberately not part of this check).
processed_df = processed_df.dropna(subset=columns_to_convert).copy()


QC

In [None]:
def filter_percentiles(df, lower_percentile=0.01, upper_percentile=0.99, columns= columns_to_convert[:-1]):
    """
    Trim rows whose value falls outside a quantile band, one column at a time.

    The columns are processed sequentially: the bounds for each column are
    computed on the rows that survived the previous columns, and rows outside
    [lower bound, upper bound] (both inclusive) are discarded. Rows holding
    NaN in a processed column are discarded too, because the range check is
    false for NaN.

    Parameters:
    - df (pd.DataFrame): The input DataFrame; it is copied, not modified.
    - lower_percentile (float): Lower quantile, as a fraction (default 0.01).
    - upper_percentile (float): Upper quantile, as a fraction (default 0.99).
    - columns (list): Columns to trim on. The default is
      `columns_to_convert[:-1]` (every entry except the last), evaluated once
      when the function is defined.

    Returns:
    - pd.DataFrame: The filtered DataFrame.
    """
    kept = df.copy()
    for name in columns:
        values = kept[name]
        low = values.quantile(lower_percentile)
        high = values.quantile(upper_percentile)
        # Series.between is inclusive on both ends by default.
        kept = kept[values.between(low, high)]
    return kept
# Apply the percentile trimming with its default thresholds and default column list.
# NOTE: processed_df is overwritten with its own filtered version, so re-running
# this cell trims the data again on freshly computed quantiles.
processed_df = filter_percentiles(processed_df)
# Row count remaining after QC (displayed as the cell output).
len(processed_df)


In [None]:
# Same ten names as `columns_to_convert`, kept for the optional imputation step below.
columns = [
    # blood biomarkers
    'Albumin',
    'Creatinine',
    'Glucose',
    'C-Reactive Protein',
    'Lymphocyte Percentage',
    'Mean Cell Volume',
    'Red Cell Distribution Width',
    'Alkaline Phosphatase',
    'White Blood Cell Count',
    # age
    'Chronological Age',
]

# Mean imputation of missing values is currently disabled:
# processed_df[columns] = processed_df[columns].fillna(processed_df[columns].mean())

In [None]:
# PhenoAge from nine blood biomarkers plus chronological age
def calculate_phenoage(albumin, creatinine, glucose, crp, lymphocyte_percent,
                       mean_corpuscular_volume, red_cell_distribution_width,
                       alkaline_phosphatase, white_blood_cell_count, age):
    """
    Compute PhenoAge from a fixed linear score of the ten inputs.

    The inputs may be scalars or array-likes that support numpy arithmetic.
    `crp` is multiplied by 0.1 (mg/L to mg/dL) and log-transformed before it
    enters the score. The units of the other inputs are not checked here;
    presumably they must match the units the coefficients were fitted on
    (TODO confirm against the source publication).

    Returns:
    - The PhenoAge value(s): 141.50225 + log(-0.00553 * g) / 0.09165, where
      g = (-1.51714 * exp(xb)) / 0.0076927 and xb is the linear score.
      Non-positive `crp` yields -inf/NaN through the logarithm.
    """
    log_crp_mg_dl = np.log(crp * 0.1)  # (mg/L to mg/dL), then natural log

    # (coefficient, value) pairs, accumulated in this exact order on top of the intercept.
    weighted_terms = (
        (-0.0336, albumin),
        (0.0095, creatinine),
        (0.1953, glucose),
        (0.0954, log_crp_mg_dl),
        (-0.0120, lymphocyte_percent),
        (0.0268, mean_corpuscular_volume),
        (0.3306, red_cell_distribution_width),
        (0.00188, alkaline_phosphatase),
        (0.0554, white_blood_cell_count),
        (0.0804, age),
    )
    xb = -19.907
    for coefficient, value in weighted_terms:
        xb = xb + coefficient * value

    # g is used directly rather than first building M = 1 - exp(g) and then
    # taking log(1 - M); the two forms are algebraically the same.
    g = (-1.51714 * np.exp(xb)) / 0.0076927
    log_term = np.log(-0.00553 * g)
    return 141.50225 + log_term / 0.09165

In [None]:
# calculate_phenoage only uses numpy element-wise operations, so it can be fed
# whole columns at once instead of being called row by row through
# DataFrame.apply(axis=1).
# Work on an explicit copy: processed_df comes out of a boolean filter, and
# adding columns to such a slice raises SettingWithCopyWarning.
processed_df = processed_df.copy()
processed_df['PhenoAge'] = calculate_phenoage(
    processed_df['Albumin'],
    processed_df['Creatinine'],
    processed_df['Glucose'],
    processed_df['C-Reactive Protein'],
    processed_df['Lymphocyte Percentage'],
    processed_df['Mean Cell Volume'],
    processed_df['Red Cell Distribution Width'],
    processed_df['Alkaline Phosphatase'],
    processed_df['White Blood Cell Count'],
    processed_df['Chronological Age'],
)

# Age acceleration: PhenoAge minus chronological age.
processed_df['diff'] = processed_df['PhenoAge'] - processed_df['Chronological Age']

# Make sure the output folder exists so a fresh run does not fail on the write.
os.makedirs('data', exist_ok=True)
processed_df.to_csv('data/ukb_phenoage_nomissing.csv', index=False)

In [None]:
# Cause-of-death columns: participant id, primary cause, secondary cause.
# NOTE(review): only the first instance of the secondary-cause field
# ('40002-0.0') is used; if the extract holds further instances they are
# ignored here. Confirm this is intended.
death_cause = ['eid', '40001-0.0', '40002-0.0']

# Read just these three columns rather than the entire extract.
df = pd.read_parquet(ukb_data_path, columns=death_cause)

# rename() returns a new frame, so the flag columns below are not written
# into a slice of `df`.
df_cause = df[death_cause].rename(
    columns={'40001-0.0': 'Primary Cause', '40002-0.0': 'Secondary Cause'})

# Compare on the string form so missing values simply never match a prefix.
primary_code = df_cause['Primary Cause'].astype(str)
secondary_code = df_cause['Secondary Cause'].astype(str)

# Flag for cancer: either cause code starts with 'C'
# (presumably ICD-10 coded causes; confirm against the data dictionary).
df_cause['Cancer_Flag'] = (
    primary_code.str.startswith('C', na=False)
    | secondary_code.str.startswith('C', na=False)
).astype(int)

# Flag for cardiovascular diseases: either cause code starts with 'I'
df_cause['Cardio_Flag'] = (
    primary_code.str.startswith('I', na=False)
    | secondary_code.str.startswith('I', na=False)
).astype(int)


Add label information (death event and follow-up time)

In [None]:
# Death register, keyed on 'patid' like the rest of the label tables.
# NOTE(review): the date format of 'date_of_death' is not visible here; if it
# is day-first, pd.to_datetime below needs dayfirst=True or an explicit format.
ukb_death = pd.read_csv('ukb/death.txt', sep='\t').rename(columns={'eid': 'patid'})

# Left-join so participants without a death record are kept (date_of_death = NaT).
df = processed_df.rename(columns={'eid': 'patid'})
df_new = pd.merge(df, ukb_death, how='left', on='patid')

for date_col in ('Date of Attendance', 'date_of_death'):
    df_new[date_col] = pd.to_datetime(df_new[date_col])

# Follow-up ends 10 years after attendance or at death, whichever comes first.
ten_years_on = df_new['Date of Attendance'] + pd.DateOffset(years=10)
df_new['end_followup_inter'] = ten_years_on
df_new['end_followup'] = df_new[['end_followup_inter', 'date_of_death']].min(axis=1)

# event = 1 when a death date exists and it is not after the end of follow-up.
has_death_date = df_new['date_of_death'].notna()
death_in_window = df_new['date_of_death'] <= df_new['end_followup']
df_new['event'] = np.where(has_death_date & death_in_window, 1, 0)

In [None]:
# Follow-up time in days: attendance -> death for events, attendance -> end of
# follow-up otherwise.
df_new['time2death'] = np.where(
    df_new['event'] == 1,
    (df_new['date_of_death'] - df_new['Date of Attendance']).dt.days,
    (df_new['end_followup'] - df_new['Date of Attendance']).dt.days,
)

# Rebind df_cause instead of mutating it in place: it was created as a column
# subset of another frame, and in-place edits on it raise SettingWithCopyWarning.
# Both steps are no-ops when the cell is run a second time.
df_cause = df_cause.rename(columns={'eid': 'patid'})
df_cause = df_cause.astype({'patid': int})

flag_cols = ['Cancer_Flag', 'Cardio_Flag']

# Drop flags left over from a previous run of this cell; otherwise the merge
# would produce *_x / *_y columns and the lookups below would raise KeyError.
df_new = (
    df_new
    .drop(columns=flag_cols, errors='ignore')
    .merge(df_cause[['patid'] + flag_cols], on='patid', how='left')
)

# Remove cases after follow-up: a cause-of-death flag only counts when the
# death itself is an event inside the follow-up window.
no_event = df_new['event'] == 0
for flag in flag_cols:
    df_new.loc[(df_new[flag] == 1) & no_event, flag] = 0

In [None]:
# Persist the death labels: id, death date, event indicator, follow-up days, cause flags.
death_label_cols = ['patid', 'date_of_death', 'event', 'time2death', 'Cancer_Flag', 'Cardio_Flag']
df_new.loc[:, death_label_cols].to_csv('data/ukb_death.csv', index=False)

Disease Label

In [None]:
# Disease incidence table, keyed on integer 'patid'.
disease = pd.read_parquet('EHR_age/UKB_incidence.parquet')
disease['patid'] = disease['patid'].astype(int)

# Take the disease columns BEFORE merging the death labels, so the selection
# does not depend on how many columns the merge appends.
# Assumes the first 7 columns of the incidence table are not disease columns
# (TODO confirm against the table schema).
disease_cols = disease.columns[7:].to_list()

# Attach the death labels computed above (used later for death censoring).
disease = disease.merge(df_new[['patid', 'date_of_death', 'event', 'time2death']], on='patid', how='left')

# Read back the PhenoAge table this notebook writes. The previous path,
# 'data/ukb_phenoage.csv', is not produced anywhere in the visible notebook
# and may refer to a different cohort than the death labels.
# NOTE(review): revert if a separate 'data/ukb_phenoage.csv' is really intended.
df = pd.read_csv('data/ukb_phenoage_nomissing.csv')

In [None]:
# Join the PhenoAge table with the disease table (which already carries the death labels).
df = df.rename(columns={'eid': 'patid'})
df_new = pd.merge(df, disease, how='left', on='patid')
print(len(df_new))

# Baseline date and the 10-year horizon derived from it.
attendance_date = pd.to_datetime(df_new['Date of Attendance'])
df_new['Date of Attendance'] = attendance_date
df_new['end_followup_inter'] = attendance_date + pd.DateOffset(years=10)

# add date_of_death, new 5th Nov 2025: follow-up ends at death when that comes first
followup_candidates = df_new[['end_followup_inter', 'date_of_death']]
df_new['end_followup'] = followup_candidates.min(axis=1)

In [None]:
# Per-disease summary accumulators (one entry per disease column).
diseases_events = []      # disease name
disease_whole = []        # number of participants at risk
disease_number = []       # number of events inside follow-up
disease_percantage = []   # events / at-risk participants (a fraction, not a percent)

# Make sure the output folder exists so a fresh run does not fail on the first write.
os.makedirs('data/disease_ukb', exist_ok=True)

for dd in disease_cols:
    # Derive the output column names by slicing the date column name.
    # Assumes a 5-character suffix such as '_date' (TODO confirm).
    disease_e = dd[:-4] + 'event'
    disease_t = 'time2_' + dd[:-5]
    diseases_events.append(disease_e[:-6])

    df_new[dd] = pd.to_datetime(df_new[dd])

    # Exclude participants whose disease date is before attendance. Rows with
    # no disease date are kept. Copy so the new columns are written to an
    # independent frame, not a slice of df_new.
    at_risk = ~(df_new[dd] < df_new['Date of Attendance'])
    df_new_to_analyses = df_new.loc[at_risk].copy()
    n_at_risk = len(df_new_to_analyses)
    disease_whole.append(n_at_risk)

    onset = df_new_to_analyses[dd]
    baseline = df_new_to_analyses['Date of Attendance']
    followup_end = df_new_to_analyses['end_followup']

    # Event: a disease date exists and is not after the end of follow-up.
    has_event = onset.notna() & (onset <= followup_end)
    df_new_to_analyses[disease_e] = np.where(has_event, 1, 0)
    # Time in days to the disease date for events, to the end of follow-up otherwise.
    df_new_to_analyses[disease_t] = np.where(
        has_event,
        (onset - baseline).dt.days,
        (followup_end - baseline).dt.days)

    df_save = df_new_to_analyses[['patid', disease_e, disease_t, 'event', 'time2death']]
    # add death censoring
    df_save.to_csv(f'data/disease_ukb/{disease_e}_death_censored.csv', index=False)

    n_events = df_save[disease_e].sum()
    disease_number.append(n_events)
    # Guard the empty at-risk set explicitly instead of relying on a 0/0 warning.
    disease_percantage.append(n_events / n_at_risk if n_at_risk else float('nan'))


In [None]:
# One row per disease: name, event count, event fraction, and at-risk population size.
summary_columns = {
    'disease_name': diseases_events,
    'disease_number': disease_number,
    'disease_percantage': disease_percantage,
    'whole population': disease_whole,
}
disease_summary = pd.DataFrame(summary_columns)
disease_summary.to_csv('results/ukb_disease_summary_death_censor.csv', index=False)
