In [2]:
import pandas as pd
import numpy as np 
import copy
from tqdm import tqdm
from scipy.stats import norm 
import matplotlib.pyplot as plt 
%matplotlib inline

# Calculating BMI z-score using Reference Chart from CDC
ref: https://www.cdc.gov/growthcharts/extended-bmi-data-files.htm

In [3]:
def get_reference_params(subj_age:float, Ref_Chart: pd.DataFrame): 
    for i in range(len(Ref_Chart) - 1): 
        start = Ref_Chart['agemos'][i]
        end = Ref_Chart['agemos'][i+1]
        if subj_age >= start and subj_age < end: 
            if subj_age <= start + 0.5: 
                reference_params = Ref_Chart.iloc[i]
            elif subj_age > start + 0.5: 
                reference_params = Ref_Chart.iloc[i+1]
        else: 
            pass 
    return reference_params  


def sds_calculator(BMI, L, M, S): 
    numerator = (BMI / M) ** L -1   # 분자 
    denominator = L * S     # 분모
    return numerator / denominator


def calculating_subj_BMIsds(subj: pd.Series, Ref_Chart: pd.DataFrame) -> float: 
    age = subj['age']
    BMI = subj['BMI']
    sex = subj['sex']
    sex_Ref_Chart = Ref_Chart[Ref_Chart['sex'] == sex].reset_index(drop=True)
    reference_params = get_reference_params(subj_age=age, Ref_Chart=sex_Ref_Chart)
    subj_BMIsds = sds_calculator(BMI=BMI, L=reference_params['L'], M=reference_params["M"], S=reference_params["S"])
    
    P5 = reference_params['P5']
    P85 = reference_params['P85']

    if BMI <= P5: 
        status = "underweight"
    elif BMI > P5 and BMI < P85: 
        status = "normal"
    elif BMI >= P85: 
        status = "overweight"
    return subj_BMIsds, status



## Baseline Year

In [6]:
Ref_Chart_dir = '/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/bmi-age-2022.csv'
Ref_Chart = pd.read_csv(Ref_Chart_dir)

phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype[['subjectkey','age', 'sex', 'BMI']] 
phenotype_tmp = phenotype_tmp.dropna(axis=0)

# calculating BMI sds of each subject
BMIsds = []
BMIstatus = [] 

for i in tqdm(range(len(phenotype_tmp))): 
    subj_BMIsds, status = calculating_subj_BMIsds(subj = phenotype_tmp.iloc[i], Ref_Chart=Ref_Chart)
    BMIsds.append(subj_BMIsds)
    BMIstatus.append(status)
    

phenotype_tmp['BMI_status'] = np.array(BMIstatus)
phenotype_tmp['BMI_sds'] = np.array(BMIsds) 
phenotype_tmp = phenotype_tmp.drop(['age', 'sex', 'BMI'], axis=1)
phenotype_final = pd.merge(phenotype, phenotype_tmp, how='left', left_on='subjectkey', right_on='subjectkey')
phenotype_final = phenotype_final.loc[(phenotype_final['BMI_sds'] < 5) & (phenotype_final['BMI_sds'] > -5)]
phenotype_final.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv', index=False)



100%|██████████| 11493/11493 [00:16<00:00, 703.65it/s]


KeyError: 'BMI_sds'

## 1 year follow up

In [7]:
def get_reference_params(subj_age:float, Ref_Chart: pd.DataFrame): 
    for i in range(len(Ref_Chart) - 1): 
        start = Ref_Chart['agemos'][i]
        end = Ref_Chart['agemos'][i+1]
        if subj_age >= start and subj_age < end: 
            if subj_age <= start + 0.5: 
                reference_params = Ref_Chart.iloc[i]
            elif subj_age > start + 0.5: 
                reference_params = Ref_Chart.iloc[i+1]
        else: 
            pass 
    return reference_params  


def sds_calculator(BMI, L, M, S): 
    numerator = (BMI / M) ** L -1   # 분자 
    denominator = L * S     # 분모
    return numerator / denominator



def calculating_subj_BMIsds(subj: pd.Series, Ref_Chart: pd.DataFrame) -> float: 
    age = subj['age_1year']
    BMI = subj['BMI_1year']
    sex = subj['sex']
    sex_Ref_Chart = Ref_Chart[Ref_Chart['sex'] == sex].reset_index(drop=True)
    reference_params = get_reference_params(subj_age=age, Ref_Chart=sex_Ref_Chart)
    subj_BMIsds = sds_calculator(BMI=BMI, L=reference_params['L'], M=reference_params["M"], S=reference_params["S"])

    P5 = reference_params['P5']
    P85 = reference_params['P85']

    if BMI <= P5: 
        status = "underweight"
    elif BMI > P5 and BMI < P85: 
        status = "normal"
    elif BMI >= P85: 
        status = "overweight"
    return subj_BMIsds, status


In [9]:
Ref_Chart_dir = '/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/bmi-age-2022.csv'
Ref_Chart = pd.read_csv(Ref_Chart_dir)

phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype[['subjectkey','age', 'sex', 'weight','BMI','BMI_sds', 'BMI_status']] 
phenotype_tmp = phenotype_tmp.dropna(axis=0)
phenotype_tmp.rename(columns={'BMI':'BMI_baseline', 'BMI_sds':'BMI_sds_baseline', 'BMI_status': 'BMI_status_baseline', 'weight':'weight_baseline'}, inplace=True)

phenotype1 = pd.read_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD Release4.0 Tabular dataset.csv")
phenotype1 = phenotype1[phenotype1['eventname'] == "1_year_follow_up_y_arm_1"]
phenotype1 = phenotype1.drop(['eventname','sex'], axis=1)
phenotype1.rename(columns={'bmi':'BMI_1year', 'weight':'weight_1year'}, inplace=True)
phenotype1 = phenotype1.reset_index(drop=True)
for i in range(len(phenotype1['subjectkey'].values)):
    phenotype1['subjectkey'].values[i] = phenotype1['subjectkey'].values[i].replace('_','')

phenotype1_tmp = phenotype1[['subjectkey', 'BMI_1year', 'weight_1year']]
phenotype1_tmp = phenotype1_tmp.dropna(axis=0)
phenotype1_tmp = pd.merge(phenotype_tmp, phenotype1_tmp, how='inner', on='subjectkey' )
# add 12 months to baseline age
phenotype1_tmp['age_1year'] = phenotype1_tmp['age'].values + 12 

# calculating BMI sds of each subject
BMIsds = []
BMIstatus = [] 

for i in tqdm(range(len(phenotype1_tmp))): 
    subj_BMIsds, status = calculating_subj_BMIsds(subj = phenotype1_tmp.iloc[i], Ref_Chart=Ref_Chart)
    BMIsds.append(subj_BMIsds)
    BMIstatus.append(status)
    

phenotype1_tmp['BMI_status_1year'] = np.array(BMIstatus)
phenotype1_tmp['BMI_sds_1year'] = np.array(BMIsds) 
phenotype1_tmp['BMI_sds_change'] = phenotype1_tmp['BMI_sds_1year'].values - phenotype1_tmp['BMI_sds_baseline'].values
phenotype1_tmp['BMI_change'] = phenotype1_tmp['BMI_1year'].values - phenotype1_tmp['BMI_baseline'].values
phenotype1_tmp['weight_change'] = phenotype1_tmp['weight_1year'].values - phenotype1_tmp['weight_baseline'].values
phenotype1_tmp = phenotype1_tmp.drop(['age', 'age_1year','sex', 'BMI_1year'], axis=1)
phenotype1_final = pd.merge(phenotype1, phenotype1_tmp, how='left', left_on='subjectkey', right_on='subjectkey')
phenotype1_final = phenotype1_final.loc[(phenotype1_final['BMI_sds_1year'] < 5) & (phenotype1_final['BMI_sds_1year'] > -5)]
phenotype1_final.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_1years_revised.csv', index=False)







100%|██████████| 10792/10792 [00:15<00:00, 675.51it/s]


In [10]:
gain_template = np.zeros(len(phenotype1_final))
for i in range(len(phenotype1_final)): 
    difference = phenotype1_final['BMI_sds_change'].values[i]
    if difference >= 0.2 :
        gain_template[i] = 1
    elif  difference < 0.2 and difference > -0.2:
        pass
    elif difference <= - 0.2 : 
        gain_template[i] = np.nan


loss_template = np.zeros(len(phenotype1_final))
for i in range(len(phenotype1_final)): 
    difference = phenotype1_final['BMI_sds_change'].values[i]
    if difference >= 0.2 :
        loss_template[i] = np.nan
    elif  difference < 0.2 and difference > -0.2:
        pass
    elif difference <= - 0.2 : 
        loss_template[i] = 1


phenotype1_final['BMI_gain'] = gain_template
phenotype1_final['BMI_loss'] = loss_template

In [11]:
phenotype_gain = phenotype1_final.dropna(axis=0, subset=['BMI_gain'], inplace=False)
phenotype_gain = phenotype_gain.drop(['BMI_loss'], axis=1)
phenotype_gain =phenotype_gain.reset_index(drop=True)
phenotype_gain.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_1years_revised_BMIgain.csv", index=False)

phenotype_loss = phenotype1_final.dropna(axis=0, subset=['BMI_loss'], inplace=False)
phenotype_loss = phenotype_loss.drop(['BMI_gain'], axis=1)
phenotype_loss =phenotype_loss.reset_index(drop=True)
phenotype_loss.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_1years_revised_BMIloss.csv", index=False)




In [12]:
# (baseline) abnormal to (1 year) normal acceleration 
abnormal_normal = phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_1year'] == 'normal') & (phenotype_gain['BMI_gain'] == 1)]
abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
# (baseline) normal to (1 year) abnormal acceleration 
normal_abnormal = phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_1year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 1)]
normal_abnormal['become_overweight'] = np.array([1 for x in range(len(normal_abnormal))])
# normal to normal 
normal_normal = phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_1year'] == 'normal') & (phenotype_gain['BMI_gain'] == 0)]
normal_normal['become_overweight'] = np.array([0 for x in range(len(normal_normal))])
normal_normal['become_normal'] = np.array([0 for x in range(len(normal_normal))])

underweight = pd.concat([abnormal_normal, normal_normal])
underweight = underweight.reset_index(drop=True)
underweight = underweight[['subjectkey', 'become_normal']]
overweight = pd.concat([normal_abnormal, normal_normal])
overweight = overweight.reset_index(drop=True) 
overweight = overweight[['subjectkey', 'become_overweight']]

new_df_gain = pd.merge(phenotype_gain, overweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_gain = pd.merge(new_df_gain, underweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_gain.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_1years_revised_BMIgain.csv", index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_abnormal['become_overweight'] = np.array([1 for x in range(len(normal_abnormal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_normal['become_overwei

In [13]:
# (baseline) abnormal to (1 year) normal 
abnormal_normal = phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_1year'] == 'normal') & (phenotype_loss['BMI_loss'] == 1)]
abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
# (baseline) normal to (1 year) abnormal 
normal_abnormal = phenotype_loss.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_1year'] ==  'underweight') & (phenotype_loss['BMI_loss'] == 1)]
normal_abnormal['become_underweight'] = np.array([1 for x in range(len(normal_abnormal))])

# normal to normal 
normal_normal = phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'normal') & (phenotype_loss['BMI_status_1year'] == 'normal') & (phenotype_loss['BMI_loss'] == 0)]
normal_normal['become_normal'] = np.array([0 for x in range(len(normal_normal))])
normal_normal['become_underweight'] = np.array([0 for x in range(len(normal_normal))])


overweight = pd.concat([abnormal_normal, normal_normal])
overweight = overweight.reset_index(drop=True)
overweight = overweight[['subjectkey', 'become_normal']]
underweight = pd.concat([normal_abnormal, normal_normal])
underweight = underweight.reset_index(drop=True)
underweight = underweight[['subjectkey', 'become_underweight']]


new_df_loss = pd.merge(phenotype_loss, overweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_loss = pd.merge(new_df_loss, underweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_loss.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_1years_revised_BMIloss.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_abnormal['become_underweight'] = np.array([1 for x in range(len(normal_abnormal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_normal['become_normal

## For pretraining with BMI and transfer learning to BMI change 

In [14]:
# stratification with propensity score 

from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
import pandas as pd

from psmpy import PsmPy
from psmpy.functions import cohenD

def partition_stratified_train_test(input_df, target, cat_cov:list, num_cov: list,train_size=0.7, val_size=0.1, test_size=0.2, n_percentile=5,n_partition=4,seed=1234):
    def concat_df(df, subject_key): 
        df_concatenated = pd.DataFrame()
        for subj in subject_key: 
            tmp = df[df['subjectkey'] == subj]
            df_concatenated = pd.concat([df_concatenated, tmp])
            
        return df_concatenated

    """
    propensity score stratification ref: https://towardsdatascience.com/psmpy-propensity-score-matching-in-python-a3e0cd4d2631
    
    1) calculae propensity score there by stratify 5 percentile and assign rank 
    2) stratified split train and test data 
    3) stratified split train data to train and validation data 
    4) stratified sampling control data for train, validation and test 
    ##TODO##
    + if subjects who have NaN values in covariates used for calculating propensity scores are assigned to only train set 

    n_partition=5
    train_size = 0.8 
    val_size = 0.125
    test_size = 0.2 
    >>> This is the same as split the whole dataset into train/validation/test with ratio of 0.7/0.1/0.2

    n_partition=4
    train_size = 0.65
    test_size =0.25 
    val_size = 0.13 (parameter for train_test_split)
    >>> This is the same as split the whole dataset into train/validation/test with ratio of 0.65/0.1/0.25
    """
    skf = StratifiedKFold(n_splits=n_partition,random_state=seed, shuffle=True)
    q = np.linspace(0, 1, n_percentile+1)
    labels = np.linspace(0, n_percentile-1, n_percentile)

    # one hot labeling categorical values 
    #ps_df = input_df[['subjectkey', target, 'BMI_sds_change'] + cat_cov + num_cov].copy() 
    ps_df = input_df[['subjectkey', target, 'BMI_sds_baseline'] + cat_cov + num_cov].copy() 
    ps_df_tmp = ps_df.dropna(axis=0).reset_index(drop=True)
    ps_df_cat_cov = pd.get_dummies(ps_df_tmp[cat_cov].astype('category'))
    ps_df_tmp = ps_df_tmp.drop(columns=cat_cov, axis=1)
    ps_df_tmp = pd.concat([ps_df_tmp, ps_df_cat_cov], axis=1)
    

    # PS calculation 
    psm = PsmPy(ps_df_tmp, treatment=target, indx='subjectkey')
    psm.logistic_ps(balance=True)
    ps_df_tmp = psm.predicted_data[['subjectkey','propensity_score']]

    # stratify 4 percentile (test_size = 0.25)
    percentile_class, _ = pd.qcut(ps_df_tmp['propensity_score'].values, q, labels=labels, retbins=True)
    ps_df_tmp['percentile'] = percentile_class
    ps_df_tmp = ps_df_tmp[['subjectkey', 'percentile']]
    
    
    # separate case/control
    case = input_df[input_df[target] == 1].reset_index(drop=True)
    case = pd.merge(case, ps_df_tmp, how='inner', on='subjectkey')
    control = input_df[input_df[target] == 0].reset_index(drop=True)
    control = pd.merge(control, ps_df_tmp, how='inner', on='subjectkey')
    num_total_case = len(case)
    num_total_train = len(control)
    

    # stratify 10 percentile (test_size = 0.1)
    #percentile_class, _ = pd.qcut(case['BMI_sds_change'].values, q, labels=labels, retbins=True)
    #case['percentile'] = percentile_class

    partition_result = {}
    for i, (train_val_idx, test_idx) in enumerate(skf.split(case['subjectkey'], case['percentile'])):
    #for i, (train_val_idx, test_idx) in enumerate(skf.split(case['subjectkey'], case['abcd.site'])):
        # assign samples knocked out by nan filtering for propensity scores to train set.
        case_add = input_df[['subjectkey', target, 'BMI_sds_change'] + cat_cov + num_cov].copy() 
        case_add = case_add[case_add.isna().any(axis=1)]
        case_add = case_add[case_add[target] == 1]
        case_add = case_add[['subjectkey']]
        case_add['partition%s' % i] = ['train' for _ in range(len(case_add))]

        # split train/test
        case_test = case.loc[test_idx]
        case_train_val = case.loc[train_val_idx].reset_index(drop=True)
        control_test = control.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=len(case_test)/len(control), random_state=seed))   # stratified random sampling, where len(case_test) == len(control_test) 
        control_test_removed = control.drop(axis=0, index=control_test.index).reset_index(drop=True)
        control_train_val = control_test_removed.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=(len(case_train_val)+len(case_add))/len(control_test_removed), random_state=seed))
        #control_train_val = control_test_removed.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=len(case_train_val)/len(control_test_removed)))
        
        
        # split train into train/val
        subjectkey_case_train_val, percentile_case_train_val = case_train_val['subjectkey'].values, case_train_val['percentile'].values
        subjectkey_case_train, subjectkey_case_val, _, _ = train_test_split(subjectkey_case_train_val, percentile_case_train_val, test_size=0.125, shuffle=True, random_state=seed+1, stratify=percentile_case_train_val)
        #subjectkey_case_train, subjectkey_case_val, _, _ = train_test_split(subjectkey_case_train_val, percentile_case_train_val, test_size=0.13,random_state=seed+1, shuffle=True, stratify=percentile_case_train_val)
        case_val = concat_df(case_train_val, subjectkey_case_val)
        case_train = concat_df(case_train_val, subjectkey_case_train)

        
        # stratified sampling control of train/val
        control_val = control_train_val.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=len(case_val)/len(control_train_val), random_state=seed))
        control_train = control_train_val.drop(axis=0, index=control_val.index).reset_index(drop=True)
        #control_train = pd.concat([control_train.drop(columns='percentile'), nan_train], axis=0)
        

        
        case_test['partition%s' % i] = ['test' for _ in range(len(case_test))]
        case_val['partition%s' % i] = ['val' for _ in range(len(case_val))]
        case_train['partition%s' % i] = ['train' for _ in range(len(case_train))]
        control_test['partition%s' % i] = ['test' for _ in range(len(control_test))]
        control_val['partition%s' % i] = ['val' for _ in range(len(control_val))]
        control_train['partition%s' % i] = ['train' for _ in range(len(control_train))]

        partition_df = pd.concat([case_train, control_train, case_val, control_val, case_test, control_test])
        partition_df = partition_df[['subjectkey', 'partition%s' % i]]
        partition_df = pd.concat([partition_df, case_add], axis=0)
        
        partition_result['partition%s' % i] = partition_df

    final_df = input_df.drop(columns=cat_cov + num_cov).copy()
    for i in range(n_partition):
        final_df = pd.merge(final_df, partition_result['partition%s' % i], how='left', on='subjectkey')

    return final_df
    




def partition_train_test_baseline(baseline_data, longitudinal_data,  train_size=0.75, val_size=0.05, test_size=0.2, n_partition=4, seed=1234): 
    baseline_data_tmp = baseline_data[['subjectkey','BMI_baseline', 'BMI_sds_baseline']]
    baseline_data_tmp = baseline_data_tmp.dropna(axis=0)

    num_total = len(baseline_data)
    num_train = int(num_total * train_size)
    num_test = int(num_total * test_size)
    num_val  = num_total - num_train - num_test

    phenotype_partition_final = {}
    for i in range(n_partition): 
        longitudinal_data_tmp = longitudinal_data[['subjectkey', 'partition%s' % i ]]
        phenotype_partition = pd.merge(baseline_data_tmp, longitudinal_data_tmp, how='left', left_on='subjectkey', right_on='subjectkey')
        phenotype_partition = phenotype_partition.drop(['BMI_baseline', 'BMI_sds_baseline'], axis=1)
        test_1y = phenotype_partition[phenotype_partition['partition%s' % i] == 'test']
        val_1y = phenotype_partition[phenotype_partition['partition%s' % i] == 'val']
        train_1y = phenotype_partition[phenotype_partition['partition%s' % i] == 'train']

        num_test_tmp = num_test - len(test_1y)
        num_val_tmp = num_val - len(val_1y)
        num_train_tmp = num_train - len(train_1y)

        phenotype_partition_all = phenotype_partition.drop(index=test_1y.index, axis=0).drop(index=val_1y.index, axis=0).drop(index=train_1y.index, axis=0)
        #phenotype_partition_all = phenotype_partition_all.sample(frac=1, random_state=seed).reset_index(drop=True)  # shuffling하고 index reset

        phenotype_partition_train = phenotype_partition_all.iloc[:num_train_tmp]
        phenotype_partition_test = phenotype_partition_all.iloc[num_train_tmp:num_train_tmp+num_test_tmp]
        phenotype_partition_val = phenotype_partition_all.iloc[num_train_tmp+num_test_tmp:]


        phenotype_partition_train['partition%s' % i] = ['train' for j in range(len(phenotype_partition_train))]
        phenotype_partition_test['partition%s' % i] = ['test' for j in range(len(phenotype_partition_test))]
        phenotype_partition_val['partition%s' % i] = ['val' for j in range(len(phenotype_partition_val))]
        
        phenotype_partition_final_tmp = pd.concat([phenotype_partition_train, train_1y, phenotype_partition_val, val_1y, phenotype_partition_test, test_1y])
        phenotype_partition_final['partition%s' % i] = phenotype_partition_final_tmp
        

    partitioned_data_baseline = baseline_data
    for i in range(n_partition): 
        partitioned_data_baseline = pd.merge(partitioned_data_baseline, phenotype_partition_final['partition%s' % i], how='left', left_on='subjectkey', right_on='subjectkey')

    return partitioned_data_baseline



def check_partition(input_df, target='become_overweight', split='train', n_percentile=5, n_partition=4): 
    q = np.linspace(0, 1, n_percentile+1)
    labels = np.linspace(0, n_percentile-1, n_percentile)

    # separate case/control
    case = input_df[input_df[target] == 1].reset_index(drop=True)
    control = input_df[input_df[target] == 0].reset_index(drop=True)
    num_total_case = len(case)
    num_total_train = len(control)

    # stratify 10 percentile (test_size = 0.1)
    percentile_class, _ = pd.qcut(case['BMI_sds_change'].values, q, labels=labels, retbins=True)
    case['percentile'] = percentile_class

    check = pd.merge(input_df, case[['subjectkey', 'percentile']], how='inner', on='subjectkey')

    for i in range(n_partition): 
        tmp = check[check['partition%s' % i] == split]
        print("In partition{}, {} have following counts for each percentile: {}".format(i, split, tmp['percentile'].value_counts().values.tolist()))


In [15]:
cat_cov = ['sex', 'abcd.site', 'race.ethnicity', 'married'] 
num_cov = ['age', 'high.educ', 'income'] 
phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype.rename(columns={'BMI_sds': 'BMI_sds_baseline', 'BMI': 'BMI_baseline'})
new_df_gain_tmp = new_df_gain.drop(columns=['age', 'income'])   # remove duplicated columns
new_df_gain_tmp = pd.merge(new_df_gain_tmp, phenotype_tmp[['subjectkey'] + cat_cov + num_cov], how='inner', on='subjectkey')
partitioned_data_1yafter = partition_stratified_train_test(new_df_gain_tmp, 'become_overweight', cat_cov=cat_cov, num_cov=num_cov, n_percentile=10, n_partition=5, seed=1234)
partitioned_data_baseline = partition_train_test_baseline(baseline_data=phenotype_tmp , longitudinal_data=partitioned_data_1yafter, n_partition=5, seed=1234)
partitioned_data_1yafter.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_1years_become_overweight_10PS_stratified_partitioned_5fold.csv', index=False)
partitioned_data_baseline.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_for_pretraining_1y_after_become_overweight_10PS_stratified_5fold.csv', index=False)

check_partition(partitioned_data_1yafter, target='become_overweight', split='train', n_percentile=10, n_partition=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ps_df_tmp['percentile'] = percentile_class
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_train['partition%s' % i] = ['train' for j in range(len(phenotype_partition_train))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_test['partition%s' % i] = ['test' for 

In partition0, train have following counts for each percentile: [49, 45, 45, 43, 43, 42, 42, 41, 39, 39]
In partition1, train have following counts for each percentile: [52, 47, 46, 45, 43, 42, 41, 38, 38, 36]
In partition2, train have following counts for each percentile: [45, 44, 44, 43, 43, 43, 42, 42, 41, 41]
In partition3, train have following counts for each percentile: [47, 45, 44, 44, 43, 42, 42, 42, 40, 39]
In partition4, train have following counts for each percentile: [47, 47, 45, 44, 43, 43, 43, 41, 39, 36]


In [16]:
cat_cov = ['sex', 'abcd.site', 'race.ethnicity'] 
num_cov = ['age', 'high.educ', 'income'] 
phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype.rename(columns={'BMI_sds': 'BMI_sds_baseline', 'BMI': 'BMI_baseline'})
new_df_gain_tmp = new_df_gain.drop(columns=['age', 'income'])   # remove duplicated columns
new_df_gain_tmp = pd.merge(new_df_gain_tmp, phenotype_tmp[['subjectkey'] + cat_cov + num_cov], how='inner', on='subjectkey')
partitioned_data_1yafter = partition_stratified_train_test(new_df_gain_tmp, 'become_normal', cat_cov=cat_cov, num_cov=num_cov, n_percentile=10, n_partition=5, seed=1234)
partitioned_data_baseline = partition_train_test_baseline(baseline_data=phenotype_tmp , longitudinal_data=partitioned_data_1yafter, n_partition=5, seed=1234)
partitioned_data_1yafter.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_1years_become_normal_10PS_stratified_partitioned_5fold.csv', index=False)
partitioned_data_baseline.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_for_pretraining_1y_after_become_normal_10PS_stratified_5fold.csv', index=False)

check_partition(partitioned_data_1yafter, target='become_normal', split='train', n_percentile=10, n_partition=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ps_df_tmp['percentile'] = percentile_class
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_train['partition%s' % i] = ['train' for j in range(len(phenotype_partition_train))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_test['partition%s' % i] = ['test' for 

In partition0, train have following counts for each percentile: [14, 14, 14, 13, 13, 12, 10, 9, 9, 9]
In partition1, train have following counts for each percentile: [15, 14, 14, 13, 12, 12, 11, 10, 9, 7]
In partition2, train have following counts for each percentile: [14, 13, 13, 13, 12, 12, 11, 10, 10, 9]
In partition3, train have following counts for each percentile: [15, 14, 12, 12, 12, 12, 11, 10, 10, 10]
In partition4, train have following counts for each percentile: [15, 14, 14, 13, 12, 12, 11, 10, 9, 8]


In [17]:
cat_cov = ['sex', 'abcd.site', 'race.ethnicity'] 
num_cov = ['age', 'high.educ', 'income'] 
phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype.rename(columns={'BMI_sds': 'BMI_sds_baseline', 'BMI': 'BMI_baseline'})
new_df_gain_tmp = new_df_gain.drop(columns=['age', 'income'])   # remove duplicated columns
new_df_gain_tmp = pd.merge(new_df_gain_tmp, phenotype_tmp[['subjectkey'] + cat_cov + num_cov], how='inner', on='subjectkey')
partitioned_data_1yafter = partition_stratified_train_test(new_df_gain_tmp, 'become_underweight', cat_cov=cat_cov, num_cov=num_cov, n_percentile=10, n_partition=5, seed=1234)
partitioned_data_baseline = partition_train_test_baseline(baseline_data=phenotype_tmp , longitudinal_data=partitioned_data_1yafter, n_partition=5, seed=1234)
partitioned_data_1yafter.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_1years_become_underweight_10PS_stratified_partitioned_5fold.csv', index=False)
partitioned_data_baseline.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_for_pretraining_1y_after_become_underweight_10PS_stratified_5fold.csv', index=False)

check_partition(partitioned_data_1yafter, target='become_underweight', split='train', n_percentile=10, n_partition=5)

KeyError: "['become_underweight'] not in index"

In [18]:
num_underweight_underweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_1year'] == 'underweight') & (phenotype_gain['BMI_gain'] == 0)]) 
num_underweight_normal = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_1year'] == 'normal') & (phenotype_gain['BMI_gain'] == 1)]) 
num_normal_normal = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_1year'] == 'normal') & (phenotype_gain['BMI_gain'] == 0)]) 
num_normal_overweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_1year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 1)]) 
num_overweight_overweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'overweight') & (phenotype_gain['BMI_status_1year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 0)])
num_underweight_overweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_1year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 1)])

print("underweight -> underweight: {}".format(num_underweight_underweight))
print("underweight -> normal: {}".format(num_underweight_normal))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> overweight: {}".format(num_normal_overweight))
print("overweight -> overweight: {}".format(num_overweight_overweight))
print("underweight -> overweight: {}".format(num_underweight_overweight))

underweight -> underweight: 88
underweight -> normal: 162
normal -> normal: 2538
normal -> overweight: 592
overweight -> overweight: 1995
underweight -> overweight: 6


In [19]:
num_overweight_overweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_1year'] == 'overweight') & (phenotype_loss['BMI_loss'] == 0)])
num_overweight_normal = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_1year'] == 'normal') & (phenotype_loss['BMI_loss'] == 1)])
num_overweight_underweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_1year'] == 'underweight') & (phenotype_loss['BMI_loss'] == 1)])
num_normal_normal = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'normal') & (phenotype_loss['BMI_status_1year'] == 'normal') & (phenotype_loss['BMI_loss'] == 0)]) 
num_normal_underweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'normal') & (phenotype_loss['BMI_status_1year'] == 'underweight') & (phenotype_loss['BMI_loss'] == 1)]) 
num_underweight_underweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'underweight') & (phenotype_loss['BMI_status_1year'] == 'underweight') & (phenotype_loss['BMI_loss'] == 0)]) 


print("overweight -> overweight: {}".format(num_overweight_overweight))
print("overweight -> normal: {}".format(num_overweight_normal))
print("overweight -> underweight: {}".format(num_overweight_underweight))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> underweight: {}".format(num_normal_underweight))
print("underweight -> underweight: {}".format(num_underweight_underweight))

overweight -> overweight: 1995
overweight -> normal: 353
overweight -> underweight: 1
normal -> normal: 2538
normal -> underweight: 163
underweight -> underweight: 88


In [None]:
sex = phenotype[['subjectkey', 'sex']]
phenotype_gain_sex = pd.merge(phenotype_gain, sex, on='subjectkey', how='inner')
phenotype_loss_sex = pd.merge(phenotype_loss, sex, on='subjectkey', how='inner')


In [None]:
num_underweight_underweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'underweight') & (phenotype_gain_sex['BMI_status_1year'] == 'underweight') & (phenotype_gain_sex['BMI_gain'] == 0) & (phenotype_gain_sex['sex'] == 1)]) 
num_underweight_normal = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'underweight') & (phenotype_gain_sex['BMI_status_1year'] == 'normal') & (phenotype_gain_sex['BMI_gain'] == 1) & (phenotype_gain_sex['sex'] == 1)]) 
num_normal_normal = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'normal') & (phenotype_gain_sex['BMI_status_1year'] == 'normal') & (phenotype_gain_sex['BMI_gain'] == 0) & (phenotype_gain_sex['sex'] == 1)]) 
num_normal_overweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'normal') & (phenotype_gain_sex['BMI_status_1year'] == 'overweight') & (phenotype_gain_sex['BMI_gain'] == 1) & (phenotype_gain_sex['sex'] == 1)]) 
num_overweight_overweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'overweight') & (phenotype_gain_sex['BMI_status_1year'] == 'overweight') & (phenotype_gain_sex['BMI_gain'] == 0) & (phenotype_gain_sex['sex'] == 1)])
num_underweight_overweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'underweight') & (phenotype_gain_sex['BMI_status_1year'] == 'overweight') & (phenotype_gain_sex['BMI_gain'] == 1) & (phenotype_gain_sex['sex'] == 1)])

print("underweight -> underweight: {}".format(num_underweight_underweight))
print("underweight -> normal: {}".format(num_underweight_normal))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> overweight: {}".format(num_normal_overweight))
print("overweight -> overweight: {}".format(num_overweight_overweight))
print("underweight -> overweight: {}".format(num_underweight_overweight))


underweight -> underweight: 33
underweight -> normal: 91
normal -> normal: 1304
normal -> overweight: 335
overweight -> overweight: 1075
underweight -> overweight: 6


In [None]:
num_overweight_overweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'overweight') & (phenotype_loss_sex['BMI_status_1year'] == 'overweight') & (phenotype_loss_sex['BMI_loss'] == 0) & (phenotype_loss_sex['sex'] == 1)])
num_overweight_normal = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'overweight') & (phenotype_loss_sex['BMI_status_1year'] == 'normal') & (phenotype_loss_sex['BMI_loss'] == 1) & (phenotype_loss_sex['sex'] == 1)])
num_overweight_underweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'overweight') & (phenotype_loss_sex['BMI_status_1year'] == 'underweight') & (phenotype_loss_sex['BMI_loss'] == 1) & (phenotype_loss_sex['sex'] == 1)])
num_normal_normal = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'normal') & (phenotype_loss_sex['BMI_status_1year'] == 'normal') & (phenotype_loss_sex['BMI_loss'] == 0) & (phenotype_loss_sex['sex'] == 1)]) 
num_normal_underweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'normal') & (phenotype_loss_sex['BMI_status_1year'] == 'underweight') & (phenotype_loss_sex['BMI_loss'] == 1) & (phenotype_loss_sex['sex'] == 1)]) 
num_underweight_underweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'underweight') & (phenotype_loss_sex['BMI_status_1year'] == 'underweight') & (phenotype_loss_sex['BMI_loss'] == 0) & (phenotype_loss_sex['sex'] == 1)]) 


print("overweight -> overweight: {}".format(num_overweight_overweight))
print("overweight -> normal: {}".format(num_overweight_normal))
print("overweight -> underweight: {}".format(num_overweight_underweight))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> underweight: {}".format(num_normal_underweight))
print("underweight -> underweight: {}".format(num_underweight_underweight))

overweight -> overweight: 1075
overweight -> normal: 197
overweight -> underweight: 0
normal -> normal: 1304
normal -> underweight: 90
underweight -> underweight: 33


## 2 year follow up 

In [4]:
def get_reference_params(subj_age:float, Ref_Chart: pd.DataFrame): 
    for i in range(len(Ref_Chart) - 1): 
        start = Ref_Chart['agemos'][i]
        end = Ref_Chart['agemos'][i+1]
        if subj_age >= start and subj_age < end: 
            if subj_age <= start + 0.5: 
                reference_params = Ref_Chart.iloc[i]
            elif subj_age > start + 0.5: 
                reference_params = Ref_Chart.iloc[i+1]
        else: 
            pass 
    return reference_params  


def sds_calculator(BMI, L, M, S): 
    numerator = (BMI / M) ** L -1   # 분자 
    denominator = L * S     # 분모
    return numerator / denominator



def calculating_subj_BMIsds(subj: pd.Series, Ref_Chart: pd.DataFrame) -> float: 
    age = subj['age_2year']
    BMI = subj['BMI_2year']
    sex = subj['sex']
    sex_Ref_Chart = Ref_Chart[Ref_Chart['sex'] == sex].reset_index(drop=True)
    reference_params = get_reference_params(subj_age=age, Ref_Chart=sex_Ref_Chart)
    subj_BMIsds = sds_calculator(BMI=BMI, L=reference_params['L'], M=reference_params["M"], S=reference_params["S"])

    P5 = reference_params['P5']
    P85 = reference_params['P85']

    if BMI <= P5: 
        status = "underweight"
    elif BMI > P5 and BMI < P85: 
        status = "normal"
    elif P95 > BMI >= P85: 
        status = "overweight"
    return subj_BMIsds, status


In [5]:
Ref_Chart_dir = '/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/bmi-age-2022.csv'
Ref_Chart = pd.read_csv(Ref_Chart_dir)

phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype[['subjectkey','age', 'sex', 'weight','BMI','BMI_sds', 'BMI_status']] 
phenotype_tmp = phenotype_tmp.dropna(axis=0)
phenotype_tmp.rename(columns={'BMI':'BMI_baseline', 'BMI_sds':'BMI_sds_baseline', 'BMI_status': 'BMI_status_baseline', 'weight':'weight_baseline'}, inplace=True)

phenotype2 = pd.read_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD Release4.0 Tabular dataset.csv")
phenotype2 = phenotype2[phenotype2['eventname'] == "2_year_follow_up_y_arm_1"]
phenotype2 = phenotype2.drop(['eventname','sex'], axis=1)
phenotype2.rename(columns={'bmi':'BMI_2year', 'weight':'weight_2year'}, inplace=True)
phenotype2 = phenotype2.reset_index(drop=True)
for i in range(len(phenotype2['subjectkey'].values)):
    phenotype2['subjectkey'].values[i] = phenotype2['subjectkey'].values[i].replace('_','')

phenotype2_tmp = phenotype2[['subjectkey', 'BMI_2year', 'weight_2year']]
phenotype2_tmp = phenotype2_tmp.dropna(axis=0)
phenotype2_tmp = pd.merge(phenotype_tmp, phenotype2_tmp, how='inner', on='subjectkey' )
# add 12 months to baseline age
phenotype2_tmp['age_2year'] = phenotype2_tmp['age'].values + 24 

# calculating BMI sds of each subject
BMIsds = []
BMIstatus = [] 

for i in tqdm(range(len(phenotype2_tmp))): 
    subj_BMIsds, status = calculating_subj_BMIsds(subj = phenotype2_tmp.iloc[i], Ref_Chart=Ref_Chart)
    BMIsds.append(subj_BMIsds)
    BMIstatus.append(status)
    

phenotype2_tmp['BMI_status_2year'] = np.array(BMIstatus)
phenotype2_tmp['BMI_sds_2year'] = np.array(BMIsds) 
phenotype2_tmp['BMI_sds_change'] = phenotype2_tmp['BMI_sds_2year'].values - phenotype2_tmp['BMI_sds_baseline'].values
phenotype2_tmp['BMI_change'] = phenotype2_tmp['BMI_2year'].values - phenotype2_tmp['BMI_baseline'].values
phenotype2_tmp['weight_change'] = phenotype2_tmp['weight_2year'].values - phenotype2_tmp['weight_baseline'].values
phenotype2_tmp = phenotype2_tmp.drop(['age', 'age_2year','sex', 'BMI_2year'], axis=1)
phenotype2_final = pd.merge(phenotype2, phenotype2_tmp, how='left', left_on='subjectkey', right_on='subjectkey')
phenotype2_final = phenotype2_final.loc[(phenotype2_final['BMI_sds_2year'] < 5) & (phenotype2_final['BMI_sds_2year'] > -5)]
phenotype2_final.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_2years_revised_tmp.csv', index=False)







100%|██████████| 7373/7373 [00:10<00:00, 702.26it/s]


In [22]:
gain_template = np.zeros(len(phenotype2_final))
for i in range(len(phenotype2_final)): 
    difference = phenotype2_final['BMI_sds_change'].values[i]
    if difference >= 0.2 :
        gain_template[i] = 1
    elif  difference < 0.2 and difference > -0.2:
        pass
    elif difference <= - 0.2 : 
        gain_template[i] = np.nan


loss_template = np.zeros(len(phenotype2_final))
for i in range(len(phenotype2_final)): 
    difference = phenotype2_final['BMI_sds_change'].values[i]
    if difference >= 0.2 :
        loss_template[i] = np.nan
    elif  difference < 0.2 and difference > -0.2:
        pass
    elif difference <= - 0.2 : 
        loss_template[i] = 1


phenotype2_final['BMI_gain'] = gain_template
phenotype2_final['BMI_loss'] = loss_template

In [23]:
phenotype_gain = phenotype2_final.dropna(axis=0, subset=['BMI_gain'], inplace=False)
phenotype_gain = phenotype_gain.drop(['BMI_loss'], axis=1)
phenotype_gain = phenotype_gain.reset_index(drop=True)
phenotype_gain.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_2years_revised_BMIgain.csv", index=False)

phenotype_loss = phenotype2_final.dropna(axis=0, subset=['BMI_loss'], inplace=False)
phenotype_loss = phenotype_loss.drop(['BMI_gain'], axis=1)
phenotype_loss =phenotype_loss.reset_index(drop=True)
phenotype_loss.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_2years_revised_BMIloss.csv", index=False)




In [24]:
# (baseline) abnormal to (2 year) normal acceleration 
abnormal_normal = phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_2year'] == 'normal') & (phenotype_gain['BMI_gain'] == 1)]
abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
# (baseline) normal to (2 year) abnormal acceleration 
normal_abnormal = phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_2year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 1)]
normal_abnormal['become_overweight'] = np.array([1 for x in range(len(normal_abnormal))])
# normal to normal 
normal_normal = phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_2year'] == 'normal') & (phenotype_gain['BMI_gain'] == 0)]
normal_normal['become_overweight'] = np.array([0 for x in range(len(normal_normal))])
normal_normal['become_normal'] = np.array([0 for x in range(len(normal_normal))])

underweight = pd.concat([abnormal_normal, normal_normal])
underweight = underweight.reset_index(drop=True)
underweight = underweight[['subjectkey', 'become_normal']]
overweight = pd.concat([normal_abnormal, normal_normal])
overweight = overweight.reset_index(drop=True) 
overweight = overweight[['subjectkey', 'become_overweight']]

new_df_gain = pd.merge(phenotype_gain, overweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_gain = pd.merge(new_df_gain, underweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_gain.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_2years_revised_BMIgain.csv", index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_abnormal['become_overweight'] = np.array([1 for x in range(len(normal_abnormal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_normal['become_overwei

In [25]:
# (baseline) abnormal to (2 year) normal 
abnormal_normal = phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_2year'] == 'normal') & (phenotype_loss['BMI_loss'] == 1)]
abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
# (baseline) normal to (2 year) abnormal 
normal_abnormal = phenotype_loss.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_2year'] ==  'underweight') & (phenotype_loss['BMI_loss'] == 1)]
normal_abnormal['become_underweight'] = np.array([1 for x in range(len(normal_abnormal))])

# normal to normal 
normal_normal = phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'normal') & (phenotype_loss['BMI_status_2year'] == 'normal') & (phenotype_loss['BMI_loss'] == 0)]
normal_normal['become_normal'] = np.array([0 for x in range(len(normal_normal))])
normal_normal['become_underweight'] = np.array([0 for x in range(len(normal_normal))])


overweight = pd.concat([abnormal_normal, normal_normal])
overweight = overweight.reset_index(drop=True)
overweight = overweight[['subjectkey', 'become_normal']]
underweight = pd.concat([normal_abnormal, normal_normal])
underweight = underweight.reset_index(drop=True)
underweight = underweight[['subjectkey', 'become_underweight']]


new_df_loss = pd.merge(phenotype_loss, overweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_loss = pd.merge(new_df_loss, underweight, how='left', left_on='subjectkey', right_on='subjectkey') 
new_df_loss.to_csv("/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total_2years_revised_BMIloss.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnormal_normal['become_normal'] = np.array([1 for x in range(len(abnormal_normal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_abnormal['become_underweight'] = np.array([1 for x in range(len(normal_abnormal))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_normal['become_normal

## For pretraining with BMI and transfer learning to BMI change 

In [26]:
# stratification with propensity score 

from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
import pandas as pd

from psmpy import PsmPy
from psmpy.functions import cohenD

def partition_stratified_train_test(input_df, target, cat_cov:list, num_cov: list,train_size=0.7, val_size=0.1, test_size=0.2, n_percentile=5,n_partition=4,seed=1234):
    def concat_df(df, subject_key): 
        df_concatenated = pd.DataFrame()
        for subj in subject_key: 
            tmp = df[df['subjectkey'] == subj]
            df_concatenated = pd.concat([df_concatenated, tmp])
            
        return df_concatenated

    """
    propensity score stratification ref: https://towardsdatascience.com/psmpy-propensity-score-matching-in-python-a3e0cd4d2631
    
    1) calculae propensity score there by stratify 5 percentile and assign rank 
    2) stratified split train and test data 
    3) stratified split train data to train and validation data 
    4) stratified sampling control data for train, validation and test 
    ##TODO##
    + if subjects who have NaN values in covariates used for calculating propensity scores are assigned to only train set 

    n_partition=5
    train_size = 0.8 
    val_size = 0.125
    test_size = 0.2 
    >>> This is the same as split the whole dataset into train/validation/test with ratio of 0.7/0.1/0.2

    n_partition=4
    train_size = 0.65
    test_size =0.25 
    val_size = 0.13 (parameter for train_test_split)
    >>> This is the same as split the whole dataset into train/validation/test with ratio of 0.65/0.1/0.25
    """
    skf = StratifiedKFold(n_splits=n_partition,random_state=seed, shuffle=True)
    q = np.linspace(0, 1, n_percentile+1)
    labels = np.linspace(0, n_percentile-1, n_percentile)

    # one hot labeling categorical values 
    #ps_df = input_df[['subjectkey', target, 'BMI_sds_change'] + cat_cov + num_cov].copy() 
    ps_df = input_df[['subjectkey', target, 'BMI_sds_baseline'] + cat_cov + num_cov].copy() 
    ps_df_tmp = ps_df.dropna(axis=0).reset_index(drop=True)
    ps_df_cat_cov = pd.get_dummies(ps_df_tmp[cat_cov].astype('category'))
    ps_df_tmp = ps_df_tmp.drop(columns=cat_cov, axis=1)
    ps_df_tmp = pd.concat([ps_df_tmp, ps_df_cat_cov], axis=1)
    

    # PS calculation 
    psm = PsmPy(ps_df_tmp, treatment=target, indx='subjectkey')
    psm.logistic_ps(balance=True)
    ps_df_tmp = psm.predicted_data[['subjectkey','propensity_score']]

    # stratify 4 percentile (test_size = 0.25)
    percentile_class, _ = pd.qcut(ps_df_tmp['propensity_score'].values, q, labels=labels, retbins=True)
    ps_df_tmp['percentile'] = percentile_class
    ps_df_tmp = ps_df_tmp[['subjectkey', 'percentile']]
    
    
    # separate case/control
    case = input_df[input_df[target] == 1].reset_index(drop=True)
    case = pd.merge(case, ps_df_tmp, how='inner', on='subjectkey')
    control = input_df[input_df[target] == 0].reset_index(drop=True)
    control = pd.merge(control, ps_df_tmp, how='inner', on='subjectkey')
    num_total_case = len(case)
    num_total_train = len(control)
    

    # stratify 10 percentile (test_size = 0.1)
    #percentile_class, _ = pd.qcut(case['BMI_sds_change'].values, q, labels=labels, retbins=True)
    #case['percentile'] = percentile_class

    partition_result = {}
    for i, (train_val_idx, test_idx) in enumerate(skf.split(case['subjectkey'], case['percentile'])):
    #for i, (train_val_idx, test_idx) in enumerate(skf.split(case['subjectkey'], case['abcd.site'])):
        # assign samples knocked out by nan filtering for propensity scores to train set.
        case_add = input_df[['subjectkey', target, 'BMI_sds_change'] + cat_cov + num_cov].copy() 
        case_add = case_add[case_add.isna().any(axis=1)]
        case_add = case_add[case_add[target] == 1]
        case_add = case_add[['subjectkey']]
        case_add['partition%s' % i] = ['train' for _ in range(len(case_add))]

        # split train/test
        case_test = case.loc[test_idx]
        case_train_val = case.loc[train_val_idx].reset_index(drop=True)
        control_test = control.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=len(case_test)/len(control), random_state=seed))   # stratified random sampling, where len(case_test) == len(control_test) 
        control_test_removed = control.drop(axis=0, index=control_test.index).reset_index(drop=True)
        control_train_val = control_test_removed.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=(len(case_train_val)+len(case_add))/len(control_test_removed), random_state=seed))
        #control_train_val = control_test_removed.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=len(case_train_val)/len(control_test_removed)))
        
        
        # split train into train/val
        subjectkey_case_train_val, percentile_case_train_val = case_train_val['subjectkey'].values, case_train_val['percentile'].values
        subjectkey_case_train, subjectkey_case_val, _, _ = train_test_split(subjectkey_case_train_val, percentile_case_train_val, test_size=0.125, shuffle=True, random_state=seed+1, stratify=percentile_case_train_val)
        #subjectkey_case_train, subjectkey_case_val, _, _ = train_test_split(subjectkey_case_train_val, percentile_case_train_val, test_size=0.13,random_state=seed+1, shuffle=True, stratify=percentile_case_train_val)
        case_val = concat_df(case_train_val, subjectkey_case_val)
        case_train = concat_df(case_train_val, subjectkey_case_train)

        
        # stratified sampling control of train/val
        control_val = control_train_val.groupby('percentile', group_keys=False).apply(lambda x: x.sample(frac=len(case_val)/len(control_train_val), random_state=seed))
        control_train = control_train_val.drop(axis=0, index=control_val.index).reset_index(drop=True)
        #control_train = pd.concat([control_train.drop(columns='percentile'), nan_train], axis=0)
        

        
        case_test['partition%s' % i] = ['test' for _ in range(len(case_test))]
        case_val['partition%s' % i] = ['val' for _ in range(len(case_val))]
        case_train['partition%s' % i] = ['train' for _ in range(len(case_train))]
        control_test['partition%s' % i] = ['test' for _ in range(len(control_test))]
        control_val['partition%s' % i] = ['val' for _ in range(len(control_val))]
        control_train['partition%s' % i] = ['train' for _ in range(len(control_train))]

        partition_df = pd.concat([case_train, control_train, case_val, control_val, case_test, control_test])
        partition_df = partition_df[['subjectkey', 'partition%s' % i]]
        partition_df = pd.concat([partition_df, case_add], axis=0)
        
        partition_result['partition%s' % i] = partition_df

    final_df = input_df.drop(columns=cat_cov + num_cov).copy()
    for i in range(n_partition):
        final_df = pd.merge(final_df, partition_result['partition%s' % i], how='left', on='subjectkey')

    return final_df
    




def partition_train_test_baseline(baseline_data, longitudinal_data,  train_size=0.75, val_size=0.05, test_size=0.2, n_partition=4, seed=1234): 
    baseline_data_tmp = baseline_data[['subjectkey','BMI_baseline', 'BMI_sds_baseline']]
    baseline_data_tmp = baseline_data_tmp.dropna(axis=0)

    num_total = len(baseline_data)
    num_train = int(num_total * train_size)
    num_test = int(num_total * test_size)
    num_val  = num_total - num_train - num_test

    phenotype_partition_final = {}
    for i in range(n_partition): 
        longitudinal_data_tmp = longitudinal_data[['subjectkey', 'partition%s' % i ]]
        phenotype_partition = pd.merge(baseline_data_tmp, longitudinal_data_tmp, how='left', left_on='subjectkey', right_on='subjectkey')
        phenotype_partition = phenotype_partition.drop(['BMI_baseline', 'BMI_sds_baseline'], axis=1)
        test_2y = phenotype_partition[phenotype_partition['partition%s' % i] == 'test']
        val_2y = phenotype_partition[phenotype_partition['partition%s' % i] == 'val']
        train_2y = phenotype_partition[phenotype_partition['partition%s' % i] == 'train']

        num_test_tmp = num_test - len(test_2y)
        num_val_tmp = num_val - len(val_2y)
        num_train_tmp = num_train - len(train_2y)

        phenotype_partition_all = phenotype_partition.drop(index=test_2y.index, axis=0).drop(index=val_2y.index, axis=0).drop(index=train_2y.index, axis=0)
        #phenotype_partition_all = phenotype_partition_all.sample(frac=1, random_state=seed).reset_index(drop=True)  # shuffling하고 index reset

        phenotype_partition_train = phenotype_partition_all.iloc[:num_train_tmp]
        phenotype_partition_test = phenotype_partition_all.iloc[num_train_tmp:num_train_tmp+num_test_tmp]
        phenotype_partition_val = phenotype_partition_all.iloc[num_train_tmp+num_test_tmp:]


        phenotype_partition_train['partition%s' % i] = ['train' for j in range(len(phenotype_partition_train))]
        phenotype_partition_test['partition%s' % i] = ['test' for j in range(len(phenotype_partition_test))]
        phenotype_partition_val['partition%s' % i] = ['val' for j in range(len(phenotype_partition_val))]
        
        phenotype_partition_final_tmp = pd.concat([phenotype_partition_train, train_2y, phenotype_partition_val, val_2y, phenotype_partition_test, test_2y])
        phenotype_partition_final['partition%s' % i] = phenotype_partition_final_tmp
        

    partitioned_data_baseline = baseline_data
    for i in range(n_partition): 
        partitioned_data_baseline = pd.merge(partitioned_data_baseline, phenotype_partition_final['partition%s' % i], how='left', left_on='subjectkey', right_on='subjectkey')

    return partitioned_data_baseline



def check_partition(input_df, target='become_overweight', split='train', n_percentile=5, n_partition=4): 
    q = np.linspace(0, 1, n_percentile+1)
    labels = np.linspace(0, n_percentile-1, n_percentile)

    # separate case/control
    case = input_df[input_df[target] == 1].reset_index(drop=True)
    control = input_df[input_df[target] == 0].reset_index(drop=True)
    num_total_case = len(case)
    num_total_train = len(control)

    # stratify 10 percentile (test_size = 0.1)
    percentile_class, _ = pd.qcut(case['BMI_sds_change'].values, q, labels=labels, retbins=True)
    case['percentile'] = percentile_class

    check = pd.merge(input_df, case[['subjectkey', 'percentile']], how='inner', on='subjectkey')

    for i in range(n_partition): 
        tmp = check[check['partition%s' % i] == split]
        print("In partition{}, {} have following counts for each percentile: {}".format(i, split, tmp['percentile'].value_counts().values.tolist()))


In [31]:
cat_cov = ['sex', 'abcd.site', 'race.ethnicity'] 
num_cov = ['age', 'high.educ', 'income'] 
phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype.rename(columns={'BMI_sds': 'BMI_sds_baseline', 'BMI': 'BMI_baseline'})
new_df_gain_tmp = new_df_gain.drop(columns=['age', 'income'])   # remove duplicated columns
new_df_gain_tmp = pd.merge(new_df_gain_tmp, phenotype_tmp[['subjectkey'] + cat_cov + num_cov], how='inner', on='subjectkey')
partitioned_data_2yafter = partition_stratified_train_test(new_df_gain_tmp, 'become_overweight', cat_cov=cat_cov, num_cov=num_cov, n_percentile=10, n_partition=5, seed=1234)
partitioned_data_baseline = partition_train_test_baseline(baseline_data=phenotype_tmp , longitudinal_data=partitioned_data_2yafter, n_partition=5, seed=1234)
partitioned_data_2yafter.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_2years_become_overweight_10PS_stratified_partitioned_5fold.csv', index=False)
partitioned_data_baseline.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_for_pretraining_2y_after_become_overweight_10PS_stratified_5fold.csv', index=False)

check_partition(partitioned_data_2yafter, target='become_overweight', split='train', n_percentile=10, n_partition=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ps_df_tmp['percentile'] = percentile_class


In partition0, train have following counts for each percentile: [42, 42, 41, 39, 38, 36, 35, 35, 33, 31]
In partition1, train have following counts for each percentile: [45, 40, 39, 39, 38, 37, 36, 35, 32, 31]
In partition2, train have following counts for each percentile: [44, 41, 41, 39, 37, 35, 34, 34, 34, 33]
In partition3, train have following counts for each percentile: [42, 41, 41, 38, 37, 36, 35, 35, 34, 33]
In partition4, train have following counts for each percentile: [41, 40, 38, 38, 38, 37, 37, 36, 34, 34]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_train['partition%s' % i] = ['train' for j in range(len(phenotype_partition_train))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_test['partition%s' % i] = ['test' for j in range(len(phenotype_partition_test))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [28]:
cat_cov = ['sex', 'abcd.site', 'race.ethnicity'] 
num_cov = ['age', 'high.educ', 'income'] 
phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype.rename(columns={'BMI_sds': 'BMI_sds_baseline', 'BMI': 'BMI_baseline'})
new_df_gain_tmp = new_df_gain.drop(columns=['age', 'income'])   # remove duplicated columns
new_df_gain_tmp = pd.merge(new_df_gain_tmp, phenotype_tmp[['subjectkey'] + cat_cov + num_cov], how='inner', on='subjectkey')
partitioned_data_1yafter = partition_stratified_train_test(new_df_gain_tmp, 'become_normal', cat_cov=cat_cov, num_cov=num_cov, n_percentile=10, n_partition=5, seed=1234)
partitioned_data_baseline = partition_train_test_baseline(baseline_data=phenotype_tmp , longitudinal_data=partitioned_data_1yafter, n_partition=5, seed=1234)
partitioned_data_1yafter.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_2years_become_normal_10PS_stratified_partitioned_5fold.csv', index=False)
partitioned_data_baseline.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_for_pretraining_2y_after_become_normal_10PS_stratified_5fold.csv', index=False)

check_partition(partitioned_data_1yafter, target='become_normal', split='train', n_percentile=10, n_partition=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ps_df_tmp['percentile'] = percentile_class
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_train['partition%s' % i] = ['train' for j in range(len(phenotype_partition_train))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_partition_test['partition%s' % i] = ['test' for 

In partition0, train have following counts for each percentile: [12, 11, 10, 9, 9, 9, 9, 8, 8, 7]
In partition1, train have following counts for each percentile: [12, 11, 10, 10, 9, 9, 8, 8, 8, 7]
In partition2, train have following counts for each percentile: [12, 10, 10, 10, 9, 9, 9, 8, 8, 7]
In partition3, train have following counts for each percentile: [12, 10, 10, 10, 10, 9, 9, 8, 8, 7]
In partition4, train have following counts for each percentile: [12, 11, 10, 10, 9, 9, 9, 8, 8, 7]


In [29]:
cat_cov = ['sex', 'abcd.site', 'race.ethnicity'] 
num_cov = ['age', 'high.educ', 'income'] 
phenotype = pd.read_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/ABCD_phenotype_total.csv')
phenotype_tmp = phenotype.rename(columns={'BMI_sds': 'BMI_sds_baseline', 'BMI': 'BMI_baseline'})
new_df_gain_tmp = new_df_gain.drop(columns=['age', 'income'])   # remove duplicated columns
new_df_gain_tmp = pd.merge(new_df_gain_tmp, phenotype_tmp[['subjectkey'] + cat_cov + num_cov], how='inner', on='subjectkey')
partitioned_data_1yafter = partition_stratified_train_test(new_df_gain_tmp, 'become_underweight', cat_cov=cat_cov, num_cov=num_cov, n_percentile=10, n_partition=5, seed=1234)
partitioned_data_baseline = partition_train_test_baseline(baseline_data=phenotype_tmp , longitudinal_data=partitioned_data_1yafter, n_partition=5, seed=1234)
partitioned_data_1yafter.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_2years_become_underweight_10PS_stratified_partitioned_5fold.csv', index=False)
partitioned_data_baseline.to_csv('/Users/wangheehwan/Desktop/CNN_for_BMI/phenotype_data/10PS/ABCD_phenotype_total_for_pretraining_2y_after_become_underweight_10PS_stratified_5fold.csv', index=False)

check_partition(partitioned_data_1yafter, target='become_underweight', split='train', n_percentile=10, n_partition=5)

KeyError: "['become_underweight'] not in index"

In [753]:
num_underweight_underweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_2year'] == 'underweight') & (phenotype_gain['BMI_gain'] == 0)]) 
num_underweight_normal = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_2year'] == 'normal') & (phenotype_gain['BMI_gain'] == 1)]) 
num_normal_normal = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_2year'] == 'normal') & (phenotype_gain['BMI_gain'] == 0)]) 
num_normal_overweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'normal') & (phenotype_gain['BMI_status_2year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 1)]) 
num_overweight_overweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'overweight') & (phenotype_gain['BMI_status_2year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 0)])
num_underweight_overweight = len(phenotype_gain.loc[(phenotype_gain['BMI_status_baseline'] == 'underweight') & (phenotype_gain['BMI_status_2year'] == 'overweight') & (phenotype_gain['BMI_gain'] == 1)])

print("underweight -> underweight: {}".format(num_underweight_underweight))
print("underweight -> normal: {}".format(num_underweight_normal))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> overweight: {}".format(num_normal_overweight))
print("overweight -> overweight: {}".format(num_overweight_overweight))
print("underweight -> overweight: {}".format(num_underweight_overweight))

underweight -> underweight: 35
underweight -> normal: 128
normal -> normal: 1467
normal -> overweight: 516
overweight -> overweight: 1090
underweight -> overweight: 6


In [754]:

num_overweight_overweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_2year'] == 'overweight') & (phenotype_loss['BMI_loss'] == 0)])
num_overweight_normal = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_2year'] == 'normal') & (phenotype_loss['BMI_loss'] == 1)])
num_overweight_underweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'overweight') & (phenotype_loss['BMI_status_2year'] == 'underweight') & (phenotype_loss['BMI_loss'] == 1)])
num_normal_normal = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'normal') & (phenotype_loss['BMI_status_2year'] == 'normal') & (phenotype_loss['BMI_loss'] == 0)]) 
num_normal_underweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'normal') & (phenotype_loss['BMI_status_2year'] == 'underweight') & (phenotype_loss['BMI_loss'] == 1)]) 
num_underweight_underweight = len(phenotype_loss.loc[(phenotype_loss['BMI_status_baseline'] == 'underweight') & (phenotype_loss['BMI_status_2year'] == 'underweight') & (phenotype_loss['BMI_loss'] == 0)]) 


print("overweight -> overweight: {}".format(num_overweight_overweight))
print("overweight -> normal: {}".format(num_overweight_normal))
print("overweight -> underweight: {}".format(num_overweight_underweight))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> underweight: {}".format(num_normal_underweight))
print("underweight -> underweight: {}".format(num_underweight_underweight))

overweight -> overweight: 1090
overweight -> normal: 322
overweight -> underweight: 2
normal -> normal: 1467
normal -> underweight: 123
underweight -> underweight: 35


In [787]:
sex = phenotype[['subjectkey', 'sex']]
phenotype_gain_sex = pd.merge(phenotype_gain, sex, on='subjectkey', how='inner')
phenotype_loss_sex = pd.merge(phenotype_loss, sex, on='subjectkey', how='inner')




In [788]:
num_underweight_underweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'underweight') & (phenotype_gain_sex['BMI_status_2year'] == 'underweight') & (phenotype_gain_sex['BMI_gain'] == 0) & (phenotype_gain_sex['sex'] == 1)]) 
num_underweight_normal = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'underweight') & (phenotype_gain_sex['BMI_status_2year'] == 'normal') & (phenotype_gain_sex['BMI_gain'] == 1) & (phenotype_gain_sex['sex'] == 1)]) 
num_normal_normal = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'normal') & (phenotype_gain_sex['BMI_status_2year'] == 'normal') & (phenotype_gain_sex['BMI_gain'] == 0) & (phenotype_gain_sex['sex'] == 1)]) 
num_normal_overweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'normal') & (phenotype_gain_sex['BMI_status_2year'] == 'overweight') & (phenotype_gain_sex['BMI_gain'] == 1) & (phenotype_gain_sex['sex'] == 1)]) 
num_overweight_overweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'overweight') & (phenotype_gain_sex['BMI_status_2year'] == 'overweight') & (phenotype_gain_sex['BMI_gain'] == 0) & (phenotype_gain_sex['sex'] == 1)])
num_underweight_overweight = len(phenotype_gain_sex.loc[(phenotype_gain_sex['BMI_status_baseline'] == 'underweight') & (phenotype_gain_sex['BMI_status_2year'] == 'overweight') & (phenotype_gain_sex['BMI_gain'] == 1) & (phenotype_gain_sex['sex'] == 1)])

print("underweight -> underweight: {}".format(num_underweight_underweight))
print("underweight -> normal: {}".format(num_underweight_normal))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> overweight: {}".format(num_normal_overweight))
print("overweight -> overweight: {}".format(num_overweight_overweight))
print("underweight -> overweight: {}".format(num_underweight_overweight))

underweight -> underweight: 20
underweight -> normal: 65
normal -> normal: 787
normal -> overweight: 270
overweight -> overweight: 604
underweight -> overweight: 4


In [789]:

num_overweight_overweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'overweight') & (phenotype_loss_sex['BMI_status_2year'] == 'overweight') & (phenotype_loss_sex['BMI_loss'] == 0) & (phenotype_loss_sex['sex'] == 1)])
num_overweight_normal = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'overweight') & (phenotype_loss_sex['BMI_status_2year'] == 'normal') & (phenotype_loss_sex['BMI_loss'] == 1) & (phenotype_loss_sex['sex'] == 1)])
num_overweight_underweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'overweight') & (phenotype_loss_sex['BMI_status_2year'] == 'underweight') & (phenotype_loss_sex['BMI_loss'] == 1) & (phenotype_loss_sex['sex'] == 1)])
num_normal_normal = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'normal') & (phenotype_loss_sex['BMI_status_2year'] == 'normal') & (phenotype_loss_sex['BMI_loss'] == 0) & (phenotype_loss_sex['sex'] == 1)]) 
num_normal_underweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'normal') & (phenotype_loss_sex['BMI_status_2year'] == 'underweight') & (phenotype_loss_sex['BMI_loss'] == 1) & (phenotype_loss_sex['sex'] == 1)]) 
num_underweight_underweight = len(phenotype_loss_sex.loc[(phenotype_loss_sex['BMI_status_baseline'] == 'underweight') & (phenotype_loss_sex['BMI_status_2year'] == 'underweight') & (phenotype_loss_sex['BMI_loss'] == 0) & (phenotype_loss_sex['sex'] == 1)]) 


print("overweight -> overweight: {}".format(num_overweight_overweight))
print("overweight -> normal: {}".format(num_overweight_normal))
print("overweight -> underweight: {}".format(num_overweight_underweight))
print("normal -> normal: {}".format(num_normal_normal))
print("normal -> underweight: {}".format(num_normal_underweight))
print("underweight -> underweight: {}".format(num_underweight_underweight))

overweight -> overweight: 604
overweight -> normal: 171
overweight -> underweight: 2
normal -> normal: 787
normal -> underweight: 84
underweight -> underweight: 20
