In [1]:
import sys
import pandas as pd
sys.path.insert(0, ".././0.data-download/scripts/")
from data_loader import load_data
from scipy.stats import ttest_ind

In [2]:
# Directory holding the downloaded inputs. Defined first so that every path
# into it is derived from one place; the trailing slash is kept because the
# very same string is handed to load_data below.
data_dir = "../0.data-download/data/"

# Latent representation written by the VAE training step.
latent_df = pd.read_csv("../2.train-VAE/results/latent_df.csv")

# Per-sample metadata; later cells read its ModelID, AgeCategory and Sex columns.
metadata_df = pd.read_csv(data_dir + "metadata_df.csv")

# NOTE(review): load_data is a project helper whose return schema is not
# visible from this notebook -- confirm what the two frames contain.
model_df, dependency_df = load_data(data_dir, adult_or_pediatric="all")


In [3]:
# Creating categorized lists of sample IDs used in BVAE training
# note that 10 of the 912 used samples have Unknown Sex
#
# Each list holds the ModelID of every metadata row carrying the given label,
# in the row order of metadata_df. Rows whose AgeCategory / Sex is any other
# value (or missing) end up in neither list of the corresponding pair.
# Boolean-mask selection replaces a row-by-row iterrows() loop and yields the
# same lists.

ped_ids = metadata_df.loc[metadata_df["AgeCategory"] == "Pediatric", "ModelID"].tolist()
adult_ids = metadata_df.loc[metadata_df["AgeCategory"] == "Adult", "ModelID"].tolist()
male_ids = metadata_df.loc[metadata_df["Sex"] == "Male", "ModelID"].tolist()
female_ids = metadata_df.loc[metadata_df["Sex"] == "Female", "ModelID"].tolist()


In [4]:
# Generating latent dataframes for each category and dropping the id column to prep for t tests


def _subset_latent_by_ids(latent: pd.DataFrame, model_ids) -> tuple:
    """Select the latent rows belonging to one group of samples.

    Parameters
    ----------
    latent : pd.DataFrame
        Latent frame containing a "ModelID" column.
    model_ids : list-like
        ModelID values to keep.

    Returns
    -------
    tuple
        ``(subset, subset_float)``: ``subset`` is an independent copy of the
        matching rows that keeps the "ModelID" column and the original index
        labels; ``subset_float`` is the same rows without "ModelID" and with
        the index renumbered from 0.
    """
    # One vectorized membership test instead of dropping rows one at a time
    # inside an iterrows() loop.
    subset = latent.loc[latent["ModelID"].isin(model_ids)].copy()
    subset_float = subset.drop(columns=["ModelID"]).reset_index(drop=True)
    return subset, subset_float


adult_latent_df, adult_latent_df_float = _subset_latent_by_ids(latent_df, adult_ids)
ped_latent_df, ped_latent_df_float = _subset_latent_by_ids(latent_df, ped_ids)
male_latent_df, male_latent_df_float = _subset_latent_by_ids(latent_df, male_ids)
female_latent_df, female_latent_df_float = _subset_latent_by_ids(latent_df, female_ids)

In [5]:
# Independent two-sample t test, adult vs pediatric, run column-wise so that
# every latent dimension gets its own statistic and p value.
# ttest_ind is called with its default settings.
# NOTE(review): the p values are not adjusted for multiple comparisons.

adult_vs_ped_t, adult_vs_ped_p = ttest_ind(adult_latent_df_float, ped_latent_df_float)
t_test_adult_vs_ped = pd.DataFrame(
    {"t_stat": adult_vs_ped_t, "p_value": adult_vs_ped_p}
)
print(t_test_adult_vs_ped.shape)

# Smallest p values first, i.e. the dimensions that differ most between groups
t_test_adult_vs_ped.sort_values("p_value")

(70, 2)


Unnamed: 0,t_stat,p_value
23,3.422550,0.000648
2,3.302834,0.000994
61,3.128999,0.001810
62,2.643559,0.008345
1,-2.540298,0.011241
...,...,...
30,0.073079,0.941760
27,-0.063380,0.949478
29,-0.053562,0.957296
34,0.021316,0.982999


In [6]:
# Independent two-sample t test, male vs female, run column-wise so that
# every latent dimension gets its own statistic and p value.
# ttest_ind is called with its default settings.
# NOTE(review): the p values are not adjusted for multiple comparisons.

male_vs_female_t, male_vs_female_p = ttest_ind(male_latent_df_float, female_latent_df_float)
t_test_male_vs_female = pd.DataFrame(
    {"t_stat": male_vs_female_t, "p_value": male_vs_female_p}
)
print(t_test_male_vs_female.shape)

# Smallest p values first, i.e. the dimensions that differ most between groups
t_test_male_vs_female.sort_values("p_value")

(70, 2)


Unnamed: 0,t_stat,p_value
66,2.941344,0.003352
62,-2.623810,0.008842
36,2.616775,0.009025
57,2.088592,0.037025
42,2.077654,0.038024
...,...,...
4,-0.107486,0.914427
55,-0.060815,0.951520
23,0.056670,0.954821
20,-0.055038,0.956121
