In [1]:
import sys
import pandas as pd
sys.path.insert(0, ".././0.data-download/scripts/")
from data_loader import load_data
from scipy.stats import ttest_ind
import pathlib

In [2]:
latent_df = pd.read_csv("../2.train-VAE/results/latent_df.csv")
metadata_df = pd.read_csv(".././0.data-download/data/metadata_df.csv")
data_dir = "../0.data-download/data/"
model_df, dependency_df = load_data(data_dir, adult_or_pediatric="all")


In [3]:
# Creating categorized lists of sample IDs used in BVAE training
# note that 10 of the 912 used samples have Unknown Sex

ped_ids = metadata_df.query("AgeCategory == 'Pediatric'").ModelID.tolist()
adult_ids = metadata_df.query("AgeCategory == 'Adult'").ModelID.tolist()
male_ids = metadata_df.query("Sex == 'Male'").ModelID.tolist()
female_ids = metadata_df.query("Sex == 'Female'").ModelID.tolist()
ped_male_ids = metadata_df.query("AgeCategory == 'Pediatric'").query("Sex == 'Male'").ModelID.tolist()
adult_male_ids = metadata_df.query("AgeCategory == 'Adult'").query("Sex == 'Male'").ModelID.tolist()
ped_female_ids = metadata_df.query("AgeCategory == 'Pediatric'").query("Sex == 'Female'").ModelID.tolist()
adult_female_ids = metadata_df.query("AgeCategory == 'Adult'").query("Sex == 'Female'").ModelID.tolist()

In [4]:
# Generating latent dataframes for each category and dropping the id column to prep for t tests

adult_latent_df = latent_df.copy()
for index, row in adult_latent_df.iterrows():
    if row['ModelID'] not in adult_ids:
        adult_latent_df.drop(index, inplace=True)
adult_latent_df_float = adult_latent_df.drop(columns=["ModelID"])
adult_latent_df_float.reset_index(drop=True, inplace=True)

ped_latent_df = latent_df.copy()
for index, row in ped_latent_df.iterrows():
    if row['ModelID'] not in ped_ids:
        ped_latent_df.drop(index, inplace=True)
ped_latent_df_float = ped_latent_df.drop(columns=["ModelID"])
ped_latent_df_float.reset_index(drop=True, inplace=True)

male_latent_df = latent_df.copy()
for index, row in male_latent_df.iterrows():
    if row['ModelID'] not in male_ids:
        male_latent_df.drop(index, inplace=True)
male_latent_df_float = male_latent_df.drop(columns=["ModelID"])
male_latent_df_float.reset_index(drop=True, inplace=True)

female_latent_df = latent_df.copy()
for index, row in female_latent_df.iterrows():
    if row['ModelID'] not in female_ids:
        female_latent_df.drop(index, inplace=True)
female_latent_df_float = female_latent_df.drop(columns=["ModelID"])
female_latent_df_float.reset_index(drop=True, inplace=True)

ped_male_latent_df = latent_df.copy()
for index, row in ped_male_latent_df.iterrows():
    if row['ModelID'] not in ped_male_ids:
        ped_male_latent_df.drop(index, inplace=True)
ped_male_latent_df_float = ped_male_latent_df.drop(columns=["ModelID"])
ped_male_latent_df_float.reset_index(drop=True, inplace=True)

adult_male_latent_df = latent_df.copy()
for index, row in adult_male_latent_df.iterrows():
    if row['ModelID'] not in adult_male_ids:
        adult_male_latent_df.drop(index, inplace=True)
adult_male_latent_df_float = adult_male_latent_df.drop(columns=["ModelID"])
adult_male_latent_df_float.reset_index(drop=True, inplace=True)

ped_female_latent_df = latent_df.copy()
for index, row in ped_female_latent_df.iterrows():
    if row['ModelID'] not in ped_female_ids:
        ped_female_latent_df.drop(index, inplace=True)
ped_female_latent_df_float = ped_female_latent_df.drop(columns=["ModelID"])
ped_female_latent_df_float.reset_index(drop=True, inplace=True)

adult_female_latent_df = latent_df.copy()
for index, row in adult_female_latent_df.iterrows():
    if row['ModelID'] not in adult_female_ids:
        adult_female_latent_df.drop(index, inplace=True)
adult_female_latent_df_float = adult_female_latent_df.drop(columns=["ModelID"])
adult_female_latent_df_float.reset_index(drop=True, inplace=True)

In [5]:
# t tests comparing adult vs ped for each latent dimension

t_test_adult_vs_ped = ttest_ind(adult_latent_df_float, ped_latent_df_float)
t_test_adult_vs_ped = pd.DataFrame(t_test_adult_vs_ped).T
t_test_adult_vs_ped.columns = ["t_stat", "p_value"]
t_test_adult_vs_ped['comparison'] = 'Adult vs Pediatric'
t_test_adult_vs_ped['latent feature'] = t_test_adult_vs_ped.index + 1
print(t_test_adult_vs_ped.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_vs_ped.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,2.619217,0.00895984,Adult vs Pediatric,1
1,-1.024201,0.3060126,Adult vs Pediatric,2
2,-4.715474,2.789251e-06,Adult vs Pediatric,3
3,4.627759,4.2328e-06,Adult vs Pediatric,4
4,5.356734,1.073884e-07,Adult vs Pediatric,5


In [6]:
# t tests comparing male vs female for each latent dimension

t_test_male_vs_female = ttest_ind(male_latent_df_float, female_latent_df_float)
t_test_male_vs_female = pd.DataFrame(t_test_male_vs_female).T
t_test_male_vs_female.columns = ["t_stat", "p_value"]
t_test_male_vs_female['comparison'] = 'Male vs Female'
t_test_male_vs_female['latent feature'] = t_test_male_vs_female.index + 1
print(t_test_male_vs_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_male_vs_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,0.55339,0.580134,Male vs Female,1
1,-0.194792,0.8456,Male vs Female,2
2,0.637202,0.524156,Male vs Female,3
3,-2.263743,0.023827,Male vs Female,4
4,1.028306,0.304082,Male vs Female,5


In [7]:
# t tests comparing adult male vs ped male for each latent dimension

t_test_adult_male_vs_ped_male = ttest_ind(ped_male_latent_df_float, adult_male_latent_df_float)
t_test_adult_male_vs_ped_male = pd.DataFrame(t_test_adult_male_vs_ped_male).T
t_test_adult_male_vs_ped_male.columns = ["t_stat", "p_value"]
t_test_adult_male_vs_ped_male['comparison'] = 'Adult Male vs Pediatric Male'
t_test_adult_male_vs_ped_male['latent feature'] = t_test_adult_male_vs_ped_male.index + 1
print(t_test_adult_male_vs_ped_male.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_male_vs_ped_male.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,-0.846383,0.397733,Adult Male vs Pediatric Male,1
1,0.455944,0.648623,Adult Male vs Pediatric Male,2
2,2.570098,0.010448,Adult Male vs Pediatric Male,3
3,-3.058151,0.002343,Adult Male vs Pediatric Male,4
4,-3.84529,0.000136,Adult Male vs Pediatric Male,5


In [8]:
# t tests comparing adult female vs ped female for each latent dimension

t_test_adult_female_vs_ped_female = ttest_ind(ped_female_latent_df_float, adult_female_latent_df_float)
t_test_adult_female_vs_ped_female = pd.DataFrame(t_test_adult_female_vs_ped_female).T
t_test_adult_female_vs_ped_female.columns = ["t_stat", "p_value"]
t_test_adult_female_vs_ped_female['comparison'] = 'Adult Female vs Pediatric Female'
t_test_adult_female_vs_ped_female['latent feature'] = t_test_adult_female_vs_ped_female.index + 1
print(t_test_adult_female_vs_ped_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_female_vs_ped_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,-2.940364,0.003477,Adult Female vs Pediatric Female,1
1,0.964662,0.335321,Adult Female vs Pediatric Female,2
2,3.991287,7.9e-05,Adult Female vs Pediatric Female,3
3,-3.177277,0.001607,Adult Female vs Pediatric Female,4
4,-3.321331,0.000982,Adult Female vs Pediatric Female,5


In [9]:
# t tests comparing ped male vs ped female for each latent dimension

t_test_ped_male_vs_ped_female = ttest_ind(ped_female_latent_df_float, ped_male_latent_df_float)
t_test_ped_male_vs_ped_female = pd.DataFrame(t_test_ped_male_vs_ped_female).T
t_test_ped_male_vs_ped_female.columns = ["t_stat", "p_value"]
t_test_ped_male_vs_ped_female['comparison'] = 'Pediatric Male vs Pediatric Female'
t_test_ped_male_vs_ped_female['latent feature'] = t_test_ped_male_vs_ped_female.index + 1
print(t_test_ped_male_vs_ped_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_ped_male_vs_ped_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,-2.106411,0.037121,Pediatric Male vs Pediatric Female,1
1,0.44536,0.656812,Pediatric Male vs Pediatric Female,2
2,0.826377,0.410128,Pediatric Male vs Pediatric Female,3
3,0.469335,0.639629,Pediatric Male vs Pediatric Female,4
4,-0.298767,0.765602,Pediatric Male vs Pediatric Female,5


In [10]:
# t tests comparing adult male vs adult female for each latent dimension

t_test_adult_male_vs_adult_female = ttest_ind(adult_female_latent_df_float, adult_male_latent_df_float)
t_test_adult_male_vs_adult_female = pd.DataFrame(t_test_adult_male_vs_adult_female).T
t_test_adult_male_vs_adult_female.columns = ["t_stat", "p_value"]
t_test_adult_male_vs_adult_female['comparison'] = 'Adult Male vs Adult Female'
t_test_adult_male_vs_adult_female['latent feature'] = t_test_adult_male_vs_adult_female.index + 1
print(t_test_adult_male_vs_adult_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_male_vs_adult_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,0.131214,0.89564,Adult Male vs Adult Female,1
1,0.024031,0.980834,Adult Male vs Adult Female,2
2,-1.022803,0.306722,Adult Male vs Adult Female,3
3,2.231559,0.025931,Adult Male vs Adult Female,4
4,-1.022106,0.307052,Adult Male vs Adult Female,5


In [11]:
# Combining and saving t test results
t_test_results_df = pd.concat([t_test_adult_vs_ped, t_test_male_vs_female, t_test_adult_male_vs_ped_male, t_test_adult_female_vs_ped_female, t_test_ped_male_vs_ped_female, t_test_adult_male_vs_adult_female])
t_test_results_dir = pathlib.Path("./results/t_test_results.tsv")
t_test_results_df.to_csv(t_test_results_dir, sep="\t")

t_test_results_df.sort_values(by='p_value', ascending = True)

Unnamed: 0,t_stat,p_value,comparison,latent feature
43,-8.045795,2.661993e-15,Adult vs Pediatric,44
47,6.619739,6.140800e-11,Adult vs Pediatric,48
33,-6.013628,2.624606e-09,Adult vs Pediatric,34
48,5.734514,1.330195e-08,Adult vs Pediatric,49
43,5.699702,2.024621e-08,Adult Male vs Pediatric Male,44
...,...,...,...,...
1,0.024031,9.808340e-01,Adult Male vs Adult Female,2
31,0.022519,9.820396e-01,Adult Male vs Adult Female,32
40,0.020838,9.833791e-01,Male vs Female,41
44,-0.010618,9.915310e-01,Adult Male vs Adult Female,45
