In [1]:
import sys
import pandas as pd
sys.path.insert(0, ".././0.data-download/scripts/")
from data_loader import load_data
from scipy.stats import ttest_ind
import pathlib

In [2]:
latent_df = pd.read_csv("../2.train-VAE/results/latent_df.csv")
metadata_df = pd.read_csv(".././0.data-download/data/metadata_df.csv")
data_dir = "../0.data-download/data/"
model_df, dependency_df = load_data(data_dir, adult_or_pediatric="all")


In [3]:
# Creating categorized lists of sample IDs used in BVAE training
# note that 10 of the 912 used samples have Unknown Sex

ped_ids = metadata_df.query("AgeCategory == 'Pediatric'").ModelID.tolist()
adult_ids = metadata_df.query("AgeCategory == 'Adult'").ModelID.tolist()
male_ids = metadata_df.query("Sex == 'Male'").ModelID.tolist()
female_ids = metadata_df.query("Sex == 'Female'").ModelID.tolist()
ped_male_ids = metadata_df.query("AgeCategory == 'Pediatric'").query("Sex == 'Male'").ModelID.tolist()
adult_male_ids = metadata_df.query("AgeCategory == 'Adult'").query("Sex == 'Male'").ModelID.tolist()
ped_female_ids = metadata_df.query("AgeCategory == 'Pediatric'").query("Sex == 'Female'").ModelID.tolist()
adult_female_ids = metadata_df.query("AgeCategory == 'Adult'").query("Sex == 'Female'").ModelID.tolist()

In [4]:
# Generating latent dataframes for each category and dropping the id column to prep for t tests

adult_latent_df = latent_df.copy()
for index, row in adult_latent_df.iterrows():
    if row['ModelID'] not in adult_ids:
        adult_latent_df.drop(index, inplace=True)
adult_latent_df_float = adult_latent_df.drop(columns=["ModelID"])
adult_latent_df_float.reset_index(drop=True, inplace=True)

ped_latent_df = latent_df.copy()
for index, row in ped_latent_df.iterrows():
    if row['ModelID'] not in ped_ids:
        ped_latent_df.drop(index, inplace=True)
ped_latent_df_float = ped_latent_df.drop(columns=["ModelID"])
ped_latent_df_float.reset_index(drop=True, inplace=True)

male_latent_df = latent_df.copy()
for index, row in male_latent_df.iterrows():
    if row['ModelID'] not in male_ids:
        male_latent_df.drop(index, inplace=True)
male_latent_df_float = male_latent_df.drop(columns=["ModelID"])
male_latent_df_float.reset_index(drop=True, inplace=True)

female_latent_df = latent_df.copy()
for index, row in female_latent_df.iterrows():
    if row['ModelID'] not in female_ids:
        female_latent_df.drop(index, inplace=True)
female_latent_df_float = female_latent_df.drop(columns=["ModelID"])
female_latent_df_float.reset_index(drop=True, inplace=True)

ped_male_latent_df = latent_df.copy()
for index, row in ped_male_latent_df.iterrows():
    if row['ModelID'] not in ped_male_ids:
        ped_male_latent_df.drop(index, inplace=True)
ped_male_latent_df_float = ped_male_latent_df.drop(columns=["ModelID"])
ped_male_latent_df_float.reset_index(drop=True, inplace=True)

adult_male_latent_df = latent_df.copy()
for index, row in adult_male_latent_df.iterrows():
    if row['ModelID'] not in adult_male_ids:
        adult_male_latent_df.drop(index, inplace=True)
adult_male_latent_df_float = adult_male_latent_df.drop(columns=["ModelID"])
adult_male_latent_df_float.reset_index(drop=True, inplace=True)

ped_female_latent_df = latent_df.copy()
for index, row in ped_female_latent_df.iterrows():
    if row['ModelID'] not in ped_female_ids:
        ped_female_latent_df.drop(index, inplace=True)
ped_female_latent_df_float = ped_female_latent_df.drop(columns=["ModelID"])
ped_female_latent_df_float.reset_index(drop=True, inplace=True)

adult_female_latent_df = latent_df.copy()
for index, row in adult_female_latent_df.iterrows():
    if row['ModelID'] not in adult_female_ids:
        adult_female_latent_df.drop(index, inplace=True)
adult_female_latent_df_float = adult_female_latent_df.drop(columns=["ModelID"])
adult_female_latent_df_float.reset_index(drop=True, inplace=True)

In [5]:
# t tests comparing adult vs ped for each latent dimension

t_test_adult_vs_ped = ttest_ind(adult_latent_df_float, ped_latent_df_float)
t_test_adult_vs_ped = pd.DataFrame(t_test_adult_vs_ped).T
t_test_adult_vs_ped.columns = ["t_stat", "p_value"]
t_test_adult_vs_ped['comparison'] = 'Adult vs Pediatric'
t_test_adult_vs_ped['latent feature'] = t_test_adult_vs_ped.index + 1
print(t_test_adult_vs_ped.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_vs_ped.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,4.204976,2.869108e-05,Adult vs Pediatric,1
1,6.661994,4.671631e-11,Adult vs Pediatric,2
2,-2.733295,0.006392053,Adult vs Pediatric,3
3,0.103723,0.917412,Adult vs Pediatric,4
4,7.630505,5.892318e-14,Adult vs Pediatric,5


In [6]:
# t tests comparing male vs female for each latent dimension

t_test_male_vs_female = ttest_ind(male_latent_df_float, female_latent_df_float)
t_test_male_vs_female = pd.DataFrame(t_test_male_vs_female).T
t_test_male_vs_female.columns = ["t_stat", "p_value"]
t_test_male_vs_female['comparison'] = 'Male vs Female'
t_test_male_vs_female['latent feature'] = t_test_male_vs_female.index + 1
print(t_test_male_vs_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_male_vs_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,-2.198056,0.028199,Male vs Female,1
1,-0.547054,0.584477,Male vs Female,2
2,0.175154,0.860998,Male vs Female,3
3,-0.83798,0.402265,Male vs Female,4
4,-0.287226,0.774005,Male vs Female,5


In [7]:
# t tests comparing adult male vs ped male for each latent dimension

t_test_adult_male_vs_ped_male = ttest_ind(ped_male_latent_df_float, adult_male_latent_df_float)
t_test_adult_male_vs_ped_male = pd.DataFrame(t_test_adult_male_vs_ped_male).T
t_test_adult_male_vs_ped_male.columns = ["t_stat", "p_value"]
t_test_adult_male_vs_ped_male['comparison'] = 'Adult Male vs Pediatric Male'
t_test_adult_male_vs_ped_male['latent feature'] = t_test_adult_male_vs_ped_male.index + 1
print(t_test_adult_male_vs_ped_male.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_male_vs_ped_male.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,-1.925206,0.05475501,Adult Male vs Pediatric Male,1
1,-5.099418,4.798707e-07,Adult Male vs Pediatric Male,2
2,3.187022,0.001524894,Adult Male vs Pediatric Male,3
3,0.843206,0.3995054,Adult Male vs Pediatric Male,4
4,-6.210699,1.089312e-09,Adult Male vs Pediatric Male,5


In [8]:
# t tests comparing adult female vs ped female for each latent dimension

t_test_adult_female_vs_ped_female = ttest_ind(ped_female_latent_df_float, adult_female_latent_df_float)
t_test_adult_female_vs_ped_female = pd.DataFrame(t_test_adult_female_vs_ped_female).T
t_test_adult_female_vs_ped_female.columns = ["t_stat", "p_value"]
t_test_adult_female_vs_ped_female['comparison'] = 'Adult Female vs Pediatric Female'
t_test_adult_female_vs_ped_female['latent feature'] = t_test_adult_female_vs_ped_female.index + 1
print(t_test_adult_female_vs_ped_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_female_vs_ped_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,-4.017857,7.1e-05,Adult Female vs Pediatric Female,1
1,-3.903957,0.000112,Adult Female vs Pediatric Female,2
2,-0.21099,0.833007,Adult Female vs Pediatric Female,3
3,-0.601081,0.548141,Adult Female vs Pediatric Female,4
4,-3.783167,0.00018,Adult Female vs Pediatric Female,5


In [9]:
# t tests comparing ped male vs ped female for each latent dimension

t_test_ped_male_vs_ped_female = ttest_ind(ped_female_latent_df_float, ped_male_latent_df_float)
t_test_ped_male_vs_ped_female = pd.DataFrame(t_test_ped_male_vs_ped_female).T
t_test_ped_male_vs_ped_female.columns = ["t_stat", "p_value"]
t_test_ped_male_vs_ped_female['comparison'] = 'Pediatric Male vs Pediatric Female'
t_test_ped_male_vs_ped_female['latent feature'] = t_test_ped_male_vs_ped_female.index + 1
print(t_test_ped_male_vs_ped_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_ped_male_vs_ped_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,-0.92474,0.356841,Pediatric Male vs Pediatric Female,1
1,0.834518,0.405544,Pediatric Male vs Pediatric Female,2
2,-1.942552,0.054266,Pediatric Male vs Pediatric Female,3
3,-0.563845,0.573846,Pediatric Male vs Pediatric Female,4
4,1.655913,0.100189,Pediatric Male vs Pediatric Female,5


In [10]:
# t tests comparing adult male vs adult female for each latent dimension

t_test_adult_male_vs_adult_female = ttest_ind(adult_female_latent_df_float, adult_male_latent_df_float)
t_test_adult_male_vs_adult_female = pd.DataFrame(t_test_adult_male_vs_adult_female).T
t_test_adult_male_vs_adult_female.columns = ["t_stat", "p_value"]
t_test_adult_male_vs_adult_female['comparison'] = 'Adult Male vs Adult Female'
t_test_adult_male_vs_adult_female['latent feature'] = t_test_adult_male_vs_adult_female.index + 1
print(t_test_adult_male_vs_adult_female.shape)

# Sort to show latent dimensions with most significant p values
t_test_adult_male_vs_adult_female.head(5)

(56, 4)


Unnamed: 0,t_stat,p_value,comparison,latent feature
0,2.635971,0.008559,Adult Male vs Adult Female,1
1,0.283528,0.776849,Adult Male vs Adult Female,2
2,0.714323,0.475244,Adult Male vs Adult Female,3
3,1.175411,0.240194,Adult Male vs Adult Female,4
4,-0.266579,0.789865,Adult Male vs Adult Female,5


In [11]:
# Combining and saving t test results
t_test_results_df = pd.concat([t_test_adult_vs_ped, t_test_male_vs_female, t_test_adult_male_vs_ped_male, t_test_adult_female_vs_ped_female, t_test_ped_male_vs_ped_female, t_test_adult_male_vs_adult_female])
t_test_results_dir = pathlib.Path("./results/t_test_results.tsv")
t_test_results_df.to_csv(t_test_results_dir, sep="\t")

t_test_results_df.sort_values(by='p_value', ascending = True)

Unnamed: 0,t_stat,p_value,comparison,latent feature
4,7.630505,5.892318e-14,Adult vs Pediatric,5
1,6.661994,4.671631e-11,Adult vs Pediatric,2
4,-6.210699,1.089312e-09,Adult Male vs Pediatric Male,5
38,-5.705936,1.564675e-08,Adult vs Pediatric,39
36,5.361181,1.048564e-07,Adult vs Pediatric,37
...,...,...,...,...
37,0.057482,9.541744e-01,Male vs Female,38
40,0.050751,9.595371e-01,Adult Male vs Adult Female,41
29,0.048483,9.613419e-01,Male vs Female,30
50,-0.041505,9.669151e-01,Adult Female vs Pediatric Female,51
