In [1]:
import sys
import pandas as pd
sys.path.insert(0, ".././0.data-download/scripts/")
from data_loader import load_data
from scipy.stats import ttest_ind 
from scipy.stats import f_oneway
import pathlib

In [2]:
latent_df = pd.read_parquet("../2.train-VAE/results/latent_df.parquet")
metadata_df = pd.read_parquet(".././0.data-download/data/metadata_df.parquet")
data_dir = "../0.data-download/data/"
model_df, dependency_df = load_data(data_dir, adult_or_pediatric="all")


In [3]:
# Creating categorized lists of sample IDs used in BVAE training
# note that 10 of the 912 used samples have Unknown Sex

ped_ids = metadata_df.query("AgeCategory == 'Pediatric'").ModelID.tolist()
adult_ids = metadata_df.query("AgeCategory == 'Adult'").ModelID.tolist()
male_ids = metadata_df.query("Sex == 'Male'").ModelID.tolist()
female_ids = metadata_df.query("Sex == 'Female'").ModelID.tolist()
ped_male_ids = metadata_df.query("AgeCategory == 'Pediatric'").query("Sex == 'Male'").ModelID.tolist()
adult_male_ids = metadata_df.query("AgeCategory == 'Adult'").query("Sex == 'Male'").ModelID.tolist()
ped_female_ids = metadata_df.query("AgeCategory == 'Pediatric'").query("Sex == 'Female'").ModelID.tolist()
adult_female_ids = metadata_df.query("AgeCategory == 'Adult'").query("Sex == 'Female'").ModelID.tolist()

In [4]:
# Generating latent dataframes for each category and dropping the id column to prep for t tests

adult_latent_df = latent_df.copy()
for index, row in adult_latent_df.iterrows():
    if row['ModelID'] not in adult_ids:
        adult_latent_df.drop(index, inplace=True)
adult_latent_df_float = adult_latent_df.drop(columns=["ModelID"])
adult_latent_df_float.reset_index(drop=True, inplace=True)

ped_latent_df = latent_df.copy()
for index, row in ped_latent_df.iterrows():
    if row['ModelID'] not in ped_ids:
        ped_latent_df.drop(index, inplace=True)
ped_latent_df_float = ped_latent_df.drop(columns=["ModelID"])
ped_latent_df_float.reset_index(drop=True, inplace=True)

male_latent_df = latent_df.copy()
for index, row in male_latent_df.iterrows():
    if row['ModelID'] not in male_ids:
        male_latent_df.drop(index, inplace=True)
male_latent_df_float = male_latent_df.drop(columns=["ModelID"])
male_latent_df_float.reset_index(drop=True, inplace=True)

female_latent_df = latent_df.copy()
for index, row in female_latent_df.iterrows():
    if row['ModelID'] not in female_ids:
        female_latent_df.drop(index, inplace=True)
female_latent_df_float = female_latent_df.drop(columns=["ModelID"])
female_latent_df_float.reset_index(drop=True, inplace=True)

ped_male_latent_df = latent_df.copy()
for index, row in ped_male_latent_df.iterrows():
    if row['ModelID'] not in ped_male_ids:
        ped_male_latent_df.drop(index, inplace=True)
ped_male_latent_df_float = ped_male_latent_df.drop(columns=["ModelID"])
ped_male_latent_df_float.reset_index(drop=True, inplace=True)

adult_male_latent_df = latent_df.copy()
for index, row in adult_male_latent_df.iterrows():
    if row['ModelID'] not in adult_male_ids:
        adult_male_latent_df.drop(index, inplace=True)
adult_male_latent_df_float = adult_male_latent_df.drop(columns=["ModelID"])
adult_male_latent_df_float.reset_index(drop=True, inplace=True)

ped_female_latent_df = latent_df.copy()
for index, row in ped_female_latent_df.iterrows():
    if row['ModelID'] not in ped_female_ids:
        ped_female_latent_df.drop(index, inplace=True)
ped_female_latent_df_float = ped_female_latent_df.drop(columns=["ModelID"])
ped_female_latent_df_float.reset_index(drop=True, inplace=True)

adult_female_latent_df = latent_df.copy()
for index, row in adult_female_latent_df.iterrows():
    if row['ModelID'] not in adult_female_ids:
        adult_female_latent_df.drop(index, inplace=True)
adult_female_latent_df_float = adult_female_latent_df.drop(columns=["ModelID"])
adult_female_latent_df_float.reset_index(drop=True, inplace=True)

In [5]:
# t tests comparing adult vs ped for each latent dimension

t_test_adult_vs_ped = ttest_ind(adult_latent_df_float, ped_latent_df_float)
t_test_adult_vs_ped = pd.DataFrame(t_test_adult_vs_ped).T
t_test_adult_vs_ped.columns = ["t_stat", "p_value"]
t_test_adult_vs_ped['comparison'] = 'Adult vs Pediatric'
t_test_adult_vs_ped['latent_feature'] = t_test_adult_vs_ped.index + 1
print(t_test_adult_vs_ped.shape)

t_test_adult_vs_ped.head(5)

(49, 4)


Unnamed: 0,t_stat,p_value,comparison,latent_feature
0,1.108148,0.268091,Adult vs Pediatric,1
1,-0.330403,0.741171,Adult vs Pediatric,2
2,1.75693,0.079266,Adult vs Pediatric,3
3,1.339843,0.180631,Adult vs Pediatric,4
4,-0.682239,0.495262,Adult vs Pediatric,5


In [6]:
# t tests comparing male vs female for each latent dimension

t_test_male_vs_female = ttest_ind(male_latent_df_float, female_latent_df_float)
t_test_male_vs_female = pd.DataFrame(t_test_male_vs_female).T
t_test_male_vs_female.columns = ["t_stat", "p_value"]
t_test_male_vs_female['comparison'] = 'Male vs Female'
t_test_male_vs_female['latent_feature'] = t_test_male_vs_female.index + 1
print(t_test_male_vs_female.shape)

t_test_male_vs_female.head(5)

(49, 4)


Unnamed: 0,t_stat,p_value,comparison,latent_feature
0,1.108652,0.267877,Male vs Female,1
1,0.407183,0.683971,Male vs Female,2
2,-0.831059,0.406161,Male vs Female,3
3,0.545271,0.585702,Male vs Female,4
4,-0.030552,0.975633,Male vs Female,5


In [7]:
# t tests comparing adult male vs ped male for each latent dimension

t_test_adult_male_vs_ped_male = ttest_ind(ped_male_latent_df_float, adult_male_latent_df_float)
t_test_adult_male_vs_ped_male = pd.DataFrame(t_test_adult_male_vs_ped_male).T
t_test_adult_male_vs_ped_male.columns = ["t_stat", "p_value"]
t_test_adult_male_vs_ped_male['comparison'] = 'Adult Male vs Pediatric Male'
t_test_adult_male_vs_ped_male['latent_feature'] = t_test_adult_male_vs_ped_male.index + 1
print(t_test_adult_male_vs_ped_male.shape)

t_test_adult_male_vs_ped_male.head(5)

(49, 4)


Unnamed: 0,t_stat,p_value,comparison,latent_feature
0,-1.543336,0.123365,Adult Male vs Pediatric Male,1
1,-0.047483,0.962147,Adult Male vs Pediatric Male,2
2,-1.177276,0.23963,Adult Male vs Pediatric Male,3
3,-0.467525,0.640323,Adult Male vs Pediatric Male,4
4,1.013726,0.31119,Adult Male vs Pediatric Male,5


In [8]:
# t tests comparing adult female vs ped female for each latent dimension

t_test_adult_female_vs_ped_female = ttest_ind(ped_female_latent_df_float, adult_female_latent_df_float)
t_test_adult_female_vs_ped_female = pd.DataFrame(t_test_adult_female_vs_ped_female).T
t_test_adult_female_vs_ped_female.columns = ["t_stat", "p_value"]
t_test_adult_female_vs_ped_female['comparison'] = 'Adult Female vs Pediatric Female'
t_test_adult_female_vs_ped_female['latent_feature'] = t_test_adult_female_vs_ped_female.index + 1
print(t_test_adult_female_vs_ped_female.shape)

t_test_adult_female_vs_ped_female.head(5)

(49, 4)


Unnamed: 0,t_stat,p_value,comparison,latent_feature
0,0.495861,0.620276,Adult Female vs Pediatric Female,1
1,1.071492,0.284621,Adult Female vs Pediatric Female,2
2,-1.168067,0.243504,Adult Female vs Pediatric Female,3
3,-1.554113,0.120981,Adult Female vs Pediatric Female,4
4,-0.237204,0.812625,Adult Female vs Pediatric Female,5


In [9]:
# t tests comparing ped male vs ped female for each latent dimension

t_test_ped_male_vs_ped_female = ttest_ind(ped_female_latent_df_float, ped_male_latent_df_float)
t_test_ped_male_vs_ped_female = pd.DataFrame(t_test_ped_male_vs_ped_female).T
t_test_ped_male_vs_ped_female.columns = ["t_stat", "p_value"]
t_test_ped_male_vs_ped_female['comparison'] = 'Pediatric Male vs Pediatric Female'
t_test_ped_male_vs_ped_female['latent_feature'] = t_test_ped_male_vs_ped_female.index + 1
print(t_test_ped_male_vs_ped_female.shape)

t_test_ped_male_vs_ped_female.head(5)

(49, 4)


Unnamed: 0,t_stat,p_value,comparison,latent_feature
0,0.972899,0.332438,Pediatric Male vs Pediatric Female,1
1,0.678333,0.498785,Pediatric Male vs Pediatric Female,2
2,0.270846,0.786946,Pediatric Male vs Pediatric Female,3
3,-1.071879,0.285791,Pediatric Male vs Pediatric Female,4
4,-0.759248,0.4491,Pediatric Male vs Pediatric Female,5


In [10]:
# t tests comparing adult male vs adult female for each latent dimension

t_test_adult_male_vs_adult_female = ttest_ind(adult_female_latent_df_float, adult_male_latent_df_float)
t_test_adult_male_vs_adult_female = pd.DataFrame(t_test_adult_male_vs_adult_female).T
t_test_adult_male_vs_adult_female.columns = ["t_stat", "p_value"]
t_test_adult_male_vs_adult_female['comparison'] = 'Adult Male vs Adult Female'
t_test_adult_male_vs_adult_female['latent_feature'] = t_test_adult_male_vs_adult_female.index + 1
print(t_test_adult_male_vs_adult_female.shape)

t_test_adult_male_vs_adult_female.head(5)

(49, 4)


Unnamed: 0,t_stat,p_value,comparison,latent_feature
0,-1.527894,0.12695,Adult Male vs Adult Female,1
1,-0.691442,0.489496,Adult Male vs Adult Female,2
2,0.779642,0.435841,Adult Male vs Adult Female,3
3,-0.171361,0.863985,Adult Male vs Adult Female,4
4,0.348414,0.727624,Adult Male vs Adult Female,5


In [11]:
# Combining and saving t test results
t_test_results_df = pd.concat([
    t_test_adult_vs_ped, 
    t_test_male_vs_female, 
    t_test_adult_male_vs_ped_male, 
    t_test_adult_female_vs_ped_female, 
    t_test_ped_male_vs_ped_female, 
    t_test_adult_male_vs_adult_female
]).reset_index(drop=True)
t_test_results_dir = pathlib.Path("./results/t_test_results.tsv")
t_test_results_df.to_parquet(t_test_results_dir, sep="\t")

# sort to show most significant p-values
t_test_results_df.sort_values(by='p_value', ascending = True)

Unnamed: 0,t_stat,p_value,comparison,latent_feature
19,-10.166017,4.545863e-23,Adult vs Pediatric,20
117,7.530158,2.292888e-13,Adult Male vs Pediatric Male,20
166,5.966613,5.506578e-09,Adult Female vs Pediatric Female,20
20,5.728777,1.374337e-08,Adult vs Pediatric,21
46,4.905672,1.102733e-06,Adult vs Pediatric,47
...,...,...,...,...
227,0.025412,9.797658e-01,Pediatric Male vs Pediatric Female,32
5,-0.020261,9.838399e-01,Adult vs Pediatric,6
256,0.020238,9.838588e-01,Adult Male vs Adult Female,12
24,0.017214,9.862699e-01,Adult vs Pediatric,25


In [12]:
# ANOVA Testing
f_statistic, p_value = f_oneway(adult_male_latent_df_float, ped_male_latent_df_float, adult_female_latent_df_float, ped_female_latent_df_float)
anova_df = pd.DataFrame({'f_stat': f_statistic.tolist(), 'p_value': p_value.tolist()})
anova_df['latent_feature'] = anova_df.index + 1
anova_df


Unnamed: 0,f_stat,p_value,latent_feature
0,1.300784,0.2729084,1
1,0.448869,0.7181584,2
2,1.137888,0.3327461,3
3,0.993349,0.395221,4
4,0.358296,0.7831501,5
5,0.444308,0.7213889,6
6,2.977962,0.03072441,7
7,2.445115,0.06265755,8
8,4.896127,0.002215578,9
9,4.147913,0.006231379,10
