In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
# Seed the global random number generators up front.
# scikit-learn falls back to NumPy's global RNG whenever a call is not given an
# explicit random_state, so NumPy is seeded alongside Python's `random` module.
random.seed(18)
np.random.seed(18)
# Print one draw as a quick sanity check that the seed took effect
print(random.random())

0.18126486333322134


In [3]:
# load all of the data: the directory path is handed to the project's
# load_data helper, and its two return values are unpacked into the sample
# metadata table (model_df) and the gene dependency table (dependency_df)
data_directory = "../0.data-download/data/"
loaded_tables = load_data(data_directory, adult_or_pediatric="all")
model_df, dependency_df = loaded_tables

In [4]:
# Verify that model_df and dependency_df list the same ModelIDs in the same row
# order. A later cell copies AgeCategory/Sex from model_df onto dependency_df,
# so a mismatch here would attach the wrong metadata to a sample.
# Note: `==` between two Series raises ValueError when their indexes are not
# identically labeled, which already guards against differing lengths.
model_df["ID_allignment_verify"] = np.where(
    dependency_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# A single unique value is also what an all-"False" column would produce, so
# check the content explicitly and stop the notebook if any row is misaligned.
misaligned_rows = int((model_df["ID_allignment_verify"] != "True").sum())
assert misaligned_rows == 0, (
    f"{misaligned_rows} rows have different ModelIDs in model_df and dependency_df"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1090    True
1091    True
1092    True
1093    True
1094    True
Name: ID_allignment_verify, Length: 1095, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [5]:
# Build one combined "<AgeCategory>_<Sex>" label per sample from model_df and
# attach it to a copy of the dependency dataframe as the `age_and_sex` column
age_labels = model_df["AgeCategory"].astype(str)
sex_labels = model_df["Sex"].astype(str)
presplit_dependency_df = dependency_df.assign(
    age_and_sex=age_labels + "_" + sex_labels
)
presplit_dependency_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000585,0.013448,0.027507,0.013616,0.011703,0.083605,0.031614,0.034987,0.188184,0.045492,...,0.383911,0.013649,0.008057,0.141308,0.063247,0.303770,0.058027,0.106323,0.243198,Adult_Male
1,ACH-001367,0.050310,0.023929,0.058586,0.009547,0.077919,0.118300,0.001689,0.225383,0.039909,...,0.343845,0.005532,0.007036,0.068239,0.017691,0.156433,0.029322,0.018091,0.061729,Pediatric_Female
2,ACH-000277,0.031423,0.044504,0.108249,0.012896,0.306100,0.022002,0.011045,0.030541,0.010792,...,0.209547,0.009343,0.053262,0.038517,0.027383,0.161739,0.015542,0.310834,0.981133,Adult_Female
3,ACH-000036,0.062550,0.037485,0.019980,0.002631,0.105692,0.023648,0.002573,0.583354,0.060502,...,0.744510,0.054228,0.021329,0.048110,0.035060,0.044812,0.030461,0.082698,0.142263,Adult_Male
4,ACH-000504,0.048973,0.072150,0.017993,0.014182,0.032479,0.030063,0.007680,0.014123,0.065259,...,0.147402,0.052102,0.039286,0.022687,0.060170,0.056828,0.144134,0.138456,0.217908,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,ACH-000817,0.098257,0.039604,0.008577,0.004886,0.011891,0.019286,0.010640,0.181150,0.005494,...,0.826429,0.003754,0.005503,0.006332,0.039717,0.056421,0.005764,0.013003,0.147311,Adult_Male
1091,ACH-000884,0.030959,0.050191,0.022724,0.002884,0.014746,0.102112,0.008458,0.117924,0.010606,...,0.969871,0.005524,0.026006,0.013820,0.004143,0.030470,0.058204,0.017159,0.299665,Adult_Unknown
1092,ACH-001648,0.024832,0.013953,0.006293,0.002758,0.144811,0.065932,0.009982,0.066401,0.055291,...,0.833605,0.028620,0.001858,0.035467,0.027458,0.033650,0.062256,0.011235,0.467222,Unknown_Male
1093,ACH-002024,0.034834,0.073666,0.015884,0.012089,0.320159,0.010910,0.012909,0.046827,0.018749,...,0.511510,0.071961,0.035139,0.041807,0.038234,0.143786,0.096655,0.089709,0.342843,Adult_Male


In [6]:
# Keep only the samples annotated as Adult or Pediatric; rows with any other
# AgeCategory (or a missing one) are dropped. A boolean filter is used instead
# of a groupby loop + concat so an input without matching rows yields an empty
# frame rather than a "No objects to concatenate" error.
age_categories_to_keep = ["Adult", "Pediatric"]
new_df = (
    model_df.loc[model_df["AgeCategory"].isin(age_categories_to_keep)]
    # moving ModelID into the index and back sorts the rows alphabetically by
    # ModelID and leaves ModelID as the first column
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [7]:
# ModelIDs of the pediatric and adult samples kept in new_df
PA_dependency_IDs = new_df["ModelID"].to_list()

# IDs present both in that list and in the dependency dataframe
PA_IDs = set(presplit_dependency_df["ModelID"].to_list()).intersection(
    PA_dependency_IDs
)

# restrict the gene dependency dataframe to those IDs and renumber the rows from 0
is_pediatric_or_adult = presplit_dependency_df["ModelID"].isin(PA_IDs)
PA_dependency_df = presplit_dependency_df[is_pediatric_or_adult].reset_index(
    drop=True
)

In [8]:
# split the data based on age category and sex (15% held out for testing,
# stratified on the combined age_and_sex label)
# random_state pins the shuffle so the saved train/test files are identical on
# every run; seeding Python's `random` module does not affect scikit-learn
train_df, test_df = train_test_split(
    PA_dependency_df,
    test_size=0.15,
    stratify=PA_dependency_df.age_and_sex,
    random_state=18,
)

In [9]:
# Write the TESTING split to CSV (row index renumbered from 0 and not written
# to the file), then show its shape and first three rows
testing_df_output = pathlib.Path("../0.data-download/data/VAE_test_df.csv")
test_df = test_df.reset_index(drop=True)
test_df.to_csv(path_or_buf=testing_df_output, index=False)
print(test_df.shape)
test_df.head(n=3)

(137, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000685,0.031966,0.033223,0.088601,0.000508,0.02965,0.014328,0.02998,0.176074,0.024215,...,0.413714,0.011722,0.002489,0.070755,0.06734,0.083171,0.017293,0.226098,0.608009,Adult_Female
1,ACH-000656,0.04014,0.003204,0.022192,0.03432,0.135123,0.230762,0.007966,0.341448,0.059626,...,0.019812,0.019469,0.011236,0.084156,0.054804,0.018443,0.007783,0.089235,0.267848,Adult_Male
2,ACH-000159,0.053875,0.075826,0.013374,0.02329,0.025995,0.027518,0.008083,0.155916,0.025576,...,0.934046,0.018941,0.03156,0.088209,0.115445,0.152708,0.020325,0.072903,0.291191,Adult_Male


In [10]:
# Write the TRAINING split to CSV (row index renumbered from 0 and not written
# to the file), then show its shape and first three rows
training_df_output = pathlib.Path("../0.data-download/data/VAE_train_df.csv")
train_df = train_df.reset_index(drop=True)
train_df.to_csv(path_or_buf=training_df_output, index=False)
print(train_df.shape)
train_df.head(n=3)

(775, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002016,0.025749,0.026939,0.003954,0.00834,0.050778,0.054314,0.009736,0.078431,0.089257,...,0.538992,0.07177,0.01117,0.02649,0.019766,0.420239,0.039556,0.062614,0.605033,Adult_Female
1,ACH-002459,0.111789,0.016396,0.002874,0.001118,0.120582,0.035094,0.008382,0.244338,0.022202,...,0.522505,0.058566,0.007411,0.035491,0.036429,0.125641,0.020571,0.18079,0.215714,Adult_Male
2,ACH-002084,0.034169,0.00944,0.014166,0.00323,0.084395,0.111543,0.00837,0.728247,0.150858,...,0.64542,0.013886,0.001169,0.070238,0.039434,0.116044,0.076024,0.087134,0.048226,Adult_Female
