In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
random.seed(18)
print(random.random())

0.18126486333322134


In [3]:
# load all of the data
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [4]:
# verifying that the ModelIDs in model_df and effect_df are alligned
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1145    True
1146    True
1147    True
1148    True
1149    True
Name: ID_allignment_verify, Length: 1150, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [5]:
# assign 'AgeCategory' and 'Sex' columns to the effect dataframe as a single column
presplit_effect_df = effect_df.assign(
    age_and_sex=model_df.AgeCategory.astype(str) + "_" + model_df.Sex.astype(str)
)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000189,0.040863,0.071758,-0.047450,0.182228,0.189317,0.059994,-0.015742,-0.055965,-0.021898,...,-0.653405,0.381142,0.058374,-0.188926,-0.131952,-0.443036,-0.095665,-0.047367,-0.209471,Unknown_Male
1,ACH-000704,-0.139223,0.091242,-0.029407,0.468500,-0.350253,-0.070558,0.242550,0.104058,-0.153588,...,-0.484513,-0.034321,0.057002,-0.448039,-0.207007,-0.184679,0.135320,-0.223829,-0.242149,Adult_Female
2,ACH-000588,-0.066143,-0.145404,0.133368,0.042014,-0.142049,-0.043838,-0.096894,-1.067020,0.014830,...,-0.146197,0.140023,0.144580,-0.054174,-0.065591,0.159398,-0.116524,-0.056546,-0.152592,Adult_Male
3,ACH-001513,-0.033326,-0.215079,0.027560,0.070146,-0.390449,0.020722,0.054319,0.032775,0.072222,...,-0.286548,-0.020320,0.207239,-0.064271,-0.098768,-0.081727,-0.026597,-0.204215,-0.330286,Unknown_Female
4,ACH-000288,-0.126616,-0.095345,0.148311,0.092488,-0.226140,0.009658,0.071833,-0.271225,-0.043396,...,-0.091948,0.137504,0.246098,-0.194281,0.052051,-0.025941,-0.472545,-0.244110,-0.392614,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-000953,-0.094178,-0.047219,-0.054693,0.082400,-0.003112,-0.061223,-0.017879,-0.287321,0.108760,...,-0.401136,-0.037833,0.011425,-0.156019,-0.114237,-0.119522,-0.142186,-0.165600,0.061860,Pediatric_Male
1146,ACH-001402,-0.219824,-0.056013,-0.033793,0.170722,-0.048435,-0.198198,0.067420,0.072234,-0.096077,...,-0.067355,0.134976,0.190439,-0.088172,0.081297,-0.013322,-0.040583,-0.171905,-0.184679,Adult_Female
1147,ACH-001366,-0.102851,-0.200555,0.067527,0.024745,-0.058113,-0.084786,-0.000684,-0.478434,-0.095380,...,-0.235367,0.058601,-0.118827,0.027934,0.131925,-0.293123,-0.000691,-0.219405,-0.126215,Pediatric_Male
1148,ACH-000599,-0.077665,-0.048738,0.164596,-0.028439,-0.107566,0.099942,0.136515,-0.176653,0.028969,...,0.201926,-0.051700,0.017344,-0.002998,0.051127,-0.094165,0.113024,-0.193831,-0.403005,Adult_Female


In [6]:
groups = model_df.groupby("AgeCategory")
df_list = []
for name, df in groups:

    # only looking for samples that contain Adult or Pediatric information
    if name == "Adult" or name == "Pediatric":
        df_list.append(df)

# merge sample dataframes through concatentation and reorganize so that ModelIDs are in alphabetical order
new_df = pd.concat(df_list, axis=0)
new_df = new_df.set_index("ModelID")
new_df = new_df.sort_index(ascending=True)
new_df = new_df.reset_index()

In [7]:
# creating a list of ModelIDs that correlate to pediatric and adult samples
PA_effect_IDs = new_df["ModelID"].tolist()

PA_IDs = set(PA_effect_IDs) & set(presplit_effect_df["ModelID"].tolist())

# creating a new gene effect data frame containing correlating ModelIDs to the filtered sample info IDs
PA_effect_df = presplit_effect_df.loc[
    presplit_effect_df["ModelID"].isin(PA_IDs)
].reset_index(drop=True)

In [8]:
# split the data based on age category and sex
train_df, tv_df = train_test_split(
    PA_effect_df, test_size=0.3, stratify=PA_effect_df.age_and_sex
)
train_df.reset_index(drop=True,inplace=True)
tv_df.reset_index(drop=True,inplace=True)
test_df, val_df = train_test_split(
    PA_effect_df, test_size=0.5, stratify=PA_effect_df.age_and_sex
)
test_df.reset_index(drop=True,inplace=True)
val_df.reset_index(drop=True,inplace=True)

In [9]:
# preparing train dataframe to be scaled
col_num = train_df.shape[1]
train_scaled_df = train_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
train_scaled_df = scaler.fit_transform(train_scaled_df)

# adding id column and age and sex column back
train_scaled_df = pd.DataFrame(train_scaled_df)
train_scaled_df.insert(0, train_df.columns[0], train_df[train_df.columns[0]])
train_scaled_df.insert(col_num-1, train_df.columns[col_num-1], train_df[train_df.columns[col_num-1]])
train_scaled_df.columns = train_df.columns
train_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000219,0.331316,0.766773,0.563285,0.645413,0.571949,0.532450,0.539986,0.683005,0.531730,...,0.392991,0.390743,0.591428,0.497727,0.351756,0.629411,0.414447,0.492106,0.568021,Adult_Female
1,ACH-002500,0.381849,0.614516,0.535611,0.675017,0.380622,0.551060,0.534669,0.263396,0.566985,...,0.547671,0.393210,0.657969,0.037354,0.397975,0.640427,0.341559,0.723469,0.673008,Adult_Male
2,ACH-000975,0.487362,0.643343,0.562191,0.537953,0.638435,0.540325,0.552319,0.635579,0.622134,...,0.582829,0.424076,0.488668,0.371161,0.305035,0.429234,0.522071,0.309762,0.568842,Adult_Male
3,ACH-001548,0.505928,0.561787,0.438622,0.761560,0.459322,0.523826,0.544929,0.696024,0.726669,...,0.614458,0.625569,0.588409,0.499031,0.289634,0.580242,0.650436,0.486286,0.359370,Pediatric_Female
4,ACH-000630,0.360392,0.684942,0.475775,0.520644,0.614742,0.527191,0.503224,0.595370,0.599653,...,0.453379,0.449327,0.584593,0.396885,0.171289,0.580497,0.524527,0.564690,0.600832,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,ACH-000520,0.243759,0.641931,0.578366,0.627724,0.657436,0.511192,0.763344,0.674203,0.614261,...,0.460616,0.285227,0.258553,0.322151,0.308799,0.326823,0.440991,0.515628,0.671717,Adult_Female
666,ACH-000488,0.529010,0.629859,0.513111,0.674501,0.522685,0.471464,0.574962,0.733710,0.630862,...,0.377931,0.553109,0.603710,0.577333,0.398894,0.549812,0.674605,0.561501,0.627674,Adult_Male
667,ACH-001959,0.400092,0.725103,0.396122,0.486299,0.687073,0.531392,0.583996,0.655997,0.550200,...,0.650794,0.506777,0.626002,0.376535,0.403750,0.518704,0.404468,0.492593,0.458063,Adult_Male
668,ACH-002799,0.393761,0.703553,0.477158,0.482834,0.543616,0.558704,0.399430,0.759824,0.538266,...,0.514439,0.417532,0.616848,0.519332,0.250877,0.550030,0.495995,0.703998,0.705808,Adult_Female


In [10]:
# preparing test dataframe to be scaled
col_num = test_df.shape[1]
test_scaled_df = test_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
test_scaled_df = scaler.fit_transform(test_scaled_df)

# adding id column and age and sex column back
test_scaled_df = pd.DataFrame(test_scaled_df)
test_scaled_df.insert(0, test_df.columns[0], test_df[test_df.columns[0]])
test_scaled_df.insert(col_num-1, test_df.columns[col_num-1], test_df[test_df.columns[col_num-1]])
test_scaled_df.columns = test_df.columns
test_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002510,0.429391,0.618771,0.497248,0.508374,0.451534,0.569466,0.411559,0.723150,0.450065,...,0.505849,0.508846,0.415492,0.388982,0.089407,0.646033,0.305436,0.403096,0.523743,Adult_Male
1,ACH-001653,0.614961,0.593533,0.596204,0.758134,0.581860,0.477063,0.583968,0.523608,0.735482,...,0.309279,0.622765,0.811493,0.506752,0.294936,0.608772,0.682806,0.529457,0.301022,Adult_Male
2,ACH-001375,0.551191,0.625370,0.605999,0.635287,0.283753,0.435995,0.718984,0.636883,0.583373,...,0.456697,0.669915,0.747114,0.270310,0.234769,0.824424,0.596879,0.280046,0.439490,Adult_Male
3,ACH-002052,0.683171,0.572530,0.501612,0.697240,0.417842,0.376692,0.422725,0.647382,0.725645,...,0.753829,0.552536,0.702679,0.283466,0.493598,0.575460,0.495393,0.543772,0.386024,Adult_Male
4,ACH-000804,0.396965,0.577281,0.344654,0.637995,0.732549,0.251418,0.659607,0.360457,0.706558,...,0.489945,0.830440,0.601878,0.470044,0.203354,0.736939,0.442620,0.576673,0.605088,Pediatric_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,ACH-000507,0.562650,0.590651,0.555307,0.712581,0.520470,0.257638,0.678831,0.829229,0.767583,...,0.455708,0.513190,0.642376,0.315772,0.204392,0.603235,0.720058,0.706159,0.788391,Adult_Male
475,ACH-000766,0.627815,0.560810,0.480453,0.638320,0.631385,0.366265,0.607394,0.897218,0.670430,...,0.719127,0.517364,0.673869,0.467957,0.231986,0.531791,0.588884,0.332884,0.527124,Adult_Male
476,ACH-001852,0.505396,0.587538,0.501078,0.681393,0.497761,0.489856,0.552961,0.765788,0.645788,...,0.356101,0.705770,0.619083,0.360558,0.252720,0.647367,0.319373,0.398034,0.465938,Adult_Male
477,ACH-000592,0.645956,0.600689,0.469887,0.523815,0.475948,0.569761,0.629132,0.247892,0.676400,...,0.655774,0.502493,0.515963,0.456465,0.184060,0.675006,0.491921,0.355511,0.497521,Adult_Female


In [11]:
# preparing validation dataframe to be scaled
col_num = val_df.shape[1]
val_scaled_df = val_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
val_scaled_df = scaler.fit_transform(val_scaled_df)

# adding id column and age and sex column back
val_scaled_df = pd.DataFrame(val_scaled_df)
val_scaled_df.insert(0, val_df.columns[0], val_df[val_df.columns[0]])
val_scaled_df.insert(col_num-1, val_df.columns[col_num-1], val_df[val_df.columns[col_num-1]])
val_scaled_df.columns = val_df.columns
val_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002297,0.690291,0.147884,0.643761,0.490292,0.540273,0.443104,0.366217,0.326839,0.813416,...,0.534929,0.459141,0.569664,0.470152,0.763811,0.527648,0.544833,0.473590,0.668449,Adult_Male
1,ACH-002027,0.453523,0.630532,0.727168,0.717604,0.536081,0.484243,0.478523,0.665665,0.555410,...,0.288186,0.431026,0.691112,0.607137,0.645036,0.533708,0.506059,0.384365,0.424560,Adult_Female
2,ACH-000416,0.565505,0.477158,0.558747,0.523781,0.572815,0.558304,0.521380,0.707157,0.660695,...,0.403758,0.564696,0.538362,0.545940,0.636446,0.629346,0.458898,0.582500,0.524809,Adult_Male
3,ACH-001550,0.491275,0.513714,0.789908,0.340405,0.341726,0.652559,0.657108,0.366193,0.569705,...,0.525168,0.438890,0.737786,0.562631,0.534645,0.606257,0.516596,0.730422,0.617379,Adult_Female
4,ACH-000749,0.430361,0.270662,0.662942,0.596266,0.734584,0.645569,0.621504,0.558700,0.716052,...,0.518948,0.796427,0.589368,0.773296,0.157763,0.526280,0.135728,0.771495,0.560734,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,ACH-000296,0.626803,0.462116,0.167044,0.506094,0.625687,0.446367,0.642167,0.559854,0.000000,...,0.435922,0.716915,0.542557,0.607666,0.292966,0.252491,0.527677,0.615528,0.690755,Adult_Male
475,ACH-000077,0.291388,0.453346,0.413555,0.488630,0.697625,0.667107,0.086290,0.496703,0.542468,...,0.507364,0.779356,0.591742,0.425229,0.789673,0.649029,0.536399,0.609884,0.643047,Adult_Male
476,ACH-001702,0.490055,0.492987,0.492619,0.473501,0.568056,0.606935,0.369562,0.566841,0.629133,...,0.643507,0.415048,0.451766,0.497233,0.716705,0.627266,0.541300,0.412978,0.549731,Adult_Male
477,ACH-000137,0.556117,0.389062,0.486545,0.572162,0.569903,0.572108,0.504769,0.547935,0.781060,...,0.336432,0.546274,0.532036,0.457451,0.743553,0.600972,0.625300,0.678575,0.507052,Adult_Female


In [12]:
# save the TESTING dataframe
test_df = test_df.reset_index(drop=True)
testing_df_output = pathlib.Path("../0.data-download/data/VAE_test_df.csv")
test_df.to_csv(testing_df_output, index=False)
print(test_df.shape)
test_df.head(3)

(479, 17109)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002510,-0.171075,-0.064572,0.015456,-0.025326,-0.178867,-0.015712,-0.145463,0.055289,-0.226674,...,-0.467587,-0.082958,-0.14499,-0.121399,-0.232473,-0.104405,-0.230893,-0.260563,-0.278387,Adult_Male
1,ACH-001653,0.025509,-0.095235,0.095557,0.137333,-0.048077,-0.093858,0.004841,-0.172107,0.043836,...,-0.88345,0.020944,0.268797,-0.019273,-0.043766,-0.144874,0.106395,-0.147093,-0.562209,Adult_Male
2,ACH-001375,-0.042046,-0.056553,0.103485,0.057328,-0.347244,-0.128589,0.122546,-0.04302,-0.100329,...,-0.571572,0.063948,0.201527,-0.224307,-0.099009,0.089342,0.029594,-0.371059,-0.385754,Adult_Male


In [13]:
# save the TRAINING dataframe
train_df = train_df.reset_index(drop=True)
training_df_output = pathlib.Path("../0.data-download/data/VAE_train_df.csv")
train_df.to_csv(training_df_output, index=False)
print(train_df.shape)
train_df.head(3)

(670, 17109)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000219,-0.146514,0.043773,0.06891,0.136053,-0.105771,-0.122924,-0.016658,-0.084992,-0.106335,...,-0.700276,-0.06937,0.038849,-0.026109,0.008404,-0.071498,-0.128821,-0.180633,-0.281503,Adult_Female
1,ACH-002500,-0.102689,-0.127026,0.04651,0.156969,-0.31912,-0.104164,-0.021458,-0.688309,-0.070074,...,-0.370644,-0.067448,0.108378,-0.42442,0.050841,-0.059745,-0.193223,0.027126,-0.133243,Adult_Male
2,ACH-000975,-0.011182,-0.094688,0.068024,0.060131,-0.031633,-0.114986,-0.005521,-0.153181,-0.013352,...,-0.295719,-0.043401,-0.068527,-0.135613,-0.034493,-0.285065,-0.033728,-0.344375,-0.280343,Adult_Male


In [14]:
# save the VALIDATION dataframe
val_df = val_df.reset_index(drop=True)
val_df_output = pathlib.Path("../0.data-download/data/VAE_val_df.csv")
val_df.to_csv(val_df_output, index=False)
print(val_df.shape)
val_df.head(3)

(479, 17109)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002297,0.044859,-0.407669,0.105487,0.087772,-0.201549,-0.223252,-0.079988,-0.533137,0.138735,...,-0.323001,-0.016083,0.075687,-0.096047,0.12288,-0.115786,-0.013616,-0.199033,-0.104694,Adult_Male
1,ACH-002027,-0.115223,0.06112,0.155195,0.257564,-0.205755,-0.182735,0.00484,0.020326,-0.188372,...,-0.809141,-0.037987,0.185418,0.022354,0.04198,-0.108583,-0.047875,-0.27481,-0.461872,Adult_Female
2,ACH-000416,-0.03951,-0.087849,0.054821,0.112786,-0.168903,-0.109792,0.037211,0.088103,-0.054889,...,-0.581438,0.066152,0.047405,-0.030541,0.036129,0.005103,-0.089545,-0.106538,-0.315057,Adult_Male
