In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
random.seed(18)
print(random.random())

0.18126486333322134


In [3]:
# load all of the data
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [4]:
# verifying that the ModelIDs in model_df and effect_df are alligned
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1090    True
1091    True
1092    True
1093    True
1094    True
Name: ID_allignment_verify, Length: 1095, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [5]:
# assign 'AgeCategory' and 'Sex' columns to the effect dataframe as a single column
presplit_effect_df = effect_df.assign(
    age_and_sex=model_df.AgeCategory.astype(str) + "_" + model_df.Sex.astype(str)
)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000243,0.062093,-0.122680,0.022447,0.062609,-0.120638,-0.031200,0.051486,-0.215581,0.146592,...,-1.415975,0.060913,-0.014226,0.020623,-0.056737,-0.287151,0.014139,-0.053860,-0.520472,Adult_Female
1,ACH-000488,0.034365,-0.075193,0.082607,0.207276,-0.143493,-0.198507,0.076041,-0.217543,0.034932,...,-0.958616,0.038262,0.071764,0.060094,0.071433,-0.199927,0.057789,-0.029485,-0.299387,Adult_Male
2,ACH-000406,-0.047100,-0.050624,0.026235,0.148909,-0.178170,-0.132196,0.048005,-0.785721,-0.001919,...,-0.524403,-0.024767,-0.028895,-0.113299,-0.001215,-0.184424,-0.045890,-0.152570,-0.352044,Adult_Male
3,ACH-001329,-0.000029,-0.025736,0.018315,0.150399,-0.174877,-0.063467,0.102451,-0.642390,0.006129,...,-1.094399,-0.012171,-0.120752,0.037483,-0.159076,-0.183410,-0.060646,-0.025912,-0.460575,Adult_Male
4,ACH-000213,-0.103370,-0.181475,0.061701,0.153299,-0.124094,-0.041442,-0.060516,-0.166065,-0.141599,...,-0.609426,0.075557,0.201109,0.055394,0.279735,-0.323746,0.042320,0.026596,-0.669347,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,ACH-002143,-0.002627,-0.058912,0.045844,-0.098134,-0.120169,0.014008,0.167357,-0.144859,0.124595,...,-0.407631,-0.112569,0.269199,-0.090563,0.202411,-0.473727,-0.150585,-0.094197,-0.101860,Adult_Male
1091,ACH-000638,0.028292,-0.037660,0.020515,0.129438,-0.051519,-0.034863,0.177753,-0.321326,0.204855,...,-0.564037,-0.096969,-0.022444,0.156490,0.160487,-0.162975,0.009049,0.041649,-0.245419,Adult_Male
1092,ACH-001991,-0.112859,-0.085770,0.092360,0.240241,0.039671,0.091037,-0.006593,-0.088666,0.061333,...,-0.380271,-0.124708,-0.035420,0.104014,0.093573,-0.290950,-0.314785,-0.180759,-0.102392,Adult_Female
1093,ACH-000330,0.005731,-0.250152,0.145553,0.124974,-0.094794,0.119367,0.180975,-0.850589,-0.011627,...,-0.699033,0.155148,0.161595,-0.179940,-0.119523,-0.214386,-0.300549,-0.219147,-0.390224,Adult_Female


In [6]:
groups = model_df.groupby("AgeCategory")
df_list = []
for name, df in groups:

    # only looking for samples that contain Adult or Pediatric information
    if name == "Adult" or name == "Pediatric":
        df_list.append(df)

# merge sample dataframes through concatentation and reorganize so that ModelIDs are in alphabetical order
new_df = pd.concat(df_list, axis=0)
new_df = new_df.set_index("ModelID")
new_df = new_df.sort_index(ascending=True)
new_df = new_df.reset_index()

In [7]:
# creating a list of ModelIDs that correlate to pediatric and adult samples
PA_effect_IDs = new_df["ModelID"].tolist()

PA_IDs = set(PA_effect_IDs) & set(presplit_effect_df["ModelID"].tolist())

# creating a new gene effect data frame containing correlating ModelIDs to the filtered sample info IDs
PA_effect_df = presplit_effect_df.loc[
    presplit_effect_df["ModelID"].isin(PA_IDs)
].reset_index(drop=True)

In [8]:
# split the data based on age category and sex
train_df, test_df = train_test_split(
    PA_effect_df, test_size=0.15, stratify=PA_effect_df.age_and_sex
)
train_df.reset_index(drop=True,inplace=True)
test_df.reset_index(drop=True,inplace=True)

In [9]:
# preparing train dataframe to be scaled
col_num = train_df.shape[1]
train_scaled_df = train_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
train_scaled_df = scaler.fit_transform(train_scaled_df)

# adding id column and age and sex column back
train_scaled_df = pd.DataFrame(train_scaled_df)
train_scaled_df.insert(0, train_df.columns[0], train_df[train_df.columns[0]])
train_scaled_df.insert(col_num-1, train_df.columns[col_num-1], train_df[train_df.columns[col_num-1]])
train_scaled_df.columns = train_df.columns
train_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001820,0.629138,0.748388,0.610856,0.633592,0.724294,0.611027,0.752395,0.729050,0.608912,...,0.800248,0.278997,0.604541,0.589066,0.471774,0.775926,0.770280,0.520910,0.000000,Adult_Female
1,ACH-000632,0.507548,0.588598,0.456349,0.302933,0.517958,0.722199,0.504000,0.630415,0.673835,...,0.358546,0.565163,0.725677,0.325397,0.469110,0.753382,0.465649,0.714535,0.570080,Adult_Male
2,ACH-000403,0.504747,0.632802,0.581050,0.328703,0.674954,0.680468,0.630995,0.571472,0.633701,...,0.515912,0.543477,0.651013,0.282764,0.270957,0.693680,0.484792,0.671991,0.533998,Adult_Male
3,ACH-002233,0.458047,0.415609,0.312898,0.450757,0.715136,0.512631,0.595881,0.874014,0.695336,...,0.303846,0.755938,0.774648,0.558572,0.550129,0.525479,0.624878,0.474018,0.941588,Adult_Female
4,ACH-001852,0.467436,0.617756,0.545345,0.493476,0.608837,0.555865,0.506380,0.737768,0.665259,...,0.329480,0.632318,0.633607,0.403175,0.303803,0.663786,0.313425,0.523917,0.495668,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,ACH-000721,0.411177,0.581148,0.526470,0.407556,0.687591,0.572474,0.460709,0.638156,0.650599,...,0.462793,0.521498,0.616275,0.483160,0.281492,0.614673,0.545992,0.628903,0.552468,Adult_Female
771,ACH-001028,0.481801,0.594359,0.513517,0.446131,0.614691,0.517051,0.484556,0.725633,0.664357,...,0.477581,0.483738,0.518278,0.420561,0.309565,0.641677,0.565646,0.532515,0.422864,Pediatric_Female
772,ACH-000469,0.377135,0.571722,0.518828,0.615808,0.590839,0.365679,0.392063,0.534344,0.635847,...,0.395479,0.635017,0.723620,0.507454,0.320783,0.698054,0.542422,0.567782,0.424373,Adult_Male
773,ACH-000368,0.442788,0.493795,0.418128,0.530113,0.645208,0.713027,0.424385,0.721183,0.597296,...,0.513291,0.659949,0.679053,0.445447,0.333390,0.701017,0.425925,0.721056,0.530169,Adult_Male


In [10]:
# preparing test dataframe to be scaled
col_num = test_df.shape[1]
test_scaled_df = test_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
test_scaled_df = scaler.fit_transform(test_scaled_df)

# adding id column and age and sex column back
test_scaled_df = pd.DataFrame(test_scaled_df)
test_scaled_df.insert(0, test_df.columns[0], test_df[test_df.columns[0]])
test_scaled_df.insert(col_num-1, test_df.columns[col_num-1], test_df[test_df.columns[col_num-1]])
test_scaled_df.columns = test_df.columns
test_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000348,0.482714,0.640701,0.402414,0.598216,0.310665,0.403122,0.438282,0.645628,0.446424,...,0.588241,0.513876,0.266302,0.701826,0.339859,0.483437,0.511397,0.783653,0.652811,Adult_Female
1,ACH-001041,0.278663,0.700981,0.472915,0.670834,0.416127,0.477670,0.369311,0.534900,0.462763,...,0.412303,0.640170,0.630917,0.319564,0.266430,0.649776,0.560600,0.475839,0.677836,Adult_Female
2,ACH-002250,0.615111,0.614915,0.521417,0.732223,0.241653,0.589950,0.586772,0.519618,0.477772,...,0.406834,0.556543,0.687553,0.612463,0.500575,0.546082,0.606629,0.535493,0.699510,Adult_Male
3,ACH-002023,0.566400,0.621348,0.546900,0.583718,0.518087,0.470312,0.337324,0.493100,0.622767,...,0.618222,0.565410,0.634387,0.343665,0.299597,0.490982,0.628615,0.774380,0.366287,Adult_Female
4,ACH-000090,0.786461,0.445636,0.235732,0.249387,0.461363,0.700722,0.425250,1.000000,0.897726,...,0.656848,0.000000,0.054186,0.497661,1.000000,0.364889,0.410829,0.620475,0.253029,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,ACH-000900,0.569230,0.612294,0.489961,0.490399,0.670680,0.482467,0.385963,0.500708,0.338141,...,0.659975,0.466603,0.696718,0.487614,0.519515,0.363814,0.542506,0.506509,0.605951,Adult_Male
133,ACH-001129,0.966300,0.625705,0.398652,0.758634,0.402221,0.596715,0.534787,0.838149,0.684335,...,0.686629,0.741954,0.379209,0.623928,0.916165,0.928807,0.944274,0.480154,0.427394,Adult_Male
134,ACH-000665,0.467005,0.584403,0.798730,0.609295,0.507432,0.600171,0.579004,0.382871,0.703850,...,0.516739,0.580224,0.522528,0.428440,0.395920,0.321888,0.771623,0.705629,0.353709,Adult_Male
135,ACH-000405,0.521126,0.672400,0.676470,0.378247,0.249621,0.445368,0.750415,0.538228,0.552450,...,0.493990,0.779536,0.740287,0.431162,0.449916,0.000000,0.537322,0.842464,0.607707,Adult_Male


In [11]:
# save the TESTING dataframe
test_df = test_df.reset_index(drop=True)
testing_df_output = pathlib.Path("../0.data-download/data/VAE_test_df.csv")
test_df.to_csv(testing_df_output, index=False)
print(test_df.shape)
test_df.head(3)

(137, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000348,-0.083439,-0.039894,0.010377,0.113985,-0.28541,-0.079495,0.036602,-0.063939,-0.034673,...,-0.545069,0.013269,-0.196753,0.114824,-0.07706,-0.137285,-0.147227,-0.035045,-0.197649,Adult_Female
1,ACH-001041,-0.203043,0.01156,0.056424,0.158041,-0.196482,-0.028753,0.001179,-0.226252,-0.024476,...,-0.879321,0.078645,0.115316,-0.208417,-0.108645,0.027639,-0.109669,-0.273492,-0.171996,Adult_Female
2,ACH-002250,-0.005836,-0.061905,0.088103,0.195285,-0.343603,0.04767,0.112864,-0.248653,-0.015111,...,-0.88971,0.035355,0.163789,0.039258,-0.007928,-0.075173,-0.074534,-0.227281,-0.149779,Adult_Male


In [12]:
# save the TRAINING dataframe
train_df = train_df.reset_index(drop=True)
training_df_output = pathlib.Path("../0.data-download/data/VAE_train_df.csv")
train_df.to_csv(training_df_output, index=False)
print(train_df.shape)
train_df.head(3)

(775, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001820,0.08217,0.112343,0.131823,0.239078,-0.022423,-0.041173,0.238446,-0.110234,-0.076089,...,0.010379,-0.281745,0.06022,0.037956,0.060752,0.005863,0.193068,-0.174241,-1.124031,Adult_Female
1,ACH-000632,-0.041168,-0.080259,0.012152,-0.019636,-0.217908,0.067214,0.026924,-0.251008,0.004988,...,-0.894837,0.029583,0.185571,-0.183375,0.058144,-0.018092,-0.077593,0.023973,-0.29295,Adult_Male
2,ACH-000403,-0.04401,-0.026977,0.108737,0.000527,-0.069168,0.026528,0.135067,-0.335133,-0.045133,...,-0.572335,0.005991,0.108309,-0.219163,-0.135795,-0.08153,-0.060584,-0.019579,-0.34555,Adult_Male
