In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
random.seed(18)
print(random.random())

0.18126486333322134


In [3]:
# load all of the data
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [4]:
# verifying that the ModelIDs in model_df and effect_df are alligned
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1090    True
1091    True
1092    True
1093    True
1094    True
Name: ID_allignment_verify, Length: 1095, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [5]:
# assign 'AgeCategory' and 'Sex' columns to the effect dataframe as a single column
presplit_effect_df = effect_df.assign(
    age_and_sex=model_df.AgeCategory.astype(str) + "_" + model_df.Sex.astype(str)
)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002105,-0.314308,-0.148377,0.322449,0.288628,-0.118865,-0.027055,0.026010,0.057637,0.028003,...,-0.798326,-0.001375,0.018508,0.046260,-0.179293,-0.163784,-0.119450,0.006698,-0.851418,Adult_Male
1,ACH-000858,-0.009039,-0.045457,0.102832,0.090935,-0.243846,-0.108709,0.064013,-0.243934,0.052031,...,-0.764216,0.016776,0.080290,-0.092770,-0.030648,-0.172892,-0.050756,-0.150432,-0.572274,Adult_Male
2,ACH-000934,0.212504,-0.049721,-0.028394,0.126884,0.022341,-0.066939,0.338952,-0.208831,-0.150766,...,-0.440257,-0.111064,0.121104,-0.029618,-0.043777,-0.376247,0.012861,-0.209974,-0.193133,Adult_Female
3,ACH-000251,-0.054799,-0.077416,-0.007140,0.313804,-0.400648,0.029437,0.176232,-0.449228,-0.039122,...,-0.462346,0.097288,-0.055852,0.013131,0.055672,-0.285718,-0.146541,-0.177788,-0.480142,Adult_Male
4,ACH-001696,-0.097449,-0.008818,0.102460,0.097805,-0.088625,0.006080,-0.081125,-0.090428,0.016431,...,-0.854876,-0.002189,0.090821,-0.280045,0.012322,-0.156963,-0.026936,-0.165223,-0.447516,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,ACH-002284,-0.214917,-0.027022,0.167066,0.240833,-0.357411,-0.095335,0.053467,-0.214667,-0.051397,...,-0.552580,0.026223,0.070879,0.038203,-0.219777,-0.154838,-0.020252,-0.122063,-0.140589,Unknown_Male
1091,ACH-000487,-0.247309,-0.068908,-0.030456,0.114587,-0.109883,-0.063936,0.058313,-0.063875,-0.216803,...,-0.445278,0.111982,-0.018728,-0.021258,-0.027053,-0.220832,0.050112,-0.008104,-0.283321,Adult_Male
1092,ACH-001528,0.054497,0.000570,0.168840,0.139596,-0.211655,-0.129352,0.037875,-0.160878,-0.118007,...,-0.523774,-0.042974,0.191611,-0.047298,-0.147217,-0.224033,-0.124209,-0.003906,-0.361169,Adult_Male
1093,ACH-002486,-0.072754,-0.115541,0.011041,0.203901,-0.208569,-0.075561,0.022534,-0.169212,0.011657,...,-0.658883,-0.073363,0.176177,0.117710,0.021056,-0.124304,-0.153661,-0.040472,-0.129676,Adult_Female


In [6]:
groups = model_df.groupby("AgeCategory")
df_list = []
for name, df in groups:

    # only looking for samples that contain Adult or Pediatric information
    if name == "Adult" or name == "Pediatric":
        df_list.append(df)

# merge sample dataframes through concatentation and reorganize so that ModelIDs are in alphabetical order
new_df = pd.concat(df_list, axis=0)
new_df = new_df.set_index("ModelID")
new_df = new_df.sort_index(ascending=True)
new_df = new_df.reset_index()

In [7]:
# creating a list of ModelIDs that correlate to pediatric and adult samples
PA_effect_IDs = new_df["ModelID"].tolist()

PA_IDs = set(PA_effect_IDs) & set(presplit_effect_df["ModelID"].tolist())

# creating a new gene effect data frame containing correlating ModelIDs to the filtered sample info IDs
PA_effect_df = presplit_effect_df.loc[
    presplit_effect_df["ModelID"].isin(PA_IDs)
].reset_index(drop=True)

In [8]:
# split the data based on age category and sex
train_df, test_df = train_test_split(
    PA_effect_df, test_size=0.15, stratify=PA_effect_df.age_and_sex
)
train_df.reset_index(drop=True,inplace=True)
test_df.reset_index(drop=True,inplace=True)

In [9]:
# preparing train dataframe to be scaled
col_num = train_df.shape[1]
train_scaled_df = train_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
train_scaled_df = scaler.fit_transform(train_scaled_df)

# adding id column and age and sex column back
train_scaled_df = pd.DataFrame(train_scaled_df)
train_scaled_df.insert(0, train_df.columns[0], train_df[train_df.columns[0]])
train_scaled_df.insert(col_num-1, train_df.columns[col_num-1], train_df[train_df.columns[col_num-1]])
train_scaled_df.columns = train_df.columns
train_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001549,0.392275,0.667918,0.554767,0.368277,0.528467,0.560669,0.500658,0.688804,0.797533,...,0.677852,0.587598,0.577608,0.414359,0.412161,0.570502,0.555368,0.645499,0.594098,Adult_Male
1,ACH-001992,0.486033,0.647415,0.601267,0.522878,0.544102,0.608732,0.484285,0.087412,0.644856,...,0.433961,0.611164,0.546144,0.489531,0.464750,0.496016,0.517540,0.712793,0.577095,Adult_Male
2,ACH-000035,0.410732,0.648691,0.479683,0.393580,0.743450,0.713497,0.564007,0.551726,0.721885,...,0.349813,0.596096,0.694278,0.594866,0.381780,0.423628,0.586071,0.631242,0.631205,Adult_Male
3,ACH-000911,0.632961,0.597566,0.572424,0.508326,0.581603,0.704885,0.569375,0.655805,0.669587,...,0.643818,0.600972,0.692237,0.503732,0.333366,0.562517,0.605643,0.621942,0.464176,Adult_Male
4,ACH-000259,0.434355,0.655679,0.429820,0.439803,0.497398,0.558239,0.538433,0.651400,0.612164,...,0.719969,0.576706,0.743587,0.346348,0.413856,0.481636,0.586585,0.593089,0.496811,Pediatric_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,ACH-000335,0.368739,0.455084,0.360580,0.342994,0.933640,0.610947,0.483669,0.596631,0.805211,...,0.430520,0.720961,0.713599,0.444006,0.295493,0.485659,0.626263,0.447450,0.587142,Adult_Male
771,ACH-000336,0.492400,0.845974,0.292208,0.381554,0.672037,0.638091,0.495111,0.656449,0.635322,...,0.567281,0.540489,0.453944,0.318640,0.357341,0.582563,0.720862,0.478016,0.612214,Adult_Male
772,ACH-002083,0.443569,0.432300,0.522205,0.454116,0.735555,0.450572,0.608679,0.496599,0.492282,...,1.000000,0.508299,0.525602,0.416311,0.469107,0.506544,0.767484,0.659552,0.295207,Pediatric_Male
773,ACH-000188,0.386268,0.641716,0.396350,0.490796,0.488370,0.656236,0.474345,0.494346,0.656070,...,0.369867,0.469017,0.706775,0.502648,0.523998,0.565365,0.674665,0.509742,0.528352,Adult_Male


In [10]:
# preparing test dataframe to be scaled
col_num = test_df.shape[1]
test_scaled_df = test_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
test_scaled_df = scaler.fit_transform(test_scaled_df)

# adding id column and age and sex column back
test_scaled_df = pd.DataFrame(test_scaled_df)
test_scaled_df.insert(0, test_df.columns[0], test_df[test_df.columns[0]])
test_scaled_df.insert(col_num-1, test_df.columns[col_num-1], test_df[test_df.columns[col_num-1]])
test_scaled_df.columns = test_df.columns
test_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001494,0.541986,0.532081,0.558597,0.637316,0.738237,0.298841,0.517692,0.619299,0.396070,...,0.321978,0.467188,0.624147,0.748642,0.326285,0.702321,0.703554,0.897774,0.676829,Adult_Male
1,ACH-000435,0.900268,0.299823,0.495381,0.691276,0.062434,0.680200,0.712086,0.701041,0.259551,...,0.652436,0.612733,0.585070,0.539100,0.217743,0.709538,0.798784,0.426370,0.402523,Adult_Female
2,ACH-000356,0.856418,0.298030,0.624670,0.582829,0.554512,0.450514,0.561849,0.447229,0.670693,...,0.620012,0.387241,0.305425,0.729928,0.108395,0.471211,0.774960,0.612652,0.728367,Adult_Female
3,ACH-001418,0.627381,0.528661,0.459549,0.652843,0.411618,0.610103,0.485013,0.653038,0.585999,...,0.892157,0.654613,0.342001,0.571694,0.447987,0.780803,0.620892,0.572722,0.588297,Adult_Female
4,ACH-001555,0.353171,0.791061,0.622373,0.770904,1.000000,0.369688,0.790718,0.729390,0.992792,...,0.778903,0.123717,0.443134,0.601769,0.361396,0.404181,0.581646,0.528508,0.435142,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,ACH-000858,0.608559,0.482628,0.613796,0.557921,0.382209,0.386368,0.488799,0.586984,0.621544,...,0.504009,0.459919,0.522764,0.516326,0.308563,0.719970,0.547629,0.647459,0.431649,Adult_Male
133,ACH-000514,0.728982,0.351283,0.485137,0.610238,0.369751,0.390339,0.555344,0.743735,0.487475,...,0.623386,0.223294,0.510811,0.533318,0.302627,0.769976,0.693454,0.493533,0.628537,Adult_Male
134,ACH-000258,0.312877,0.429836,0.511027,0.911883,0.749401,0.541085,0.529573,0.468142,0.369121,...,0.681045,0.613741,0.761569,0.550571,0.382369,0.559220,1.000000,0.000000,0.497427,Adult_Female
135,ACH-002446,0.252033,0.365809,0.625954,0.696098,0.175052,0.467192,0.523135,0.515874,0.705227,...,0.554313,0.586963,0.606731,0.445248,0.279238,0.659141,0.418679,0.668915,0.726900,Adult_Male


In [11]:
# save the TESTING dataframe
test_df = test_df.reset_index(drop=True)
testing_df_output = pathlib.Path("../0.data-download/data/VAE_test_df.csv")
test_df.to_csv(testing_df_output, index=False)
print(test_df.shape)
test_df.head(3)

(137, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001494,-0.040113,-0.0148,0.070202,0.140399,-0.018159,-0.162754,0.083947,-0.210754,-0.075114,...,-1.037056,0.020646,0.148957,0.064287,-0.015054,-0.188722,0.074684,0.064936,-0.31454,Adult_Male
1,ACH-000435,0.12712,-0.158781,0.032833,0.174016,-0.446554,0.072724,0.218064,-0.126822,-0.152098,...,-0.541746,0.098139,0.12249,-0.077374,-0.11056,-0.182248,0.151294,-0.340655,-0.602892,Adult_Female
2,ACH-000356,0.106652,-0.159892,0.109261,0.106453,-0.134623,-0.069101,0.114412,-0.387433,0.079746,...,-0.590345,-0.02192,-0.066912,0.051635,-0.206775,-0.396002,0.132128,-0.18038,-0.260363,Adult_Female


In [12]:
# save the TRAINING dataframe
train_df = train_df.reset_index(drop=True)
training_df_output = pathlib.Path("../0.data-download/data/VAE_train_df.csv")
train_df.to_csv(training_df_output, index=False)
print(train_df.shape)
train_df.head(3)

(775, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001549,-0.158099,0.015349,0.08838,0.036363,-0.177793,-0.090268,0.024078,-0.044344,0.159466,...,-0.251086,0.053991,0.032349,-0.12165,-0.054366,-0.106974,-0.018452,-0.034391,-0.257935,Adult_Male
1,ACH-001992,-0.062993,-0.009363,0.124396,0.156133,-0.162087,-0.04341,0.010135,-1.010345,-0.031202,...,-0.758958,0.079629,-0.00021,-0.056886,-0.010138,-0.19054,-0.053812,0.032161,-0.282723,Adult_Male
2,ACH-000035,-0.139376,-0.007825,0.030225,0.055965,0.038153,0.058729,0.078023,-0.264528,0.064994,...,-0.934184,0.063236,0.15308,0.033864,-0.079916,-0.271753,0.010248,-0.048492,-0.203839,Adult_Male
