In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
# Seed every random number generator this notebook can draw from.
# Python's `random` module and NumPy's global RNG are independent of each
# other, and scikit-learn uses NumPy's global RNG for any call that is left at
# random_state=None, so seeding `random` alone does not make results repeatable.
random.seed(18)
np.random.seed(18)
# sanity check: the first draw after seeding should be the same on every run
print(random.random())

0.18126486333322134


In [3]:
# Read the sample-information table and the gene-effect table from the
# download directory, without restricting to adult or pediatric samples.
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(
    data_directory,
    adult_or_pediatric="all",
)

In [4]:
# Verify that model_df and effect_df list the same ModelID on every row.
# The next cell copies AgeCategory and Sex from model_df onto effect_df row by
# row, so a misalignment would attach the wrong metadata to a sample.
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# One unique value is not proof of alignment: the column also holds exactly one
# unique value when *every* row mismatches. Count the mismatches explicitly and
# stop the notebook here if there are any.
mismatched_ids = int((model_df["ID_allignment_verify"] != "True").sum())
assert mismatched_ids == 0, (
    f"{mismatched_ids} ModelIDs differ between model_df and effect_df"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1090    True
1091    True
1092    True
1093    True
1094    True
Name: ID_allignment_verify, Length: 1095, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [5]:
# Combine each sample's age category and sex into one "<AgeCategory>_<Sex>"
# label and attach it to the gene-effect data as the `age_and_sex` column.
age_sex_labels = (
    model_df["AgeCategory"].astype(str) + "_" + model_df["Sex"].astype(str)
)
presplit_effect_df = effect_df.assign(age_and_sex=age_sex_labels)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000266,-0.028398,-0.022937,-0.006987,0.142746,-0.117743,-0.216685,0.033967,-0.161135,-0.027198,...,-0.355777,0.069332,0.099564,-0.025601,0.005394,-0.208340,-0.005360,-0.061719,-0.508618,Adult_Male
1,ACH-000430,-0.062519,0.106732,-0.036588,0.272937,-0.102398,-0.214351,0.175196,-0.173620,-0.056099,...,-0.665993,0.022698,0.106139,-0.009114,-0.016622,-0.251703,-0.331515,0.060063,-0.049055,Adult_Female
2,ACH-000558,-0.087929,0.415623,0.076157,0.127042,-0.151737,-0.019232,-0.010132,-0.377726,-0.036369,...,-0.306052,0.029401,-0.013930,-0.140352,-0.097930,-0.269515,0.054201,-0.170485,-0.229471,Adult_Male
3,ACH-000557,-0.049434,-0.052057,0.048716,0.041854,-0.266554,-0.170337,0.003633,-0.124267,0.024005,...,-0.776452,0.102088,0.161954,0.188481,-0.007581,-0.096916,-0.004062,-0.041053,-0.315617,Pediatric_Female
4,ACH-000704,-0.114227,0.112926,0.021755,0.525763,-0.325530,-0.075798,0.288807,-0.026511,-0.098270,...,-0.603837,-0.003486,0.057642,-0.418030,-0.194850,-0.257974,0.144630,-0.237777,-0.289710,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,ACH-000292,-0.160925,-0.105269,-0.109138,0.218616,-0.159203,0.031512,0.063845,-0.091290,0.047961,...,-0.135885,-0.042262,0.061725,-0.198449,0.005165,-0.179771,0.157140,-0.179504,-0.151184,Adult_Male
1091,ACH-001853,-0.062865,0.013948,-0.002094,-0.024171,0.059736,-0.106562,0.025951,-0.497406,-0.054150,...,-0.668549,0.055661,0.127750,0.109272,-0.139405,-0.078453,0.106574,-0.122756,-0.301331,Adult_Male
1092,ACH-000938,-0.147660,-0.053382,-0.025082,0.024069,-0.259204,0.057882,0.019421,0.199279,0.000672,...,-0.659749,-0.014582,0.164585,0.053554,-0.028535,-0.147715,0.112740,0.072861,-0.254835,Adult_Male
1093,ACH-000495,-0.142019,-0.112633,-0.023152,0.051131,-0.317712,-0.116598,0.132468,-0.575592,-0.127428,...,-0.020796,0.072230,0.045644,0.019910,-0.140784,-0.818626,-0.007163,-0.178739,-0.228241,Adult_Male


In [6]:
# Scale every gene-effect column to the 0-1 range. The first column (sample ID)
# and the last column (age_and_sex) are not numeric features and are carried
# over unchanged.
# NOTE(review): the scaler is fit on all samples before the train/test split
# performed later in this notebook, so held-out rows contribute to each
# column's min/max. Confirm this is intended.
col_num = presplit_effect_df.shape[1]
gene_effect_values = presplit_effect_df.iloc[:, 1 : col_num - 1]

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(gene_effect_values)

# Rebuild the scaled block with the source frame's index and column labels.
# A bare pd.DataFrame(scaled_values) would get a fresh 0..n-1 index, and
# re-attaching the ID/label columns would then align on index labels and
# silently produce NaN for any row whose label is not in that range.
scaled_genes_df = pd.DataFrame(
    scaled_values,
    index=presplit_effect_df.index,
    columns=gene_effect_values.columns,
)

# put the ID column back in front and the age_and_sex column back at the end,
# which reproduces the column order of presplit_effect_df
presplit_effect_scaled_df = pd.concat(
    [
        presplit_effect_df.iloc[:, [0]],
        scaled_genes_df,
        presplit_effect_df.iloc[:, [col_num - 1]],
    ],
    axis=1,
)
presplit_effect_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000266,0.520138,0.636154,0.431639,0.549511,0.588249,0.431003,0.518280,0.634707,0.648062,...,0.627577,0.601699,0.642562,0.525845,0.415214,0.511338,0.569373,0.630826,0.422142,Adult_Male
1,ACH-000430,0.486500,0.743732,0.393421,0.702637,0.603525,0.433397,0.682086,0.627311,0.624920,...,0.478605,0.558834,0.648916,0.544981,0.392719,0.475005,0.220455,0.749789,0.737379,Adult_Female
2,ACH-000558,0.461450,1.000000,0.538986,0.531041,0.554406,0.633532,0.467132,0.506403,0.640719,...,0.651456,0.564996,0.532885,0.392652,0.309645,0.460082,0.633092,0.524579,0.613623,Adult_Male
3,ACH-000557,0.499400,0.611995,0.503556,0.430846,0.440101,0.478542,0.483097,0.656546,0.689063,...,0.425560,0.631808,0.702854,0.774330,0.401956,0.604696,0.570762,0.651014,0.554531,Pediatric_Female
4,ACH-000704,0.435526,0.748871,0.468747,1.000000,0.381387,0.575512,0.813858,0.714455,0.591151,...,0.508453,0.534767,0.602049,0.070350,0.210619,0.469751,0.729832,0.458845,0.572302,Adult_Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,ACH-000292,0.389489,0.567848,0.299751,0.638747,0.546973,0.685580,0.552935,0.676081,0.708246,...,0.733174,0.499124,0.605995,0.325218,0.414979,0.535275,0.743215,0.515769,0.667324,Adult_Male
1091,ACH-001853,0.486159,0.666755,0.437957,0.353190,0.764936,0.543957,0.508983,0.435508,0.626480,...,0.477377,0.589133,0.669800,0.682392,0.267268,0.620165,0.689120,0.571203,0.564331,Adult_Male
1092,ACH-000938,0.402566,0.610896,0.408277,0.409929,0.447418,0.712628,0.501409,0.848207,0.670379,...,0.481603,0.524567,0.705396,0.617720,0.380548,0.562133,0.695717,0.762290,0.596225,Adult_Male
1093,ACH-000495,0.408127,0.561739,0.410769,0.441758,0.389171,0.533663,0.632527,0.389192,0.567803,...,0.788442,0.604363,0.590456,0.578669,0.265860,0.000000,0.567445,0.516516,0.614467,Adult_Male


In [7]:
# Keep only the samples whose AgeCategory is Adult or Pediatric, ordered
# alphabetically by ModelID.
# A boolean mask replaces the former groupby/append/concat loop: it selects the
# same rows in one step and yields an empty frame, rather than a
# "No objects to concatenate" error, when neither category is present.
is_adult_or_pediatric = model_df["AgeCategory"].isin(["Adult", "Pediatric"])
new_df = (
    model_df.loc[is_adult_or_pediatric]
    .set_index("ModelID")
    .sort_index(ascending=True)
    .reset_index()
)

In [8]:
# ModelIDs of the samples kept in new_df
PA_effect_IDs = new_df["ModelID"].to_list()

# IDs that appear both in the filtered sample info and in the scaled effect data
scaled_model_ids = presplit_effect_scaled_df["ModelID"]
PA_IDs = set(PA_effect_IDs).intersection(scaled_model_ids.to_list())

# restrict the scaled gene-effect frame to those shared IDs and renumber the rows
keep_rows = scaled_model_ids.isin(PA_IDs)
PA_effect_df = presplit_effect_scaled_df[keep_rows].reset_index(drop=True)

In [9]:
# Split the samples into training (85%) and test (15%) sets, stratified on the
# combined age-category/sex label.
# random_state pins the shuffle. scikit-learn draws from NumPy's RNG, which the
# `random.seed(18)` call at the top of this notebook does not affect, so without
# it the saved train/test files would differ on every run. The value 18 matches
# that seed.
train_df, test_df = train_test_split(
    PA_effect_df,
    test_size=0.15,
    stratify=PA_effect_df.age_and_sex,
    random_state=18,
)

In [10]:
# Write the held-out TESTING split to disk with a fresh 0..n-1 row index,
# then report its dimensions and preview the first rows.
testing_df_output = pathlib.Path("../0.data-download/data/VAE_test_df.csv")
test_df = test_df.reset_index(drop=True)
test_df.to_csv(testing_df_output, index=False)

print(test_df.shape)
test_df.head(3)

(137, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-002189,0.511344,0.41507,0.452344,0.472997,0.472125,0.554615,0.512139,0.623792,0.730746,...,0.164927,0.623615,0.590029,0.548699,0.333368,0.641548,0.502526,0.489526,0.436068,Adult_Male
1,ACH-002508,0.455775,0.611546,0.651625,0.518558,0.417817,0.477771,0.532775,0.475233,0.661932,...,0.697598,0.486749,0.832748,0.295948,0.454446,0.6572,0.419787,0.478215,0.780292,Adult_Female
2,ACH-002249,0.435182,0.674764,0.691628,0.568022,0.780051,0.779701,0.572729,0.634688,0.6679,...,0.412075,0.418333,0.626111,0.608158,0.286185,0.558428,0.497973,0.685813,0.581211,Adult_Male


In [11]:
# Write the TRAINING split to disk with a fresh 0..n-1 row index,
# then report its dimensions and preview the first rows.
training_df_output = pathlib.Path("../0.data-download/data/VAE_train_df.csv")
train_df = train_df.reset_index(drop=True)
train_df.to_csv(training_df_output, index=False)

print(train_df.shape)
train_df.head(3)

(775, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001098,0.516739,0.683159,0.468068,0.645882,0.402855,0.415601,0.628597,0.430881,0.77506,...,0.34865,0.386915,0.88207,0.570699,0.410865,0.623524,0.406286,0.662477,0.647628,Adult_Female
1,ACH-000191,0.414518,0.635406,0.647777,0.655013,0.645086,0.63451,0.562183,0.532213,0.533103,...,0.523889,0.485996,0.729811,0.524594,0.555595,0.512446,0.517305,0.484736,0.685059,Adult_Female
2,ACH-001664,0.526693,0.603641,0.506068,0.601432,0.571985,0.533206,0.578619,0.688221,0.715051,...,0.421047,0.539142,0.790404,0.549202,0.407552,0.565419,0.537362,0.633422,0.61298,Adult_Male
