In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
# Seed the random number generators used by this notebook.
# random.seed() only affects Python's built-in `random` module. NumPy keeps its
# own global generator, which scikit-learn falls back on when a splitter is
# called without an explicit random_state, so it is seeded here as well.
random.seed(18)
np.random.seed(18)
print(random.random())

0.18126486333322134


In [3]:
# Load both dataframes returned by the project's load_data helper.
# NOTE(review): adult_or_pediatric="all" presumably requests every sample
# regardless of age category -- confirm against data_loader.load_data.
data_directory = "../0.data-download/data/"
model_df, dependency_df = load_data(
    data_directory,
    adult_or_pediatric="all",
)

In [4]:
# Verify that the DepMap_IDs in model_df and dependency_df are aligned row by row.
# A later cell copies columns from model_df onto dependency_df by index, so a
# mismatch here would silently attach the wrong labels to samples.
model_df["ID_allignment_verify"] = np.where(
    dependency_df["DepMap_ID"] == model_df["DepMap_ID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

# A single unique value is not proof of alignment (every row could be "False"),
# so stop the notebook unless every row actually matched.
misaligned_count = int((model_df["ID_allignment_verify"] != "True").sum())
if misaligned_count:
    raise ValueError(
        f"{misaligned_count} rows have different DepMap_IDs in model_df and dependency_df"
    )

0       True
1       True
2       True
3       True
4       True
        ... 
1081    True
1082    True
1083    True
1084    True
1085    True
Name: ID_allignment_verify, Length: 1086, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [5]:
# Build one combined "<age_categories>_<sex>" label per row of model_df and
# attach it to the dependency dataframe as the single column `age_and_sex`.
age_labels = model_df["age_categories"].astype(str)
sex_labels = model_df["sex"].astype(str)
presplit_dependency_df = dependency_df.assign(
    age_and_sex=age_labels + "_" + sex_labels
)
presplit_dependency_df

Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZW10 (9183),ZWILCH (55055),ZWINT (11130),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000001,0.094568,0.012519,0.027460,0.025962,0.073412,0.027340,0.020199,0.284733,0.022084,...,0.555867,0.037449,0.080585,0.004241,0.082956,0.012000,0.003592,0.012679,0.324623,Adult_Female
1,ACH-000004,0.012676,0.049011,0.075933,0.033215,0.013176,0.097497,0.005015,0.153166,0.007358,...,0.007427,0.038768,0.230569,0.011203,0.060266,0.128375,0.005911,0.004645,0.042530,Adult_Male
2,ACH-000005,0.053957,0.027968,0.010139,0.005448,0.018599,0.081636,0.005457,0.159904,0.050884,...,0.071035,0.017479,0.274568,0.033416,0.034712,0.092832,0.012482,0.020843,0.050412,Adult_Male
3,ACH-000007,0.026704,0.083588,0.008853,0.011299,0.027288,0.028349,0.032573,0.166503,0.047045,...,0.213754,0.196233,0.615338,0.005432,0.035241,0.138445,0.103161,0.146222,0.274833,Adult_Male
4,ACH-000009,0.059383,0.051826,0.015370,0.011721,0.030062,0.078373,0.042128,0.184783,0.032335,...,0.169463,0.152385,0.405712,0.056461,0.189550,0.328064,0.035161,0.058402,0.269194,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,ACH-002834,0.012720,0.083840,0.013308,0.013315,0.107286,0.079063,0.013340,0.717478,0.040061,...,0.493501,0.027170,0.636352,0.029567,0.029711,0.024132,0.174946,0.091510,0.421153,Missing_Female
1082,ACH-002847,0.058547,0.079576,0.036707,0.030234,0.069976,0.032828,0.012784,0.476383,0.096910,...,0.731151,0.476423,0.967625,0.010776,0.072452,0.131375,0.115486,0.094022,0.438092,Adult_Female
1083,ACH-002874,0.169875,0.017430,0.017965,0.008615,0.117915,0.075693,0.087961,0.144152,0.140377,...,0.462012,0.710398,0.747477,0.017923,0.077515,0.094231,0.166188,0.058117,0.302708,Adult_Female
1084,ACH-002875,0.035519,0.099789,0.071748,0.048479,0.245413,0.154663,0.045155,0.324723,0.089083,...,0.090844,0.329219,0.168367,0.017571,0.101216,0.087739,0.050947,0.182081,0.349872,Adult_Male


In [6]:
# Keep only the samples whose age category is Adult or Pediatric, then order the
# rows by DepMap_ID (ascending) and restore DepMap_ID as a regular column.
# A boolean mask selects the same rows as looping over groupby("age_categories")
# and concatenating the matching groups, without the Python loop, and it yields
# an empty frame instead of a concat error when neither category is present.
is_adult_or_pediatric = model_df["age_categories"].isin(["Adult", "Pediatric"])
new_df = (
    model_df.loc[is_adult_or_pediatric]
    .set_index("DepMap_ID")
    .sort_index(ascending=True)
    .reset_index()
)

In [7]:
# DepMap_IDs of the samples kept in new_df
PA_dependency_IDs = new_df["DepMap_ID"].tolist()

# IDs that appear both in new_df and in the dependency dataframe
PA_IDs = set(PA_dependency_IDs).intersection(
    presplit_dependency_df["DepMap_ID"].tolist()
)

# Restrict the gene dependency dataframe to those shared IDs and renumber its rows
shared_id_mask = presplit_dependency_df["DepMap_ID"].isin(PA_IDs)
PA_dependency_df = presplit_dependency_df[shared_id_mask].reset_index(drop=True)

In [8]:
# Split the data into training (85%) and testing (15%) sets, stratified on the
# combined age category / sex label.
# random_state pins the shuffle: random.seed() does not influence scikit-learn,
# so without it every run would write a different train/test partition to disk.
train_df, test_df = train_test_split(
    PA_dependency_df,
    test_size=0.15,
    stratify=PA_dependency_df.age_and_sex,
    random_state=18,
)

In [9]:
# Save the TESTING dataframe as CSV (without the index) and preview it.
# The output path is derived from data_directory, the same folder the inputs
# were loaded from, so the load and save locations cannot drift apart.
test_df = test_df.reset_index(drop=True)
testing_df_output = pathlib.Path(data_directory) / "VAE_test_df.csv"
test_df.to_csv(testing_df_output, index=False)
print(test_df.shape)
test_df.head(3)

(133, 16710)


Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZW10 (9183),ZWILCH (55055),ZWINT (11130),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000274,0.033623,0.031185,0.035355,0.052351,0.156149,0.091871,0.06304,0.260657,0.049294,...,0.340601,0.204846,0.277389,0.023174,0.074124,0.1453,0.061529,0.044921,0.283505,Adult_Male
1,ACH-000288,0.016983,0.05823,0.014237,0.042621,0.116682,0.023993,0.010857,0.558349,0.076136,...,0.204487,0.104914,0.287556,0.042241,0.030508,0.0329,0.522408,0.19741,0.627564,Adult_Female
2,ACH-000701,0.146827,0.074509,0.034836,0.012635,0.149864,0.014481,0.043878,0.482275,0.023246,...,0.01122,0.021019,0.878102,0.063242,0.046081,0.025801,0.030127,0.029867,0.478795,Adult_Female


In [10]:
# Save the TRAINING dataframe as CSV (without the index) and preview it.
# The output path is derived from data_directory, the same folder the inputs
# were loaded from, so the load and save locations cannot drift apart.
train_df = train_df.reset_index(drop=True)
training_df_output = pathlib.Path(data_directory) / "VAE_train_df.csv"
train_df.to_csv(training_df_output, index=False)
print(train_df.shape)
train_df.head(3)

(751, 16710)


Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZW10 (9183),ZWILCH (55055),ZWINT (11130),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000438,0.011771,0.034266,0.015891,0.003441,0.070728,0.091638,0.02627,0.093068,0.013842,...,0.160328,0.038954,0.699979,0.006463,0.112275,0.061215,0.036532,0.021651,0.358402,Adult_Male
1,ACH-000903,0.007686,0.111137,0.011976,0.007439,0.021424,0.111798,0.031786,0.257868,0.037013,...,0.170732,0.250104,0.876435,0.004359,0.084443,0.041802,0.017984,0.046754,0.476792,Adult_Male
2,ACH-000271,0.178274,0.022445,0.050981,0.034388,0.083899,0.073445,0.024923,0.502983,0.052048,...,0.444796,0.069423,0.150791,0.017066,0.06111,0.021301,0.083719,0.056561,0.067086,Adult_Male
