In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

sys.path.insert(0, "../0.data-download/scripts/")
from data_loader import load_data
from sklearn.model_selection import train_test_split
import random

In [2]:
random.seed(18)
print(random.random())

0.18126486333322134


In [3]:
# load all of the data
data_directory = "../0.data-download/data/"
model_df, effect_df = load_data(data_directory, adult_or_pediatric="all")

In [4]:
# verifying that the ModelIDs in model_df and effect_df are alligned
model_df["ID_allignment_verify"] = np.where(
    effect_df["ModelID"] == model_df["ModelID"], "True", "False"
)
verrify = len(model_df["ID_allignment_verify"].unique())
print(model_df["ID_allignment_verify"])
print(
    f"There is {verrify} output object contained in the ID_allignment_verify column \n"
)

0       True
1       True
2       True
3       True
4       True
        ... 
1090    True
1091    True
1092    True
1093    True
1094    True
Name: ID_allignment_verify, Length: 1095, dtype: object
There is 1 output object contained in the ID_allignment_verify column 



In [5]:
# assign 'AgeCategory' and 'Sex' columns to the effect dataframe as a single column
presplit_effect_df = effect_df.assign(
    age_and_sex=model_df.AgeCategory.astype(str) + "_" + model_df.Sex.astype(str)
)
presplit_effect_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001685,-0.087372,0.015117,0.048985,0.077843,0.056235,-0.244673,-0.005789,-0.640823,0.000453,...,-1.265692,-0.154120,0.096670,0.006034,0.095984,-0.129518,-0.076052,-0.398696,-0.297761,Adult_Male
1,ACH-000174,0.000897,-0.066240,0.079987,0.188907,-0.085974,-0.117544,0.045458,-0.261677,0.161824,...,-0.784581,0.083027,0.130791,0.043521,-0.027856,-0.209896,0.024099,0.053715,-0.307648,Adult_Female
2,ACH-000437,0.024594,-0.048088,0.012041,0.139078,-0.178267,-0.118532,0.084849,-0.533820,-0.145896,...,-1.105526,0.030498,-0.156198,-0.028790,-0.075081,-0.210636,-0.099105,-0.064418,-0.310778,Adult_Male
3,ACH-002249,-0.114575,0.023601,0.194384,0.158485,0.074918,0.123274,0.080912,-0.161167,-0.002424,...,-0.804532,-0.130158,0.082540,0.045316,-0.120891,-0.152138,-0.072103,-0.005430,-0.276722,Adult_Male
4,ACH-001053,-0.001340,-0.074294,-0.014928,0.145420,-0.128940,-0.230428,0.167884,-0.137224,0.209189,...,-0.757539,-0.012574,0.076976,0.054017,-0.066283,-0.481406,-0.322921,0.098200,-0.401689,Pediatric_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,ACH-000200,0.113405,-0.055428,0.124777,0.095393,-0.114553,-0.169954,0.045695,-0.125983,0.226319,...,-0.417783,-0.070983,0.166021,0.147979,0.171044,-0.172837,0.051541,-0.045085,-0.738485,Unknown_Female
1091,ACH-001653,0.066221,-0.073017,0.119258,0.184409,-0.085062,-0.072057,0.053095,-0.348875,0.093912,...,-1.161084,0.014566,0.315839,0.032362,-0.036103,-0.150099,0.076183,-0.071903,-0.536946,Adult_Male
1092,ACH-000876,0.076396,-0.195150,0.091908,0.216859,-0.280108,0.023805,0.116786,-0.299817,-0.045841,...,-0.564917,-0.116034,0.010508,-0.161025,-0.150296,-0.037239,-0.216403,-0.000845,-0.620540,Adult_Female
1093,ACH-001563,0.037377,-0.016801,0.155830,0.141208,-0.220587,-0.078475,0.133508,-0.205188,-0.012399,...,-0.769782,-0.099926,0.097558,0.190618,-0.036267,-0.098394,-0.468922,-0.061461,-0.674718,Unknown_Male


In [7]:
groups = model_df.groupby("AgeCategory")
df_list = []
for name, df in groups:

    # only looking for samples that contain Adult or Pediatric information
    if name == "Adult" or name == "Pediatric":
        df_list.append(df)

# merge sample dataframes through concatentation and reorganize so that ModelIDs are in alphabetical order
new_df = pd.concat(df_list, axis=0)
new_df = new_df.set_index("ModelID")
new_df = new_df.sort_index(ascending=True)
new_df = new_df.reset_index()
new_df

Unnamed: 0,ModelID,index,PatientID,CellLineName,StrippedCellLineName,Age,SourceType,SangerModelID,RRID,DepmapModelType,...,WTSIMasterCellID,EngineeredModel,TreatmentStatus,OnboardedMedia,PlateCoating,OncotreeCode,OncotreeSubtype,OncotreePrimaryDisease,OncotreeLineage,ID_allignment_verify
0,ACH-000001,0,PT-gj46wT,NIH:OVCAR-3,NIHOVCAR3,60.0,Commercial,SIDM00105,CVCL_0465,HGSOC,...,2201.0,,,MF-001-041,,HGSOC,High-Grade Serous Ovarian Cancer,Ovarian Epithelial Tumor,Ovary/Fallopian Tube,True
1,ACH-000004,3,PT-q4K2cp,HEL,HEL,30.0,Commercial,SIDM00594,CVCL_0001,AML,...,783.0,,Post-treatment,MF-001-001,,AML,Acute Myeloid Leukemia,Acute Myeloid Leukemia,Myeloid,True
2,ACH-000005,4,PT-q4K2cp,HEL 92.1.7,HEL9217,30.0,Commercial,SIDM00593,CVCL_2481,AML,...,,,,MF-001-001,,AML,Acute Myeloid Leukemia,Acute Myeloid Leukemia,Myeloid,True
3,ACH-000007,6,PT-NOXwpH,LS513,LS513,63.0,Commercial,SIDM00677,CVCL_1386,COAD,...,569.0,,,MF-001-001,,COAD,Colon Adenocarcinoma,Colorectal Adenocarcinoma,Bowel,True
4,ACH-000009,8,PT-puKIyc,C2BBe1,C2BBE1,72.0,Commercial,SIDM01233,CVCL_1096,COAD,...,2104.0,,,MF-002-021,,COAD,Colon Adenocarcinoma,Colorectal Adenocarcinoma,Bowel,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,ACH-002785,1841,PT-6sPicj,NCC-LMS1-C1,NCCLMS1C1,42.0,Academic lab,,CVCL_LK56,LMS,...,,,,MF-007-006,,LMS,Leiomyosarcoma,Leiomyosarcoma,Soft Tissue,True
908,ACH-002799,1842,PT-qqzgSc,NCC-MPNST1-C1,NCCMPNST1C1,37.0,Academic lab,,CVCL_YU12,MPNST,...,,,,MF-007-006,,MPNST,Malignant Peripheral Nerve Sheath Tumor,Nerve Sheath Tumor,Peripheral Nervous System,True
909,ACH-002800,1843,PT-WTpRkW,NCC-MPNST2-C1,NCCMPNST2C1,54.0,Academic lab,,CVCL_YU13,MPNST,...,,,,MF-007-006,,MPNST,Malignant Peripheral Nerve Sheath Tumor,Nerve Sheath Tumor,Peripheral Nervous System,True
910,ACH-002847,1848,PT-AFnHpd,YUHOIN 06-50,YUHOIN0650,59.0,Academic lab,,CVCL_J521,MEL,...,,,,MF-036-001,,MEL,Melanoma,Melanoma,Skin,True


In [8]:
# creating a list of ModelIDs that correlate to pediatric and adult samples
PA_effect_IDs = new_df["ModelID"].tolist()

PA_IDs = set(PA_effect_IDs) & set(presplit_effect_df["ModelID"].tolist())

# creating a new gene effect data frame containing correlating ModelIDs to the filtered sample info IDs
PA_effect_df = presplit_effect_df.loc[
    presplit_effect_df["ModelID"].isin(PA_IDs)
].reset_index(drop=True)

In [9]:
# split the data based on age category and sex
train_df, test_df = train_test_split(
    PA_effect_df, test_size=0.15, stratify=PA_effect_df.age_and_sex
)
train_df.reset_index(drop=True,inplace=True)
test_df.reset_index(drop=True,inplace=True)

In [16]:
# preparing train dataframe to be scaled  (Removing ID column and Age and Sex column)
col_num = train_df.shape[1]
train_scaled_df = train_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
train_scaled_df = scaler.fit_transform(train_scaled_df)

# adding id column and age and sex column back
train_scaled_df = pd.DataFrame(train_scaled_df)
train_scaled_df.insert(0, train_df.columns[0], train_df[train_df.columns[0]])
train_scaled_df.insert(col_num-1, train_df.columns[col_num-1], train_df[train_df.columns[col_num-1]])
train_scaled_df.columns = train_df.columns
train_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000563,0.489412,0.532876,0.561625,0.486711,0.358506,0.000000,0.566452,0.461005,0.726898,...,0.466162,0.705178,0.680051,0.568967,0.457548,0.575281,0.831034,0.510403,0.558231,Adult_Male
1,ACH-000835,0.508318,0.657452,0.483523,0.472117,0.506601,0.590136,0.542725,0.515519,0.678960,...,0.475679,0.640583,0.709266,0.541939,0.313601,0.575653,0.614657,0.530118,0.453958,Adult_Male
2,ACH-000614,0.536558,0.547955,0.530052,0.562705,0.500915,0.557939,0.531167,0.561759,0.684210,...,0.621348,0.531288,0.679634,0.593566,0.414176,0.673944,0.513534,0.471400,0.480055,Adult_Male
3,ACH-000682,0.481142,0.645998,0.498143,0.537203,0.799259,0.491496,0.489206,0.660863,0.725630,...,0.300575,0.759472,0.642169,0.481734,0.351173,0.446925,0.765688,0.607874,0.587424,Adult_Male
4,ACH-000623,0.432918,0.672769,0.382495,0.562286,0.562317,0.681582,0.324802,0.508789,0.627083,...,0.582966,0.688476,0.506635,0.368026,0.333722,0.448697,0.405630,0.531976,0.639055,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,ACH-001550,0.601868,0.654382,0.707341,0.366540,0.255491,0.684497,0.674783,0.337501,0.582176,...,0.588235,0.575667,0.771776,0.548142,0.383370,0.632336,0.543701,0.714784,0.641448,Adult_Female
771,ACH-001333,0.529367,0.519313,0.664246,0.403516,0.541083,0.606633,0.330890,0.591067,0.732643,...,0.563407,0.708108,0.565224,0.482222,0.134654,0.656207,0.507214,0.622584,0.723950,Adult_Female
772,ACH-002531,0.620491,0.607966,0.479939,0.440215,0.485448,0.498014,0.544036,0.600907,0.831001,...,0.415871,0.606602,0.748434,0.626812,0.474748,0.554496,0.599786,0.609184,0.672082,Adult_Female
773,ACH-000861,0.697076,0.439334,0.622811,0.357931,0.599079,0.596957,0.542910,0.577288,0.718779,...,0.625436,0.391801,0.308270,0.559759,0.371694,0.499611,0.415019,0.507498,0.557178,Adult_Female


In [10]:
# preparing test dataframe to be scaled (Removing ID column and Age and Sex column)
col_num = test_df.shape[1]
test_scaled_df = test_df.iloc[:, 1:col_num-1]

# scaling gene effect data to 0-1 range
scaler = MinMaxScaler(feature_range=(0,1))
test_scaled_df = scaler.fit_transform(test_scaled_df)

# adding id column and age and sex column back
test_scaled_df = pd.DataFrame(test_scaled_df)
test_scaled_df.insert(0, test_df.columns[0], test_df[test_df.columns[0]])
test_scaled_df.insert(col_num-1, test_df.columns[col_num-1], test_df[test_df.columns[col_num-1]])
test_scaled_df.columns = test_df.columns
test_scaled_df

Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-001494,0.541986,0.532081,0.558597,0.637316,0.738237,0.298841,0.517692,0.619299,0.396070,...,0.321978,0.467188,0.624147,0.748642,0.326285,0.702321,0.703554,0.897774,0.676829,Adult_Male
1,ACH-000435,0.900268,0.299823,0.495381,0.691276,0.062434,0.680200,0.712086,0.701041,0.259551,...,0.652436,0.612733,0.585070,0.539100,0.217743,0.709538,0.798784,0.426370,0.402523,Adult_Female
2,ACH-000356,0.856418,0.298030,0.624670,0.582829,0.554512,0.450514,0.561849,0.447229,0.670693,...,0.620012,0.387241,0.305425,0.729928,0.108395,0.471211,0.774960,0.612652,0.728367,Adult_Female
3,ACH-001418,0.627381,0.528661,0.459549,0.652843,0.411618,0.610103,0.485013,0.653038,0.585999,...,0.892157,0.654613,0.342001,0.571694,0.447987,0.780803,0.620892,0.572722,0.588297,Adult_Female
4,ACH-001555,0.353171,0.791061,0.622373,0.770904,1.000000,0.369688,0.790718,0.729390,0.992792,...,0.778903,0.123717,0.443134,0.601769,0.361396,0.404181,0.581646,0.528508,0.435142,Adult_Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,ACH-000858,0.608559,0.482628,0.613796,0.557921,0.382209,0.386368,0.488799,0.586984,0.621544,...,0.504009,0.459919,0.522764,0.516326,0.308563,0.719970,0.547629,0.647459,0.431649,Adult_Male
133,ACH-000514,0.728982,0.351283,0.485137,0.610238,0.369751,0.390339,0.555344,0.743735,0.487475,...,0.623386,0.223294,0.510811,0.533318,0.302627,0.769976,0.693454,0.493533,0.628537,Adult_Male
134,ACH-000258,0.312877,0.429836,0.511027,0.911883,0.749401,0.541085,0.529573,0.468142,0.369121,...,0.681045,0.613741,0.761569,0.550571,0.382369,0.559220,1.000000,0.000000,0.497427,Adult_Female
135,ACH-002446,0.252033,0.365809,0.625954,0.696098,0.175052,0.467192,0.523135,0.515874,0.705227,...,0.554313,0.586963,0.606731,0.445248,0.279238,0.659141,0.418679,0.668915,0.726900,Adult_Male


In [17]:
# save the TESTING dataframe
test_df = test_df.reset_index(drop=True)
testing_df_output = pathlib.Path("../0.data-download/data/VAE_test_df.csv")
test_df.to_csv(testing_df_output, index=False)
print(test_df.shape)
test_df.head(3)

(137, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000070,0.01997,-0.057256,0.070187,0.261183,-0.226158,0.032391,0.074601,0.006518,0.054415,...,-0.276654,0.10114,0.259753,-0.06866,-0.154482,-0.057477,-0.161584,-0.118518,-0.346338,Pediatric_Male
1,ACH-000424,-0.019564,-0.033171,0.09679,0.189534,-0.10604,-0.133097,0.031001,-0.28069,0.197665,...,-0.112876,0.071519,0.044086,-0.07497,0.013155,-0.097721,0.001975,-0.089875,-0.219496,Adult_Male
2,ACH-000510,0.001227,-0.005976,0.011694,0.07963,-0.070977,0.059904,0.03937,-0.192833,-0.03221,...,-0.200556,0.007017,0.155182,0.044714,-0.152854,-0.123519,-0.028177,-0.014255,-0.355884,Adult_Male


In [18]:
# save the TRAINING dataframe
train_df = train_df.reset_index(drop=True)
training_df_output = pathlib.Path("../0.data-download/data/VAE_train_df.csv")
train_df.to_csv(training_df_output, index=False)
print(train_df.shape)
train_df.head(3)

(775, 17204)


Unnamed: 0,ModelID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),age_and_sex
0,ACH-000563,-0.159728,-0.147423,0.093692,0.124156,-0.348514,-0.636887,0.080105,-0.410251,0.071254,...,-0.691902,0.090263,0.138357,0.011552,0.046828,-0.132023,0.239229,-0.184998,-0.310223,Adult_Male
1,ACH-000835,-0.14442,0.002735,0.033199,0.112737,-0.199756,-0.06154,0.0599,-0.322688,0.011388,...,-0.672085,0.028383,0.168589,-0.011735,-0.094057,-0.131579,0.036969,-0.164815,-0.462236,Adult_Male
2,ACH-000614,-0.121553,-0.129247,0.069238,0.183615,-0.205467,-0.09293,0.050058,-0.248412,0.017945,...,-0.368747,-0.076318,0.137926,0.032744,0.004378,-0.014267,-0.057556,-0.224924,-0.424191,Adult_Male
