In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
# Put the download-scripts folder at the front of the module search path before
# importing `data_loader` (presumably that is where the module lives -- confirm).
# The path is relative, so it is resolved against the current working directory.
sys.path.insert(0, "./0.data-download/scripts/")
from data_loader import load_data
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.decomposition import PCA
from tensorflow import keras

from tensorflow.keras.models import Model, Sequential
import seaborn
import random as python_random
import tensorflow as tf

In [2]:
# Read the tables from the data directory through the project loader,
# requesting adult_or_pediatric="all".
data_directory = "./0.data-download/data/"
dfs = load_data(data_directory, adult_or_pediatric="all")

# element 0 is bound to sample_df, element 1 to dependency_df
sample_df, dependency_df = dfs[0], dfs[1]

In [24]:
# Rows of dependency_df that contain at least one missing value
row_has_missing = dependency_df.isnull().any(axis="columns")
nan_rows = dependency_df[row_has_missing]
nan_rows

Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
8,ACH-000014,0.035633,0.06664,0.014464,0.015126,0.03723,0.086036,0.351851,0.100912,0.122037,...,0.211058,0.937716,,,0.090195,0.052079,0.025882,0.24309,0.003396,0.0773
121,ACH-000216,0.077973,0.00303,0.018662,0.01308,0.35788,0.573855,0.020253,0.195401,0.023624,...,0.377689,0.336256,,,0.041339,0.035606,0.219835,0.05786,0.031608,0.579288
387,ACH-000600,0.015761,0.04499,0.009926,0.023895,0.165023,0.031257,0.029051,0.071808,0.014706,...,0.236839,0.647833,,,0.021489,0.030027,0.084744,0.032071,0.045335,0.133621
425,ACH-000658,0.087185,0.008177,0.022151,0.005423,0.035529,0.01566,0.115511,0.410175,0.011358,...,0.048619,0.183044,,,0.046237,0.034565,0.090532,0.0317,0.024452,0.348789
557,ACH-000854,0.03575,0.043327,0.010934,0.015883,0.219869,0.09741,0.041822,0.810608,0.033313,...,0.106307,0.417082,,,0.004158,0.011147,0.046325,0.047737,0.054087,0.231258


In [4]:
# Group the samples by age category and keep only the Adult and Pediatric groups
groups = sample_df.groupby("age_categories")
wanted_categories = ("Adult", "Pediatric")
df_list = [frame for label, frame in groups if label in wanted_categories]

# stack the kept groups row-wise into a single data frame
new_df = pd.concat(df_list, axis=0)
new_df

Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,...,default_growth_pattern,model_manipulation,model_manipulation_details,patient_id,parent_depmap_id,Cellosaurus_NCIt_disease,Cellosaurus_NCIt_id,Cellosaurus_issues,age_categories,age_distribution
0,ACH-000076,NCO2,NCO2,NCO2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Female,HSRRB,CVCL_3043,,...,,,,PT-Ugji7b,,"Chronic myelogenous leukemia, BCR-ABL1 positive",C3174,,Adult,35.0
1,ACH-000077,MJ,MJ,MJ_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Male,ATCC,CVCL_1414,,...,2D: suspension,,,PT-brIUSU,,Primary cutaneous T-cell non-Hodgkin lymphoma,C3467,,Adult,50.0
2,ACH-000093,Panc 05.04,PANC0504,PANC0504_PANCREAS,,,Female,ATCC,CVCL_1637,,...,2D: adherent,,,PT-aJgS51,,Pancreatic ductal adenocarcinoma,C9120,,Adult,77.0
4,ACH-000216,JH-EsoAd1,JHESOAD1,JHESOAD1_OESOPHAGUS,,,Male,Academic lab,CVCL_8098,,...,,,,PT-Y4syw7,,Barrett adenocarcinoma,C7027,,Adult,66.0
5,ACH-000288,BT-549,BT549,BT549_BREAST,,905951.0,Female,ATCC,CVCL_1092,1835.0,...,2D: adherent,,,PT-hgWgpg,,Invasive breast carcinoma of no special type,C4194,,Adult,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1073,ACH-002278,NB10,NB10,NB10_AUTONOMIC_GANGLIA,,949171.0,Male,Sanger,CVCL_1441,1033.0,...,,,,PT-1wIoRq,,Neuroblastoma,C3270,,Pediatric,2.0
1074,ACH-002280,NB13,NB13,NB13_AUTONOMIC_GANGLIA,,949177.0,Male,Sanger,CVCL_1443,215.0,...,,,,PT-pJmC9b,,Neuroblastoma,C3270,,Pediatric,1.0
1075,ACH-002282,NB17,NB17,NB17_AUTONOMIC_GANGLIA,,949175.0,Female,Sanger,CVCL_1445,159.0,...,,,,PT-KDwcNS,,Neuroblastoma,C3270,,Pediatric,1.0
1076,ACH-002283,NB5,NB5,NB5_AUTONOMIC_GANGLIA,,949176.0,Female,Sanger,CVCL_8822,2071.0,...,2D: adherent,,,PT-x400Sq,,Neuroblastoma,C3270,,Pediatric,2.0


In [5]:
# Narrow the table to the sample ID plus the sex and age-category columns
reference_columns = ["DepMap_ID", "sex", "age_categories"]
ref_df = new_df[reference_columns]

In [6]:
# Preview: rows labelled "Pediatric", renumbered from zero
ref_df[ref_df["age_categories"].eq("Pediatric")].reset_index(drop=True)

Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-000172,Male,Pediatric
1,ACH-000602,Female,Pediatric
2,ACH-001099,Female,Pediatric
3,ACH-001163,Male,Pediatric
4,ACH-001164,Male,Pediatric
...,...,...,...
116,ACH-002278,Male,Pediatric
117,ACH-002280,Male,Pediatric
118,ACH-002282,Female,Pediatric
119,ACH-002283,Female,Pediatric


In [7]:
# Every PEDIATRIC row of ref_df, with a fresh zero-based index
pediatric_mask = ref_df["age_categories"].eq("Pediatric")
bulk_pediatric_training_df = ref_df[pediatric_mask].reset_index(drop=True)
print(bulk_pediatric_training_df.shape)
bulk_pediatric_training_df.head(3)

(121, 3)


Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-000172,Male,Pediatric
1,ACH-000602,Female,Pediatric
2,ACH-001099,Female,Pediatric


In [8]:
# Every ADULT row of ref_df, with a fresh zero-based index
adult_mask = ref_df["age_categories"].eq("Adult")
bulk_adult_training_df = ref_df[adult_mask].reset_index(drop=True)
print(bulk_adult_training_df.shape)
bulk_adult_training_df.head(3)

(763, 3)


Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-000076,Female,Adult
1,ACH-000077,Male,Adult
2,ACH-000093,Female,Adult


In [9]:
# TRAINING portion of the PEDIATRIC samples: the first 85% of the rows.
# The cut-off is derived from the frame length instead of being hard-coded, so it
# follows the data if the number of samples changes (121 rows -> 103, as before).
# NOTE(review): rows are taken in their existing order, not sampled at random --
# confirm that a positional split is intended.
pediatric_train_count = round(0.85 * len(bulk_pediatric_training_df))
pre_merge_pediatric_training_df = (
    bulk_pediatric_training_df.iloc[:pediatric_train_count].reset_index(drop=True)
)
print(pre_merge_pediatric_training_df.shape)
pre_merge_pediatric_training_df.head(3)

(103, 3)


Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-000172,Male,Pediatric
1,ACH-000602,Female,Pediatric
2,ACH-001099,Female,Pediatric


In [10]:
# TESTING portion of the PEDIATRIC samples: everything after the first 85% of rows.
# The start position is derived from the frame length instead of being hard-coded
# (121 rows -> starts at 103, leaving 18 rows, as before).
# NOTE(review): rows are taken in their existing order, not sampled at random --
# confirm that a positional split is intended.
pediatric_test_start = round(0.85 * len(bulk_pediatric_training_df))
pre_merge_pediatric_testing_df = (
    bulk_pediatric_training_df.iloc[pediatric_test_start:].reset_index(drop=True)
)
print(pre_merge_pediatric_testing_df.shape)
pre_merge_pediatric_testing_df.head(3)

(18, 3)


Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-001715,Male,Pediatric
1,ACH-001735,Female,Pediatric
2,ACH-001740,Male,Pediatric


In [11]:
# TRAINING portion of the ADULT samples: the first 85% of the rows.
# The cut-off is derived from the frame length instead of being hard-coded
# (763 rows -> 649, as before). The index is reset like in the other three
# split cells so all four frames are built the same way.
# NOTE(review): rows are taken in their existing order, not sampled at random --
# confirm that a positional split is intended.
adult_train_count = round(0.85 * len(bulk_adult_training_df))
pre_merge_adult_training_df = (
    bulk_adult_training_df.iloc[:adult_train_count].reset_index(drop=True)
)
print(pre_merge_adult_training_df.shape)
pre_merge_adult_training_df.head(3)

(649, 3)


Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-000076,Female,Adult
1,ACH-000077,Male,Adult
2,ACH-000093,Female,Adult


In [12]:
# TESTING portion of the ADULT samples: everything after the first 85% of rows.
# The start position is derived from the frame length instead of being hard-coded
# (763 rows -> starts at 649, leaving 114 rows, as before).
# NOTE(review): rows are taken in their existing order, not sampled at random --
# confirm that a positional split is intended.
adult_test_start = round(0.85 * len(bulk_adult_training_df))
pre_merge_adult_testing_df = (
    bulk_adult_training_df.iloc[adult_test_start:].reset_index(drop=True)
)
print(pre_merge_adult_testing_df.shape)
pre_merge_adult_testing_df.head(3)

(114, 3)


Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-001283,Male,Adult
1,ACH-001306,Female,Adult
2,ACH-001307,Female,Adult


In [13]:
# Stack the adult and pediatric TRAINING subsets into one frame
# numbered 0..n-1
training_merge_frames = [pre_merge_adult_training_df, pre_merge_pediatric_training_df]
training_df_IDs = pd.concat(training_merge_frames, ignore_index=True)
training_df_IDs

Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-000076,Female,Adult
1,ACH-000077,Male,Adult
2,ACH-000093,Female,Adult
3,ACH-000216,Male,Adult
4,ACH-000288,Female,Adult
...,...,...,...
747,ACH-001532,Male,Pediatric
748,ACH-001603,Male,Pediatric
749,ACH-001669,Male,Pediatric
750,ACH-001674,Male,Pediatric


In [14]:
# Stack the adult and pediatric TESTING subsets into one frame
# numbered 0..n-1
testing_merge_frames = [pre_merge_adult_testing_df, pre_merge_pediatric_testing_df]
testing_df_IDs = pd.concat(testing_merge_frames, ignore_index=True)
testing_df_IDs

Unnamed: 0,DepMap_ID,sex,age_categories
0,ACH-001283,Male,Adult
1,ACH-001306,Female,Adult
2,ACH-001307,Female,Adult
3,ACH-001318,Male,Adult
4,ACH-001328,Female,Adult
...,...,...,...
127,ACH-002278,Male,Pediatric
128,ACH-002280,Male,Pediatric
129,ACH-002282,Female,Pediatric
130,ACH-002283,Female,Pediatric


In [15]:
# Report the table dimensions, then preview its first five rows
n_rows, n_columns = dependency_df.shape
print((n_rows, n_columns))
dependency_df.head(5)

(1086, 17387)


Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
0,ACH-000001,0.094568,0.012519,0.02746,0.025962,0.073412,0.02734,0.020199,0.284733,0.022084,...,0.037449,0.080585,0.034309,0.007142,0.004241,0.082956,0.012,0.003592,0.012679,0.324623
1,ACH-000004,0.012676,0.049011,0.075933,0.033215,0.013176,0.097497,0.005015,0.153166,0.007358,...,0.038768,0.230569,0.007125,0.021209,0.011203,0.060266,0.128375,0.005911,0.004645,0.04253
2,ACH-000005,0.053957,0.027968,0.010139,0.005448,0.018599,0.081636,0.005457,0.159904,0.050884,...,0.017479,0.274568,0.054525,0.033396,0.033416,0.034712,0.092832,0.012482,0.020843,0.050412
3,ACH-000007,0.026704,0.083588,0.008853,0.011299,0.027288,0.028349,0.032573,0.166503,0.047045,...,0.196233,0.615338,0.011212,0.0022,0.005432,0.035241,0.138445,0.103161,0.146222,0.274833
4,ACH-000009,0.059383,0.051826,0.01537,0.011721,0.030062,0.078373,0.042128,0.184783,0.032335,...,0.152385,0.405712,0.029011,0.002816,0.056461,0.18955,0.328064,0.035161,0.058402,0.269194


In [16]:
# DepMap IDs of the TRAINING samples that also appear in dependency_df.
# The IDs are read from training_merge_frames rather than from training_df_IDs
# itself: this cell rebinds training_df_IDs from a DataFrame to a set, so
# indexing it with ["DepMap_ID"] would raise a TypeError if the cell were run again.
training_sample_ids = pd.concat(training_merge_frames)["DepMap_ID"]
training_df_IDs = set(training_sample_ids) & set(dependency_df["DepMap_ID"])

In [17]:
# Keep the dependency rows whose DepMap_ID is among the training IDs
in_training = dependency_df["DepMap_ID"].isin(training_df_IDs)
training_df = dependency_df[in_training].reset_index(drop=True)
print(training_df.shape)
training_df.head(3)

(752, 17387)


Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
0,ACH-000001,0.094568,0.012519,0.02746,0.025962,0.073412,0.02734,0.020199,0.284733,0.022084,...,0.037449,0.080585,0.034309,0.007142,0.004241,0.082956,0.012,0.003592,0.012679,0.324623
1,ACH-000004,0.012676,0.049011,0.075933,0.033215,0.013176,0.097497,0.005015,0.153166,0.007358,...,0.038768,0.230569,0.007125,0.021209,0.011203,0.060266,0.128375,0.005911,0.004645,0.04253
2,ACH-000005,0.053957,0.027968,0.010139,0.005448,0.018599,0.081636,0.005457,0.159904,0.050884,...,0.017479,0.274568,0.054525,0.033396,0.033416,0.034712,0.092832,0.012482,0.020843,0.050412


In [18]:
# DepMap IDs of the TESTING samples that also appear in dependency_df.
# The IDs are read from testing_merge_frames rather than from testing_df_IDs
# itself: this cell rebinds testing_df_IDs from a DataFrame to a set, so
# indexing it with ["DepMap_ID"] would raise a TypeError if the cell were run again.
testing_sample_ids = pd.concat(testing_merge_frames)["DepMap_ID"]
testing_df_IDs = set(testing_sample_ids) & set(dependency_df["DepMap_ID"])

In [19]:
# Keep the dependency rows whose DepMap_ID is among the testing IDs
in_testing = dependency_df["DepMap_ID"].isin(testing_df_IDs)
testing_df = dependency_df[in_testing].reset_index(drop=True)
print(testing_df.shape)
testing_df

(132, 17387)


Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
0,ACH-001283,0.055596,0.036007,0.009858,0.028605,0.037692,0.023283,0.006003,0.085478,0.042312,...,0.023996,0.891290,0.050396,0.005770,0.060573,0.040501,0.074070,0.019591,0.074594,0.114106
1,ACH-001306,0.012228,0.018525,0.051670,0.033013,0.029040,0.054553,0.021434,0.179698,0.018969,...,0.009830,0.795730,0.081199,0.033106,0.079632,0.019996,0.114254,0.067643,0.438162,0.354005
2,ACH-001307,0.021302,0.053004,0.015499,0.012622,0.032093,0.112495,0.023954,0.289130,0.021719,...,0.060852,0.579275,0.019355,0.026294,0.011660,0.034875,0.042847,0.074701,0.023539,0.290621
3,ACH-001318,0.014458,0.361292,0.011882,0.034442,0.056600,0.035331,0.032115,0.148083,0.012011,...,0.008115,0.156073,0.027273,0.012583,0.009224,0.022506,0.374102,0.011786,0.233281,0.174486
4,ACH-001328,0.048325,0.022217,0.020063,0.156373,0.014073,0.073399,0.015388,0.120418,0.038080,...,0.305541,0.898522,0.020248,0.019331,0.036575,0.062426,0.045324,0.012496,0.123419,0.185837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,ACH-002294,0.018806,0.193885,0.023309,0.003012,0.044203,0.006119,0.004223,0.428751,0.046432,...,0.519790,0.916306,0.074183,0.017970,0.013641,0.013803,0.020374,0.004955,0.026413,0.153037
128,ACH-002295,0.040305,0.039490,0.014371,0.009585,0.100523,0.074759,0.018957,0.768867,0.008879,...,0.098210,0.816771,0.007672,0.008754,0.057405,0.044378,0.069451,0.032748,0.126714,0.178790
129,ACH-002296,0.039897,0.026515,0.043682,0.020409,0.030543,0.021150,0.025860,0.946744,0.025770,...,0.006500,0.943228,0.013630,0.015707,0.003522,0.043838,0.132901,0.007707,0.014290,0.450628
130,ACH-002297,0.012731,0.149681,0.023636,0.012174,0.160540,0.047253,0.042571,0.804064,0.016870,...,0.347257,0.666775,0.023849,0.005700,0.040823,0.018247,0.060098,0.032896,0.106162,0.090488
