In [7]:
%reset

In [9]:
import sys
import time
import os
import gzip
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import torch
import torch.nn as nn

from io import BytesIO
from time import sleep 
from tqdm import tqdm

from config import (
    PATH_TO_FEATURES    
)

# Use seaborn's "white" theme for the plots created after this point.
sns.set_theme(style="white")

# Report the installed PyTorch version and whether CUDA is available.
print(torch.__version__)
print(torch.cuda.is_available())

1.11.0
False


---

### Loading the tabular datasets

In [10]:
# -------------------------------------- #
# Read cell-line - gene feature datasets #
# -------------------------------------- #
READ = True


def _load_feature_pickle(file_name: str):
    """Unpickle and return the object stored in `file_name` under PATH_TO_FEATURES.

    The path is built with `os.path.join`, so it resolves correctly whether or
    not PATH_TO_FEATURES ends with a path separator.

    Parameters
    ----------
    file_name : str
        Name of the pickle file inside the PATH_TO_FEATURES directory.

    Returns
    -------
    The unpickled object.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only read files that were produced by a trusted pipeline.
    with open(os.path.join(PATH_TO_FEATURES, file_name), 'rb') as f:
        return pickle.load(f)


if READ:
    start = time.time()
    gexpr = _load_feature_pickle('gexpr_sparse.pkl')
    cnvg = _load_feature_pickle('cnvg_sparse.pkl')
    cnvp = _load_feature_pickle('cnvp_sparse.pkl')
    mut = _load_feature_pickle('mut_sparse.pkl')
    print(f"Took {time.time()-start:.5f} seconds to read the cell-line feature datasets.")
    print(f"""Shapes
    Gene Expression : {gexpr.shape}
    CNV Gistic      : {cnvg.shape}
    CNV Picnic      : {cnvp.shape}
    Mutation        : {mut.shape}
    """)

# ----------------------------------------- #
# Read cell-line, drug and ln(IC50) dataset #
# ----------------------------------------- #
if READ:
    start = time.time()
    drug_responses = _load_feature_pickle('drugs_sparse.pkl')
    print(f"Took {time.time()-start:.5f} seconds to read cell-line-drug-ic50 dataset.")
    print(f"Shape: {drug_responses.shape}")

Took 0.01344 seconds to read the cell-line feature datasets.
Shapes
    Gene Expression : (983, 859)
    CNV Gistic      : (983, 859)
    CNV Picnic      : (983, 859)
    Mutation        : (983, 859)
    
Took 0.03011 seconds to read cell-line-drug-ic50 dataset.
Shape: (310904, 4)


In [22]:
def get_uniqs(df: pd.DataFrame, col: str):
    """Return the distinct values of column `col` in `df` as a sorted list.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to read the column from.
    col : str
        Name of the column whose distinct values are wanted.

    Returns
    -------
    list
        The unique values of `df[col]`, in the sorted order produced by
        `np.unique`, converted to a plain Python list.
    """
    column_values = df[col].values
    distinct = np.unique(column_values)
    return distinct.tolist()

# Test that all feature datasets contain exactly the same cell-lines.
gexpr_cls = get_uniqs(gexpr, 'CELL_LINE_NAME')
cnvg_cls = get_uniqs(cnvg, 'CELL_LINE_NAME')
cnvp_cls = get_uniqs(cnvp, 'CELL_LINE_NAME')
mut_cls = get_uniqs(mut, 'CELL_LINE_NAME')
# Every dataset has to take part in the intersection. If one is left out,
# a dataset with the right *number* of cell-lines but different names would
# still pass the length comparison below.
inter_cls = set(gexpr_cls) \
    .intersection(set(cnvg_cls)) \
    .intersection(set(cnvp_cls)) \
    .intersection(set(mut_cls))

# Each list holds distinct values only, so the intersection having the same
# length as every list means all four lists contain the same names.
assert len(inter_cls) == len(gexpr_cls) == len(cnvg_cls) == len(cnvp_cls) == len(mut_cls), \
    "Not all feature datasets contain the exact same cell-lines as rows!"

# Test that all feature datasets contain exactly the same gene symbols.
# The distinct column names are computed once per dataset and reused for both
# the intersection and the length comparison.
gexpr_genes = np.unique(gexpr.columns.values).tolist()
cnvg_genes = np.unique(cnvg.columns.values).tolist()
cnvp_genes = np.unique(cnvp.columns.values).tolist()
mut_genes = np.unique(mut.columns.values).tolist()
inter_genes = set(gexpr_genes) \
    .intersection(set(cnvg_genes)) \
    .intersection(set(cnvp_genes)) \
    .intersection(set(mut_genes))

assert len(inter_genes) == \
    len(gexpr_genes) == \
    len(cnvg_genes) == \
    len(cnvp_genes) == \
    len(mut_genes), \
        "Not all feature datasets contain the exact same gene symbols as columns!"

# Drop the temporaries so they do not linger in the kernel namespace.
del gexpr_cls, cnvg_cls, cnvp_cls, mut_cls, inter_cls, inter_genes
del gexpr_genes, cnvg_genes, cnvp_genes, mut_genes

In [11]:
# Preview the first five rows of the gene expression feature table.
gexpr.head(5)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,7.023759,6.067534,4.31875,3.261427,6.297582,8.313991,5.514912,10.594112,5.222366,...,7.821536,3.601622,3.225596,3.651201,7.895763,3.953414,4.059382,4.376822,3.215209,9.267565
1,23132-87,6.714387,5.695096,4.536146,3.295886,7.021037,8.50008,4.862145,10.609245,6.528668,...,8.094289,3.596762,3.486299,3.127452,7.852436,3.869411,4.248318,4.989945,4.328643,9.51587
2,42-MG-BA,7.752402,5.475753,4.033714,3.176525,7.279671,8.013367,4.957332,11.266705,7.445954,...,7.984052,3.317746,5.106906,5.305024,6.508066,7.840349,8.632889,4.792137,3.078971,8.495921
3,451Lu,6.518083,5.46252,4.436039,3.241076,6.657767,7.085021,8.199066,10.550334,5.83756,...,6.970153,3.69546,4.947532,5.198087,7.361412,3.687171,5.965388,4.885217,3.849932,9.726323
4,5637,6.855088,5.980778,4.382524,6.086206,7.423409,8.12018,5.212472,10.329122,7.212325,...,7.068021,3.386432,6.835862,6.190242,7.431739,3.652418,7.088577,4.76185,3.163742,9.091481


In [12]:
# Preview the first five rows of the cell-line / drug / ln(IC50) table.
drug_responses.head(5)

Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DATASET,LN_IC50
190089,201T,133,GDSC1,-3.770673
198783,201T,134,GDSC1,-0.81418
207405,201T,135,GDSC1,-0.29805
216171,201T,136,GDSC1,-4.472378
224883,201T,140,GDSC1,-5.332884


## Build graph

In [3]:
# Every cell-line graph should have the same structure, so only the cell-lines
# and genes that occur in all three feature datasets are used.
# NOTE(review): gene_expr, cnv_gistic and cnv_picnic are not defined in the
# cells visible in this notebook - confirm where they are loaded.
intersection_cell_lines = (
    set(np.unique(gene_expr.CELL_LINE_NAME.values).tolist())
    & set(np.unique(cnv_gistic.CELL_LINE_NAME.values).tolist())
    & set(np.unique(cnv_picnic.CELL_LINE_NAME.values).tolist())
)
print(f"There are {len(intersection_cell_lines):4.0f} intersecting cell lines in the 3 datasets.")

# Gene columns are taken from position 14 onwards.
# presumably the first 14 columns hold non-gene metadata - TODO confirm.
intersection_genes = (
    set(gene_expr.columns[14:])
    & set(cnv_gistic.columns[14:])
    & set(cnv_picnic.columns[14:])
)
print(f"There are {len(intersection_genes):4.0f} intersecting genes in the 3 datasets.")

There are  988 intersecting cell lines in the 3 datasets.
There are  870 intersecting genes in the 3 datasets.


In [7]:
# Keep one row per cell-line: the first row of each CELL_LINE_NAME group.
# Per the original author, the repeated rows of a cell-line carry the same
# feature values and differ only in the tested drug (and hence the IC50).
gene_expr_v2 = (
    gene_expr
    .groupby(['CELL_LINE_NAME'])
    .first()
    .reset_index()
)
cnv_gistic_v2 = (
    cnv_gistic
    .groupby(['CELL_LINE_NAME'])
    .first()
    .reset_index()
)
cnv_picnic_v2 = (
    cnv_picnic
    .groupby(['CELL_LINE_NAME'])
    .first()
    .reset_index()
)

print(f"""
    Shape after removing duplicate feature value rows for...
        ... gene expression : {gene_expr_v2.shape}
        ... cnv gistic      : {cnv_gistic_v2.shape}
        ... cnv picnic      : {cnv_picnic_v2.shape}
""")

# Restrict every table to its first 14 columns plus the genes shared by all
# feature datasets.
# presumably the first 14 columns hold non-gene metadata - TODO confirm.
gene_expr_v3 = gene_expr_v2[[*gene_expr_v2.columns[:14], *intersection_genes]]
cnv_gistic_v3 = cnv_gistic_v2[[*cnv_gistic_v2.columns[:14], *intersection_genes]]
cnv_picnic_v3 = cnv_picnic_v2[[*cnv_picnic_v2.columns[:14], *intersection_genes]]
print(f"""
    Shape after only taking the intersection genes...
        ... gene expression : {gene_expr_v3.shape}
        ... cnv gistic      : {cnv_gistic_v3.shape}
        ... cnv picnic      : {cnv_picnic_v3.shape}
""")

# Each table must now hold exactly one row per distinct cell-line name.
assert len(gene_expr_v3) == len(np.unique(gene_expr_v3['CELL_LINE_NAME']))
assert len(cnv_gistic_v3) == len(np.unique(cnv_gistic_v3['CELL_LINE_NAME']))
assert len(cnv_picnic_v3) == len(np.unique(cnv_picnic_v3['CELL_LINE_NAME']))

# Distinct cell-line names, taken from the gene expression table.
uniq_cell_line_names = list(np.unique(gene_expr_v3['CELL_LINE_NAME']))
print(f"Number of unique cell-line names: {len(uniq_cell_line_names)}")


    Shape after removing duplicate feature value rows for...
        ... gene expression : (988, 922)
        ... cnv gistic      : (988, 952)
        ... cnv picnic      : (988, 980)


    Shape after only taking the intersection genes...
        ... gene expression : (988, 884)
        ... cnv gistic      : (988, 884)
        ... cnv picnic      : (988, 884)

Number of unique cell-line names: 988
