In [1]:
import sys
import time
import os
import gzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import torch
import torch.nn as nn

from io import BytesIO
from time import sleep 
from tqdm import tqdm

from config import (
    # Drug features.
    PATH_TO_SAVED_DRUG_FEATURES,
    DRUG_FPS_FINAL_FILE_NAME,
    # Cell line features.
    PATH_TO_SAVED_CL_FEATURES,
    GENE_EXPR_FINAL_FILE_NAME,
    CNV_GISTIC_FINAL_FILE_NAME,
    CNV_PICNIC_FINAL_FILE_NAME    
)

# Use seaborn's "white" style as the plotting theme for the notebook.
sns.set_theme(style="white")

# Report the installed PyTorch version and whether CUDA is available,
# one value per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

1.11.0
False


---

# Model using GNN for Cell Encoding

To build a model that uses a GNN for the cell-line encoding, a suitable dataset in a matching format is needed. The drug dataset can stay as it is for now.

In [2]:
# ----------------------- #
# IMPORT FEATURE DATASETS #
# ----------------------- #
# Number of leading non-gene (metadata) columns in every feature table; all
# columns from this position onwards are counted as gene columns.
N_METADATA_COLUMNS = 14


def _load_feature_table(label, file_name):
    """Load one pickled cell-line feature table and print a short summary.

    Parameters
    ----------
    label : str
        Human-readable dataset name used as the prefix of the printed summary.
    file_name : str
        File name that is appended to `PATH_TO_SAVED_CL_FEATURES`.

    Returns
    -------
    The object returned by `pd.read_pickle`; it is used as a DataFrame with a
    `CELL_LINE_NAME` column.
    """
    start = time.time()
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were produced by this project's own preprocessing.
    features = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{file_name}')
    elapsed = time.time() - start  # Time spent reading the file only.
    print(f"""{label}: {features.shape} took {elapsed:.5f} seconds.
    Number of unique cell-lines : {len(np.unique(features.CELL_LINE_NAME.values))}
    Number of gene columns      : {len(features.columns[N_METADATA_COLUMNS:])}
""")
    return features


# Gene expression.
gene_expr = _load_feature_table("Gene Expression", GENE_EXPR_FINAL_FILE_NAME)

# CNV gistic.
cnv_gistic = _load_feature_table("CNV Gistic", CNV_GISTIC_FINAL_FILE_NAME)

# CNV picnic.
cnv_picnic = _load_feature_table("CNV Picnic", CNV_PICNIC_FINAL_FILE_NAME)

Gene Expression: (446521, 922) took 3.51627 seconds.
    Number of unique cell-lines : 988
    Number of gene columns      : 908

CNV Gistic: (446521, 952) took 110.35721 seconds.
    Number of unique cell-lines : 988
    Number of gene columns      : 938

CNV Picnic: (446521, 980) took 117.35387 seconds.
    Number of unique cell-lines : 988
    Number of gene columns      : 966



In [3]:
# Every cell-line graph must have exactly the same structure, so only the
# cell-lines and genes that occur in all three feature tables are usable.

# Cell-lines present in each table.
cell_lines_expr, cell_lines_gistic, cell_lines_picnic = (
    set(np.unique(table.CELL_LINE_NAME.values).tolist())
    for table in (gene_expr, cnv_gistic, cnv_picnic)
)
intersection_cell_lines = cell_lines_expr & cell_lines_gistic & cell_lines_picnic
print(f"There are {len(intersection_cell_lines):4.0f} intersecting cell lines in the 3 datasets.")

# Gene columns present in each table (everything after the first 14 columns).
genes_expr, genes_gistic, genes_picnic = (
    set(table.columns[14:])
    for table in (gene_expr, cnv_gistic, cnv_picnic)
)
intersection_genes = genes_expr & genes_gistic & genes_picnic
print(f"There are {len(intersection_genes):4.0f} intersecting genes in the 3 datasets.")

There are  988 intersecting cell lines in the 3 datasets.
There are  870 intersecting genes in the 3 datasets.


In [7]:
# Number of leading non-gene (metadata) columns in every feature table; the
# gene columns start at this position.
N_METADATA_COLUMNS = 14

# Keep a single row per cell-line: the feature values repeat for every drug
# tested on a cell-line, only the IC50 value differs between those rows.
# NOTE(review): `GroupBy.first()` picks the first non-null value of each
# column within a group, so this equals "the first row" only when the
# duplicated rows contain no missing values — TODO confirm.
gene_expr_v2 = gene_expr.groupby(['CELL_LINE_NAME']).first().reset_index()
cnv_gistic_v2 = cnv_gistic.groupby(['CELL_LINE_NAME']).first().reset_index()
cnv_picnic_v2 = cnv_picnic.groupby(['CELL_LINE_NAME']).first().reset_index()

print(f"""
    Shape after removing duplicate feature value rows for...
        ... gene expression : {gene_expr_v2.shape}
        ... cnv gistic      : {cnv_gistic_v2.shape}
        ... cnv picnic      : {cnv_picnic_v2.shape}
""")

# `intersection_genes` is a set, whose iteration order for strings changes
# between Python processes (hash randomization). Fix one explicit order so the
# gene columns are identical in all three tables and reproducible across kernel
# restarts. `key=str` keeps the sort well-defined even for non-string labels.
ordered_intersection_genes = sorted(intersection_genes, key=str)


def _keep_intersection_genes(features):
    """Return `features` restricted to its metadata columns plus the shared genes.

    The first `N_METADATA_COLUMNS` columns are kept in their existing order,
    followed by the genes in `ordered_intersection_genes`.
    """
    metadata_columns = list(features.columns[:N_METADATA_COLUMNS])
    return features[metadata_columns + ordered_intersection_genes]


# Only take the intersection genes which are in all feature datasets.
gene_expr_v3 = _keep_intersection_genes(gene_expr_v2)
cnv_gistic_v3 = _keep_intersection_genes(cnv_gistic_v2)
cnv_picnic_v3 = _keep_intersection_genes(cnv_picnic_v2)
print(f"""
    Shape after only taking the intersection genes...
        ... gene expression : {gene_expr_v3.shape}
        ... cnv gistic      : {cnv_gistic_v3.shape}
        ... cnv picnic      : {cnv_picnic_v3.shape}
""")

# Assert that all cell-line rows are unique.
for table_name, table in (
    ("gene expression", gene_expr_v3),
    ("cnv gistic", cnv_gistic_v3),
    ("cnv picnic", cnv_picnic_v3),
):
    assert table.CELL_LINE_NAME.is_unique, \
        f"Duplicate cell-line rows found in the {table_name} table."

# Unique cell-line names.
uniq_cell_line_names = list(np.unique(gene_expr_v3.CELL_LINE_NAME))
print(f"Number of unique cell-line names: {len(uniq_cell_line_names)}")


    Shape after removing duplicate feature value rows for...
        ... gene expression : (988, 922)
        ... cnv gistic      : (988, 952)
        ... cnv picnic      : (988, 980)


    Shape after only taking the intersection genes...
        ... gene expression : (988, 884)
        ... cnv gistic      : (988, 884)
        ... cnv picnic      : (988, 884)

Number of unique cell-line names: 988
