In [1]:
# -------- #
# PACKAGES #
# -------- #
import time
import pickle
import torch
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt 
import seaborn           as sns

# ---------------- #
# GENERAL SETTINGS #
# ---------------- #
# Keep the seed in one named constant instead of a bare literal so it is
# visible and can be reused by any later cell that needs a seed.
SEED = 42
# Seeds PyTorch's random number generator only; NumPy is not seeded here.
torch.manual_seed(SEED)
# Plain white seaborn theme for all subsequent plots.
sns.set_theme(style="white")

# ---------------- #
# GLOBAL VARIABLES #
# ---------------- #
from config import (
    # Drug features.
    PATH_TO_SAVED_DRUG_FEATURES,
    DRUG_FPS_FINAL_FILE_NAME,
    # Cell line features.
    PATH_TO_SAVED_CL_FEATURES,
    GENE_EXPR_FINAL_FILE_NAME,
    CNV_GISTIC_FINAL_FILE_NAME,
    CNV_PICNIC_FINAL_FILE_NAME    
)

---

# Building a Baseline Model

The objective of this notebook is to investigate how to build a baseline model against which the future GNN approach can be compared. The baseline should be a deep learning model for drug response prediction (DRP) with the following general structure.

<p align="center">
    <img src="imgs/GeneralDLModelForDRP.png" alt="drawing" width="400"/>
</p>

---

## Cell Features

In [2]:
# --------------- #
# GENE EXPRESSION #
# --------------- #
# Load the final gene expression table and report its size and load time.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted during the load.
start = time.perf_counter()
gene_expr = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{GENE_EXPR_FINAL_FILE_NAME}')
print(f"Shape: {gene_expr.shape} took {time.perf_counter()-start:.5f} seconds.")
# Series.nunique() counts distinct non-missing values directly.
print(f"  Number of unique cell-lines : {gene_expr.CELL_LINE_NAME.nunique()}")
# The slice treats the first 14 columns as non-gene (metadata) columns and
# everything after them as gene columns.
# NOTE(review): this relies on column order - confirm the 14 metadata columns always come first.
print(f"  Number of gene columns      : {len(gene_expr.columns[14:])}")
gene_expr.head(3)

Shape: (446521, 922) took 2.11200 seconds.
  Number of unique cell-lines : 988
  Number of gene columns      : 908


Unnamed: 0,DRUG_ID,CELL_LINE_NAME,AUC,CELL_ID,LN_IC50,CONC,MASTER_CELL_ID,INTENSITY,DATASET,Z_SCORE,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
0,1,MC-CAR,0.982114,3137,2.395685,2.0,49,544404,GDSC1,-0.189576,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
9,1,ES3,0.984816,2366,3.140923,2.0,1342,404197,GDSC1,0.508635,...,5.99576,9.337588,7.468226,3.71627,5.363887,6.188079,12.281947,4.794624,3.588528,6.785201
27,1,ES5,0.985693,2368,3.968757,2.0,610,797378,GDSC1,1.284229,...,6.939741,8.688176,7.085349,3.688222,4.572119,6.34509,12.276166,4.114092,5.768098,7.505155


In [3]:
# --------------------- #
# COPY NUMBER VARIATION #
# --------------------- #
# Both CNV tables are loaded the same way: read the pickle, then report the
# shape, the load time, the number of distinct cell lines and the number of
# gene columns.
# time.perf_counter() is used for the timings because it is a monotonic clock
# meant for measuring intervals, unlike the wall-clock time.time().
# The `columns[14:]` slices treat the first 14 columns as non-gene (metadata)
# columns.
# NOTE(review): this relies on column order - confirm the metadata columns always come first.

# Copy number variation gistic final dataset.
start = time.perf_counter()
final_cnv_gistic = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{CNV_GISTIC_FINAL_FILE_NAME}')
print(f"CNV Gistic: {final_cnv_gistic.shape} took {time.perf_counter()-start:.5f} seconds.")
print(f"  Number of unique cell-lines : {final_cnv_gistic.CELL_LINE_NAME.nunique()}")
print(f"  Number of gene columns      : {len(final_cnv_gistic.columns[14:])}")

# Copy number variation picnic final dataset.
start = time.perf_counter()
final_cnv_picnic = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{CNV_PICNIC_FINAL_FILE_NAME}')
print(f"CNV Picnic: {final_cnv_picnic.shape} took {time.perf_counter()-start:.5f} seconds.")
print(f"  Number of unique cell-lines : {final_cnv_picnic.CELL_LINE_NAME.nunique()}")
print(f"  Number of gene columns      : {len(final_cnv_picnic.columns[14:])}")
final_cnv_picnic.head(3)

CNV Gistic: (446521, 952) took 96.52291 seconds.
  Number of unique cell-lines : 988
  Number of gene columns      : 938
CNV Picnic: (446521, 980) took 95.01999 seconds.
  Number of unique cell-lines : 988
  Number of gene columns      : 966


Unnamed: 0,MASTER_CELL_ID,CELL_ID,CONC,CELL_LINE_NAME,RMSE,AUC,INTENSITY,COSMIC_ID,DRUG_NAME,POSITION,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,49,3137,2.0,MC-CAR,0.022521,0.982114,544404,683665,Erlotinib,14,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
9,1342,2366,2.0,ES3,0.03184,0.984816,404197,684055,Erlotinib,14,...,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0
27,610,2368,2.0,ES5,0.026052,0.985693,797378,684057,Erlotinib,14,...,2.0,3.0,3.0,3.0,4.0,6.0,4.0,3.0,4.0,3.0


- All three dataframes (gene expression, CNV GISTIC, CNV PICNIC) contain the same number of unique cell lines (988).

In [23]:
# Names of the columns that are NOT gene symbols (response / metadata columns).
non_gene_cols = [
    'DRUG_ID', 'CELL_LINE_NAME', 'AUC', 'CELL_ID', 'LN_IC50', 'CONC',
    'MASTER_CELL_ID', 'INTENSITY', 'DATASET', 'Z_SCORE', 'DRUG_NAME',
    'RMSE', 'COSMIC_ID', 'POSITION'
]

# Check that, in every cell feature dataframe, the leading columns are exactly
# the non-gene columns listed above. Comparing each frame against
# `non_gene_cols` itself (rather than only comparing the frames with each
# other) also guarantees that the list used for filtering later matches the
# data. Column order within the leading block may differ between frames, so
# sets are compared.
n_non_gene = len(non_gene_cols)
expected_non_gene = set(non_gene_cols)
assert set(gene_expr.columns[:n_non_gene]) == expected_non_gene, (
    f"gene_expr: leading columns differ from non_gene_cols: "
    f"{set(gene_expr.columns[:n_non_gene]) ^ expected_non_gene}"
)
assert set(final_cnv_gistic.columns[:n_non_gene]) == expected_non_gene, (
    f"final_cnv_gistic: leading columns differ from non_gene_cols: "
    f"{set(final_cnv_gistic.columns[:n_non_gene]) ^ expected_non_gene}"
)
assert set(final_cnv_picnic.columns[:n_non_gene]) == expected_non_gene, (
    f"final_cnv_picnic: leading columns differ from non_gene_cols: "
    f"{set(final_cnv_picnic.columns[:n_non_gene]) ^ expected_non_gene}"
)

In [24]:
# Keep only the gene symbol columns of each cell feature dataframe, i.e. drop
# every column whose name is listed in `non_gene_cols`.
gene_cols__geneexpr, gene_cols__cnvgistic, gene_cols__cnvpicnic = (
    frame.loc[:, ~frame.columns.isin(non_gene_cols)]
    for frame in (gene_expr, final_cnv_gistic, final_cnv_picnic)
)

In [40]:
# Count the gene symbols that occur in every one of the three cell feature
# dataframes, then report that count as a share of each frame's gene columns.
in_all = len(
    set(gene_cols__geneexpr.columns)
    & set(gene_cols__cnvgistic.columns)
    & set(gene_cols__cnvpicnic.columns)
)
print(f"""The number of gene columns which are existent in all dataframes is:\n{in_all}
This corresponds to
    {100*in_all/len(gene_cols__geneexpr.columns):2.2f} % of the gene expression gene columns.
    {100*in_all/len(gene_cols__cnvgistic.columns):2.2f} % of the cnv gistic      gene columns.
    {100*in_all/len(gene_cols__cnvpicnic.columns):2.2f} % of the cnv picnic      gene columns.""")

The number of gene columns which are existent in all dataframes is:
870
This corresponds to
    95.81 % of the gene expression gene columns.
    92.75 % of the cnv gistic      gene columns.
    90.06 % of the cnv picnic      gene columns.


---

## Drug Features

In [3]:
# Load the pickled drug-name / fingerprint dataframe.
# pd.read_pickle is used so the drug features are loaded the same way as the
# cell-line feature tables; it opens and closes the file itself.
# NOTE(review): the file name is hard-coded here although DRUG_FPS_FINAL_FILE_NAME
# is imported from config and not used in the visible part of the notebook -
# confirm which file is intended.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
drug_name_fps = pd.read_pickle(f'{PATH_TO_SAVED_DRUG_FEATURES}drug_name_fingerprints_dataframe.pkl')
print(f"Shape: {drug_name_fps.shape}")
drug_name_fps.head(3)

Shape: (367, 257)


Unnamed: 0,drug_name,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
0,(5Z)-7-Oxozeaenol,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
1,5-Fluorouracil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,A-443654,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


The encoded drug feature dataset contains 367 unique drug names, each with a 256-bit Morgan fingerprint (one column per bit, plus the `drug_name` column).