In [2]:
# -------- #
# PACKAGES #
# -------- #
import time
import pickle
import torch
import torch.nn          as nn
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt 
import seaborn           as sns

from torch.utils.data import Dataset, DataLoader

# ---------------- #
# GENERAL SETTINGS #
# ---------------- #
torch.manual_seed(42)
sns.set_theme(style="white")

# ---------------- #
# GLOBAL VARIABLES #
# ---------------- #
from config import (
    # Drug features.
    PATH_TO_SAVED_DRUG_FEATURES,
    DRUG_FPS_FINAL_FILE_NAME,
    # Cell line features.
    PATH_TO_SAVED_CL_FEATURES,
    GENE_EXPR_FINAL_FILE_NAME,
    CNV_GISTIC_FINAL_FILE_NAME,
    CNV_PICNIC_FINAL_FILE_NAME    
)

---

# Building a Baseline Model

The objective of this notebook is to research on how to build a baseline model to test the future GNN approach against. It should be a Deep Learning model for Drug Response Prediction with the following general structure.

<p align="center">
    <img src="imgs/GeneralDLModelForDRP.png" alt="drawing" width="400"/>
</p>

---

## Cell Features

In [3]:
# --------------- #
# GENE EXPRESSION #
# --------------- #
start = time.time()
gene_expr = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{GENE_EXPR_FINAL_FILE_NAME}')
print(f"Shape: {gene_expr.shape} took {time.time()-start:.5f} seconds.")
print(f"  Number of unique cell-lines : {len(np.unique(gene_expr.CELL_LINE_NAME.values))}")
print(f"  Number of gene columns      : {len(gene_expr.columns[14:])}")
gene_expr.head(3)

Shape: (446521, 922) took 4.11994 seconds.
  Number of unique cell-lines : 988
  Number of gene columns      : 908


Unnamed: 0,DRUG_ID,CELL_LINE_NAME,AUC,CELL_ID,LN_IC50,CONC,MASTER_CELL_ID,INTENSITY,DATASET,Z_SCORE,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
0,1,MC-CAR,0.982114,3137,2.395685,2.0,49,544404,GDSC1,-0.189576,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
9,1,ES3,0.984816,2366,3.140923,2.0,1342,404197,GDSC1,0.508635,...,5.99576,9.337588,7.468226,3.71627,5.363887,6.188079,12.281947,4.794624,3.588528,6.785201
27,1,ES5,0.985693,2368,3.968757,2.0,610,797378,GDSC1,1.284229,...,6.939741,8.688176,7.085349,3.688222,4.572119,6.34509,12.276166,4.114092,5.768098,7.505155


In [4]:
gene_expr[gene_expr.CELL_LINE_NAME=='MC-CAR'].head(5)

Unnamed: 0,DRUG_ID,CELL_LINE_NAME,AUC,CELL_ID,LN_IC50,CONC,MASTER_CELL_ID,INTENSITY,DATASET,Z_SCORE,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
0,1,MC-CAR,0.982114,3137,2.395685,2.0,49,544404,GDSC1,-0.189576,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
4563,3,MC-CAR,0.980891,3137,-0.658244,0.1,49,558232,GDSC1,0.806489,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
8613,5,MC-CAR,0.903979,3137,2.161095,8.0,49,333014,GDSC1,-0.206663,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
12843,6,MC-CAR,0.986077,3137,2.613997,2.0,49,635581,GDSC1,-0.137,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
17478,9,MC-CAR,0.937027,3137,0.530615,1.0,49,423727,GDSC1,0.410332,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366


- Note that for the same `CELL_LINE_NAME` the gene expression value is the same.
- It only differs for a different `DRUG_ID` for that specific cell line.
- The `LN_IC50` is therefore different for each different cell-line - drug tuple.

Thus, 
- each row has different ic50 value
- not each row has a different value per gene column

In [5]:
# --------------------- #
# COPY NUMBER VARIATION #
# --------------------- #
# Copy number variation gistic final dataset.
start = time.time()
final_cnv_gistic = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{CNV_GISTIC_FINAL_FILE_NAME}')
print(f"CNV Gistic: {final_cnv_gistic.shape} took {time.time()-start:.5f} seconds.")
print(f"  Number of unique cell-lines : {len(np.unique(final_cnv_gistic.CELL_LINE_NAME.values))}")
print(f"  Number of gene columns      : {len(final_cnv_gistic.columns[14:])}")

# Copy number variation picnic final dataset.
start = time.time()
final_cnv_picnic = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{CNV_PICNIC_FINAL_FILE_NAME}')
print(f"CNV Picnic: {final_cnv_picnic.shape} took {time.time()-start:.5f} seconds.")
print(f"  Number of unique cell-lines : {len(np.unique(final_cnv_picnic.CELL_LINE_NAME.values))}")
print(f"  Number of gene columns      : {len(final_cnv_picnic.columns[14:])}")
final_cnv_picnic.head(3)

CNV Gistic: (446521, 952) took 129.31703 seconds.
  Number of unique cell-lines : 988
  Number of gene columns      : 938
CNV Picnic: (446521, 980) took 143.21013 seconds.
  Number of unique cell-lines : 988
  Number of gene columns      : 966


Unnamed: 0,MASTER_CELL_ID,CELL_ID,CONC,CELL_LINE_NAME,RMSE,AUC,INTENSITY,COSMIC_ID,DRUG_NAME,POSITION,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,49,3137,2.0,MC-CAR,0.022521,0.982114,544404,683665,Erlotinib,14,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
9,1342,2366,2.0,ES3,0.03184,0.984816,404197,684055,Erlotinib,14,...,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0
27,610,2368,2.0,ES5,0.026052,0.985693,797378,684057,Erlotinib,14,...,2.0,3.0,3.0,3.0,4.0,6.0,4.0,3.0,4.0,3.0


- All dataframes have the same number of unique cell-lines.

In [6]:
non_gene_cols = [
    'DRUG_ID', 'CELL_LINE_NAME', 'AUC', 'CELL_ID', 'LN_IC50', 'CONC',
    'MASTER_CELL_ID', 'INTENSITY', 'DATASET', 'Z_SCORE', 'DRUG_NAME',
    'RMSE', 'COSMIC_ID', 'POSITION'
]
# Check if all non-gene symbol columns are the same.
assert len(non_gene_cols) == len(list(set(gene_expr.columns[:14]).intersection(set(final_cnv_gistic.columns[:14])).intersection(set(final_cnv_picnic.columns[:14]))))
assert set(gene_expr.columns[:14]) == set(final_cnv_gistic.columns[:14]) == set(final_cnv_picnic.columns[:14])

In [7]:
# Only select the gene symbol columns.
gene_cols__geneexpr = gene_expr.loc[:, ~gene_expr.columns.isin(non_gene_cols)]
gene_cols__cnvgistic = final_cnv_gistic.loc[:, ~final_cnv_gistic.columns.isin(non_gene_cols)]
gene_cols__cnvpicnic = final_cnv_picnic.loc[:, ~final_cnv_picnic.columns.isin(non_gene_cols)]

In [8]:
# Gene symbols which are in all cell feature dataframes.
in_all = set(gene_cols__geneexpr.columns).intersection(\
         set(gene_cols__cnvgistic.columns)).intersection(\
         set(gene_cols__cnvpicnic.columns))
len_in_all = len(in_all)
print(f"""The number of gene columns which are existent in all dataframes is:\n{len_in_all}
This corresponds to
    {100*len_in_all/len(gene_cols__geneexpr.columns):2.2f} % of the gene expression gene columns.
    {100*len_in_all/len(gene_cols__cnvgistic.columns):2.2f} % of the cnv gistic      gene columns.
    {100*len_in_all/len(gene_cols__cnvpicnic.columns):2.2f} % of the cnv picnic      gene columns.""")             

The number of gene columns which are existent in all dataframes is:
870
This corresponds to
    95.81 % of the gene expression gene columns.
    92.75 % of the cnv gistic      gene columns.
    90.06 % of the cnv picnic      gene columns.


In [9]:
# Only select the gene columns which are in all dataframes. 
keep = ['CELL_LINE_NAME']
gene_expr_v2 = gene_expr[keep + list(in_all)]
cnv_gistic_v2 = final_cnv_gistic[keep + list(in_all)]
cnv_picnic_v2 = final_cnv_picnic[keep + list(in_all)]

print(f"""After selecting only the genes which are existent in all dataset the shapes are:
    gene expr  : {gene_expr_v2.shape}
    cnv gistic : {cnv_gistic_v2.shape}
    cnv picnic : {cnv_picnic_v2.shape}
""")
gene_expr_v2.head(3)

After selecting only the genes which are existent in all dataset the shapes are:
    gene expr  : (446521, 871)
    cnv gistic : (446521, 871)
    cnv picnic : (446521, 871)



Unnamed: 0,CELL_LINE_NAME,DAG1,CREG1,DNAJB1,PIK3C3,DLD,FBXL12,PAK6,NR2F6,APPBP2,...,C5,MPC2,CDC45,VPS28,GSTZ1,GDPD5,WDTC1,HMGCS1,PIN1,IDE
0,MC-CAR,4.081657,5.677272,6.180353,8.571591,10.334329,7.50524,3.069739,3.299431,6.125682,...,4.049017,9.057813,6.903852,8.363889,6.544324,3.830996,3.461435,7.187368,5.498214,7.723952
9,ES3,4.19645,7.051789,5.371774,8.33016,10.126326,6.338445,3.057186,3.981936,7.588107,...,5.373765,8.097682,5.208866,9.77604,6.275633,3.354604,3.36688,5.926742,5.984129,6.754155
27,ES5,4.37125,7.685916,6.476477,8.960452,10.018826,6.803424,3.761746,4.115502,6.899499,...,3.907119,8.554381,6.21266,9.897498,4.892096,3.784822,3.413021,7.650277,5.590448,6.349301


In [10]:
# Only select one row per unique cell-line, since the feature values are the same anyways.
gene_expr_v3 = gene_expr_v2.drop_duplicates()  # same as .groupby('CELL_LINE_NAME').first()
cnv_gistic_v3 = cnv_gistic_v2.drop_duplicates()  # same as .groupby('CELL_LINE_NAME').first()
cnv_picnic_v3 = cnv_picnic_v2.drop_duplicates()  # same as .groupby('CELL_LINE_NAME').first()
print(f"""Shapes after removing duplicate cell-lines:
    gene expr  : {gene_expr_v3.shape}
    cnv gistic : {cnv_gistic_v3.shape}
    cnv picnic : {cnv_picnic_v3.shape}
""")

Shapes after removing duplicate cell-lines:
    gene expr  : (988, 871)
    cnv gistic : (988, 871)
    cnv picnic : (988, 871)



In [11]:
# Sort dataframes by cell-line column.
gene_expr_v4 = gene_expr_v3.sort_values(['CELL_LINE_NAME'])
cnv_gistic_v4 = cnv_gistic_v3.sort_values(['CELL_LINE_NAME'])
cnv_picnic_v4 = cnv_picnic_v3.sort_values(['CELL_LINE_NAME'])

# Check element-wise if the cell-line columns are exactly the same.
assert (gene_expr_v4.CELL_LINE_NAME == cnv_gistic_v4.CELL_LINE_NAME).all()
assert (cnv_gistic_v4.CELL_LINE_NAME == cnv_picnic_v4.CELL_LINE_NAME).all()

print(f"""Shapes after by the cell-line column:
    gene expr  : {gene_expr_v4.shape}
    cnv gistic : {cnv_gistic_v4.shape}
    cnv picnic : {cnv_picnic_v4.shape}
""")
gene_expr_v4.head(3)

Shapes after by the cell-line column:
    gene expr  : (988, 871)
    cnv gistic : (988, 871)
    cnv picnic : (988, 871)



Unnamed: 0,CELL_LINE_NAME,DAG1,CREG1,DNAJB1,PIK3C3,DLD,FBXL12,PAK6,NR2F6,APPBP2,...,C5,MPC2,CDC45,VPS28,GSTZ1,GDPD5,WDTC1,HMGCS1,PIN1,IDE
190089,201T,5.176508,10.016432,6.45321,6.949563,11.018893,6.608961,3.270474,4.646564,6.793711,...,8.544732,10.659183,3.837447,8.536009,4.919427,3.377033,3.31014,8.576079,4.915888,7.577092
188514,22RV1,4.531809,7.914579,6.339245,7.834045,9.595623,7.023759,4.668833,3.993249,6.527064,...,8.655075,11.065614,6.734605,8.691275,6.692639,3.462619,3.187604,8.353595,6.067534,6.678702
188316,23132-87,4.742977,9.960903,7.649852,8.561532,10.189863,6.714387,3.3496,5.553722,6.268291,...,5.834866,9.76639,4.359462,10.425257,6.662953,4.402,3.390102,6.73493,5.695096,7.719844


Now all datasets have a feature value for the specific gene (column) and cell-line (row).

$$
\left [ \begin{array}{ccccccc} 
           & g_1         & g_2         & \cdots & g_i         & \cdots & g_n \\
    c_1    & \begin{bmatrix} v_{ge}^{11} \\ v_{cg}^{11} \\ v_{cp}^{11} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{12} \\ v_{cg}^{12} \\ v_{cp}^{12} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{1i} \\ v_{cg}^{1i} \\ v_{cp}^{1i} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{1n} \\ v_{cg}^{1n} \\ v_{cp}^{1n} \end{bmatrix} 
           \\ 
    c_2    & \begin{bmatrix} v_{ge}^{21} \\ v_{cg}^{21} \\ v_{cp}^{21} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{22} \\ v_{cg}^{22} \\ v_{cp}^{22} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{2i} \\ v_{cg}^{2i} \\ v_{cp}^{2i} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{2n} \\ v_{cg}^{2n} \\ v_{cp}^{2n} \end{bmatrix} 
           \\ 
    \vdots & \vdots      & \vdots      & \ddots & \vdots      & \ddots & \vdots \\
    c_j    & \begin{bmatrix} v_{ge}^{j1} \\ v_{cg}^{j1} \\ v_{cp}^{j1} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{j2} \\ v_{cg}^{j2} \\ v_{cp}^{j2} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{ji} \\ v_{cg}^{ji} \\ v_{cp}^{ji} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{jn} \\ v_{cg}^{jn} \\ v_{cp}^{jn} \end{bmatrix} 
           \\   
    \vdots & \vdots      & \vdots      & \ddots & \vdots      & \ddots & \vdots \\
    c_m    & \begin{bmatrix} v_{ge}^{m1} \\ v_{cg}^{m1} \\ v_{cp}^{m1} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{m2} \\ v_{cg}^{m2} \\ v_{cp}^{m2} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{mi} \\ v_{cg}^{mi} \\ v_{cp}^{mi} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{mn} \\ v_{cg}^{mn} \\ v_{cp}^{mn} \end{bmatrix} 
           \\            
\end{array} \right ]
$$
where 
$$
       c_j := \text{cell line} \; j \\
       g_i := \text{gene} \; i \\
       v_{ge}^{ji} := \text{Gene Expression value of gene} \; i \; \text{ in cell line} \; j \\
       v_{cg}^{ji} := \text{Gistic Copy Number value of gene} \; i \; \text{ in cell line} \; j \\
       v_{cp}^{ji} := \text{Picnic Copy Number value of gene} \; i \; \text{ in cell line} \; j \\
       
$$

- One example is for one cell-line.
- we have the same number of features and genes per cell-line.
- in total we have 446,521 examples we can use to train the model
- out of which 988 are unique cell-lines.

In [12]:
"""
1st dim = features
2nd dim = cell-lines
3rd dim = genes
"""
gene_tensor = torch.tensor([
    gene_expr_v4.loc[:, gene_expr_v4.columns != 'CELL_LINE_NAME'].to_numpy(dtype=np.float64), 
    cnv_gistic_v4.loc[:, cnv_gistic_v4.columns != 'CELL_LINE_NAME'].to_numpy(dtype=np.float64), 
    cnv_picnic_v4.loc[:, cnv_picnic_v4.columns != 'CELL_LINE_NAME'].to_numpy(dtype=np.float64)
])
print(f"Gene tensor shape: {gene_tensor.shape}")
gene_tensor

Gene tensor shape: torch.Size([3, 988, 870])


  gene_tensor = torch.tensor([


tensor([[[ 5.1765, 10.0164,  6.4532,  ...,  8.5761,  4.9159,  7.5771],
         [ 4.5318,  7.9146,  6.3392,  ...,  8.3536,  6.0675,  6.6787],
         [ 4.7430,  9.9609,  7.6499,  ...,  6.7349,  5.6951,  7.7198],
         ...,
         [ 4.3330, 10.4536,  6.2461,  ...,  9.2214,  4.8239,  8.1262],
         [ 5.1371,  7.5222,  6.8732,  ...,  8.7492,  6.0484,  7.3900],
         [ 4.8598,  7.7326,  6.5240,  ...,  6.5641,  5.1952,  6.8858]],

        [[ 1.0000,  1.0000, -1.0000,  ...,  1.0000, -1.0000,  0.0000],
         [ 0.0000,  1.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  1.0000, -1.0000,  ...,  0.0000, -1.0000,  0.0000],
         ...,
         [ 0.0000,  1.0000, -1.0000,  ...,  0.0000, -1.0000,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000, -1.0000],
         [ 1.0000,  0.0000, -1.0000,  ...,  2.0000, -1.0000,  0.0000]],

        [[ 3.0000,  5.0000,  3.0000,  ...,  5.0000,  3.0000,  3.0000],
         [ 2.0000,  3.0000,  2.0000,  ...,  2

__Note__:
- 1st Dimension: (the Features)
  - `gene_tensor[0]`: all cell-line genes values for the Gene Expression feature.
  - `gene_tensor[1]`: all cell-line genes values for the CNV Gistic feature.
  - `gene_tensor[2]`: all cell-line genes values for the CNV Picnic feature. 
- 2nd Dimension: (the cell-lines)
  - `gene_tensor[0][0]`: all Gene Expresion values for all genes for the first cell-line _201T_.
  - `gene_tensor[1][0]`: all CNV Gistic values for all genes for the first cell-line _201T_.
  - `gene_tensor[2][0]`: all CNV Picnic values for all genes for the first cell-line _201T_.
- 3rd Dimension: (the genes)
  - `gene_tensor[0][0][0]`: the Gene Expression value for the first cell-line _201T_ and the first gene _RRP1B_.
  - `gene_tensor[1][0][0]`: the CNV Gistic value for the first cell-line _201T_ and the first gene _RRP1B_.
  - `gene_tensor[2][0][0]`: the CNV Picnic value for the first cell-line _201T_ and the first gene _RRP1B_.

Each 2nd dimension sample (cell-line) is a training/testing example for the model.

In [112]:
# An example cell-line
CELL_LINE_INDEX = 5
ex1 = gene_tensor[:, CELL_LINE_INDEX, :]
ex1_flattened = ex1.flatten()
print(f"""
Before flattening : {ex1.shape}
After  flattening : {ex1_flattened.shape}
""")


Before flattening : torch.Size([3, 870])
After  flattening : torch.Size([2610])



The flattened vector for the specific `CELL_LINE_NAME` will be the input of the NN.

In [124]:
# An example cell-line
ex1 = gene_tensor[:, 5:7, :]
ex1_flattened = torch.flatten(ex1, start_dim=0, end_dim=1)
print(f"""
Before flattening : {ex1.shape}
After  flattening : {ex1_flattened.shape}
""")


Before flattening : torch.Size([3, 2, 870])
After  flattening : torch.Size([6, 870])



In [162]:
# An example cell-line
ex2 = gene_tensor[:, 5:7, :]

num_features   = ex2.shape[0]
num_cell_lines = ex2.shape[1]
num_genes      = ex2.shape[2]

ex2_ravel = np.ravel(ex2, order='F')
ex2_flattened = np.reshape(ex2_ravel, (num_cell_lines, num_features * num_genes))
print(f"""
Initial  : {ex2.shape}
Raveled  : {ex2_ravel.shape}
Reshaped : {ex2_flattened.shape}
""")


Initial  : torch.Size([3, 2, 870])
Raveled  : (5220,)
Reshaped : (2, 2610)



Now for the input we have/want
$$
\begin{array}{ccccccc} 
           & g_1         & g_2         & \cdots & g_i         & \cdots & g_n \\
    c_1    & \begin{bmatrix} v_{ge}^{11} & v_{cg}^{11} & v_{cp}^{11} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{12} & v_{cg}^{12} & v_{cp}^{12} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{1i} & v_{cg}^{1i} & v_{cp}^{1i} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{1n} & v_{cg}^{1n} & v_{cp}^{1n} \end{bmatrix} 
           \\ 
    c_2    & \begin{bmatrix} v_{ge}^{21} & v_{cg}^{21} & v_{cp}^{21} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{22} & v_{cg}^{22} & v_{cp}^{22} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{2i} & v_{cg}^{2i} & v_{cp}^{2i} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{2n} & v_{cg}^{2n} & v_{cp}^{2n} \end{bmatrix} 
           \\ 
    \vdots & \vdots      & \vdots      & \ddots & \vdots      & \ddots & \vdots \\
    c_j    & \begin{bmatrix} v_{ge}^{j1} & v_{cg}^{j1} & v_{cp}^{j1} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{j2} & v_{cg}^{j2} & v_{cp}^{j2} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{ji} & v_{cg}^{ji} & v_{cp}^{ji} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{jn} & v_{cg}^{jn} & v_{cp}^{jn} \end{bmatrix} 
           \\   
    \vdots & \vdots      & \vdots      & \ddots & \vdots      & \ddots & \vdots \\
    c_m    & \begin{bmatrix} v_{ge}^{m1} & v_{cg}^{m1} & v_{cp}^{m1} \end{bmatrix} 
           & \begin{bmatrix} v_{ge}^{m2} & v_{cg}^{m2} & v_{cp}^{m2} \end{bmatrix} 
           & \cdots 
           & \begin{bmatrix} v_{ge}^{mi} & v_{cg}^{mi} & v_{cp}^{mi} \end{bmatrix}
           & \cdots 
           & \begin{bmatrix} v_{ge}^{mn} & v_{cg}^{mn} & v_{cp}^{mn} \end{bmatrix} 
           \\            
\end{array}
$$

Each cell-line row example has the dimension $(1, \text{numFeatures} * \text{numGenes})$ which is $(1, 3*870)=(1, 2610)$.

In [13]:
NUM_FEATURES   = gene_tensor.shape[0] 
NUM_CELL_LINES = gene_tensor.shape[1]
NUM_GENES      = gene_tensor.shape[2]
print(f""""
    Number of features   : {NUM_FEATURES}
    Number of cell-lines : {NUM_CELL_LINES}
    Number of genes      : {NUM_GENES}
""")

# TODO: Now we have only the unique cell-lines. Do the same for all cell-lines to make it able to concat back to the drugs later on.

"
    Number of features   : 3
    Number of cell-lines : 988
    Number of genes      : 870



In [176]:
# ----------------- #
# CREATE THE INPUTS #
# ----------------- #
WHOLE_DATASET_INPUT = torch.from_numpy(np.reshape(a=np.ravel(gene_tensor, order='F'),
                                                  newshape=(NUM_CELL_LINES, NUM_FEATURES * NUM_GENES)))
print(f"Shape of the whole dataset: {WHOLE_DATASET_INPUT.shape}")

Shape of the whole dataset: torch.Size([988, 2610])


In [20]:
# --------------------- #
# CREATE THE GENE LAYER #
# --------------------- #
used_genes = list(gene_expr_v4.loc[:, gene_expr_v4.columns != 'CELL_LINE_NAME'].columns)
MAPPING_FROM_GENES_TO_INDICES = pd.DataFrame(used_genes, columns=['genes']).reset_index(level=0)

GENE_LAYER_VALUES = torch.tensor(MAPPING_FROM_GENES_TO_INDICES.index.values)
print(f"Gene layer shape : {GENE_LAYER_VALUES.shape}")
print(GENE_LAYER_VALUES[:5])
MAPPING_FROM_GENES_TO_INDICES.head(5)

Gene layer shape : torch.Size([870])
tensor([0, 1, 2, 3, 4])


Unnamed: 0,index,genes
0,0,DAG1
1,1,CREG1
2,2,DNAJB1
3,3,PIK3C3
4,4,DLD


In [4]:
# ---------------------------------------------- #
# CREATE THE GENE LAYER only for gene expression #
# ---------------------------------------------- #
non_gene_cols = [
    'DRUG_ID', 'CELL_LINE_NAME', 'AUC', 'CELL_ID', 'LN_IC50', 'CONC',
    'MASTER_CELL_ID', 'INTENSITY', 'DATASET', 'Z_SCORE', 'DRUG_NAME',
    'RMSE', 'COSMIC_ID', 'POSITION'
]

gene_cols__geneexpr = gene_expr.loc[:, ~gene_expr.columns.isin(non_gene_cols)]
gene_expr_v2 = gene_expr[['CELL_LINE_NAME'] + list(gene_cols__geneexpr.columns)].drop_duplicates()

used_genes = list(gene_expr_v2.loc[:, gene_expr_v2.columns != 'CELL_LINE_NAME'].columns)
MAPPING_FROM_GENES_TO_INDICES = pd.DataFrame(used_genes, columns=['genes']).reset_index(level=0)

GENE_LAYER_VALUES = torch.tensor(MAPPING_FROM_GENES_TO_INDICES.index.values)
print(f"Gene layer shape : {GENE_LAYER_VALUES.shape}")
print(GENE_LAYER_VALUES[:5])
MAPPING_FROM_GENES_TO_INDICES.head(5)

Gene layer shape : torch.Size([908])
tensor([0, 1, 2, 3, 4])


Unnamed: 0,index,genes
0,0,TSPAN6
1,1,SCYL3
2,2,BAD
3,3,LAP3
4,4,SNX11


In [None]:
# ----------------------- # 
# DEFINE CELL & DRUG DATA #
# ----------------------- #
# gene_tensor = torch.tensor([
#     gene_expr.loc[:, gene_expr.columns != 'CELL_LINE_NAME'].to_numpy(dtype=np.float64)
# ])
# print(f"Gene tensor shape: {gene_tensor.shape}")

# # Cell data: For now only the gene expression information.
# gene_expr_data = gene_tensor[0][:]

# Drug data.
with open(f'{PATH_TO_SAVED_DRUG_FEATURES}drug_name_fingerprints_dataframe.pkl', 'rb') as f:
    drug_name_fps = pickle.load(f)
drug_name_fps_v2 = pd.merge(left=drug_name_fps, 
                            right=gene_expr[['DRUG_ID', 'DRUG_NAME']], 
                            how='left', 
                            left_on=['drug_name'], 
                            right_on=['DRUG_NAME'])
drug_name_fps_v2.drop_duplicates(inplace=True)
drug_name_fps_v2.drop(['DRUG_NAME'], axis=1, inplace=True)                           
print(f"Drug Shape: {drug_name_fps_v2.shape}")
drug_name_fps_v2.head(3)

Drug Shape: (428, 258)


Unnamed: 0,drug_name,0,1,2,3,4,5,6,7,8,...,247,248,249,250,251,252,253,254,255,DRUG_ID
0,(5Z)-7-Oxozeaenol,1,0,0,1,1,0,0,0,0,...,0,0,0,0,1,1,0,1,0,1242
912,5-Fluorouracil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,179
1825,5-Fluorouracil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1073


- The `DRUG_ID`'s hold as the values of the nodes in the NN.
- 

In [72]:
# For each row of the FULL_DATASET get the corresponding Morgan Fingerprint for the specific drug.
# FULL_DATASET = gene_expr.
FPS = pd.merge(left=gene_expr['DRUG_ID'], 
                right=drug_name_fps_v2.loc[:, ~drug_name_fps_v2.columns.isin(['drug_name'])], 
                how='left', 
                left_on=['DRUG_ID'], 
                right_on=['DRUG_ID'])
print(f"Shape: {FPS.shape}")
FPS.head(5)                

Shape: (446521, 257)


Unnamed: 0,DRUG_ID,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
print(f"with NaN: {gene_expr.shape}")
print(f"w/o  NaN: {gene_expr.dropna().shape}")

with NaN: (446521, 922)
w/o  NaN: (424015, 922)


In [90]:
from typing import List

class ModelDataset(Dataset): 
    def __init__(self, 
                 full_data: pd.DataFrame,
                 cell_cols: List[str],
                 drug_cols: List[str],
                 cell_identifier: str,
                 drug_identifier: str,
                 corresponding_fps: pd.DataFrame = FPS,
                 target_col: str='LN_IC50'): 
        # Drop all nans. 
        print(full_data.shape)
        full_data = full_data.dropna()
        print(full_data.shape)

        X = full_data[cell_cols + drug_cols]
        y = full_data[target_col]

        fps = pd.merge(left=full_data['DRUG_ID'], 
                right=drug_name_fps_v2.loc[:, ~drug_name_fps_v2.columns.isin(['drug_name'])], 
                how='left', 
                left_on=['DRUG_ID'], 
                right_on=['DRUG_ID'])

        X_cell = X[cell_cols]
        X_drug = copy.deepcopy(fps)

        print(f"X_cell.shape: {X_cell.shape}")
        print(f"X_drug.shape: {X_drug.shape}")

        # Contains only the gene columns and the respective feature value. Here GeneExpression.
        self.X = torch.tensor(X.loc[:, ~X.columns.isin([cell_identifier] + ['DRUG_NAME'])].values, dtype=torch.float32)
        self.X_cell = torch.tensor(X_cell.loc[:, ~X_cell.columns.isin([cell_identifier])].values, dtype=torch.float32)
        self.X_drug = torch.tensor(X_drug.loc[:, ~X_drug.columns.isin([drug_identifier])].values, dtype=torch.float32)
        #self.X_cell = torch.tensor(X.loc[:, ~X_cell.columns.isin([cell_identifier])].values, dtype=torch.float32)
        #self.X_drug = torch.tensor(X.loc[:, ~X_drug.columns.isin([drug_identifier])].values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self): 
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.X_cell[idx], self.X_drug[idx], self.y[idx]

In [6]:
print(gene_expr.shape)
gene_expr.loc[:, ~gene_expr.columns.isin(['AUC', 'CELL_ID', 'CONC', 'MASTER_CELL_ID', 'INTENSITY', 'DATASET', 'Z_SCORE', 'COSMIC_ID', 'POSITION'])].head(3)

(446521, 922)


Unnamed: 0,DRUG_ID,CELL_LINE_NAME,LN_IC50,DRUG_NAME,RMSE,TSPAN6,SCYL3,BAD,LAP3,SNX11,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
0,1,MC-CAR,2.395685,Erlotinib,0.022521,3.238273,4.856061,5.900525,8.120975,6.789716,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
9,1,ES3,3.140923,Erlotinib,0.03184,8.690198,4.572198,6.927127,5.595564,5.0408,...,5.99576,9.337588,7.468226,3.71627,5.363887,6.188079,12.281947,4.794624,3.588528,6.785201
27,1,ES5,3.968757,Erlotinib,0.026052,8.233101,4.749715,7.123143,5.458094,4.598347,...,6.939741,8.688176,7.085349,3.688222,4.572119,6.34509,12.276166,4.114092,5.768098,7.505155


In [91]:
CELL_IDENTIFIER = 'CELL_LINE_NAME'
DRUG_IDENTIFIER = 'DRUG_ID'
TARGET_IDENTIFIER = 'LN_IC50'

uniq_drug_ids = list(np.unique(gene_expr.DRUG_ID.values))

model_dataset = ModelDataset(full_data=gene_expr.sample(2_000),
                             cell_cols=[CELL_IDENTIFIER] + list(MAPPING_FROM_GENES_TO_INDICES.genes.values),
                             drug_cols=[DRUG_IDENTIFIER, 'DRUG_NAME'],
                             cell_identifier=CELL_IDENTIFIER,
                             drug_identifier=DRUG_IDENTIFIER,
                             target_col=TARGET_IDENTIFIER)

'''
TODO: Filter out NaNs
in a tensor do 
    model_dataset.X[~torch.any(model_dataset.X.isnan(),dim=1)]
'''

(2000, 922)
(1900, 922)
X_cell.shape: (1900, 909)
X_drug.shape: (1900, 257)


'\nTODO: Filter out NaNs\nin a tensor do \n    model_dataset.X[~torch.any(model_dataset.X.isnan(),dim=1)]\n'

In [92]:
train_set_split_ratio = 0.8

train_size = int(train_set_split_ratio * len(model_dataset))
test_size = len(model_dataset) - train_size
train_set, test_set = torch.utils.data.random_split(model_dataset, 
                                                    [train_size, test_size],
                                                    generator=torch.Generator().manual_seed(42))

print(f"""
    Shapes 
        Train : 
            X : {train_set.dataset.X.shape}
                X_cell : {train_set.dataset.X_cell.shape}
                X_drug : {train_set.dataset.X_drug.shape}
            y : {train_set.dataset.y.shape}
        Test  :
            X : {test_set.dataset.X.shape}
                X_cell : {test_set.dataset.X_cell.shape}
                X_drug : {test_set.dataset.X_drug.shape}            
            y : {test_set.dataset.y.shape}
""")
# TODO: why does the test set have the same shape?                                                  


    Shapes 
        Train : 
            X : torch.Size([1900, 909])
                X_cell : torch.Size([1900, 908])
                X_drug : torch.Size([1900, 256])
            y : torch.Size([1900])
        Test  :
            X : torch.Size([1900, 909])
                X_cell : torch.Size([1900, 908])
                X_drug : torch.Size([1900, 256])            
            y : torch.Size([1900])



In [93]:
train_set.dataset.X

tensor([[   4.8045,    5.1454,    7.2901,  ...,    5.5002,    4.8260,
         1816.0000],
        [   7.9063,    4.1811,    7.3739,  ...,    9.1237,    6.2141,
         1060.0000],
        [   6.6293,    3.9615,    6.7334,  ...,    6.5634,    6.5646,
         1026.0000],
        ...,
        [   3.4025,    5.5498,    6.5083,  ...,    8.3329,    7.4294,
         1620.0000],
        [   3.1655,    5.5505,    6.7083,  ...,    9.2375,    6.6810,
         1720.0000],
        [   8.3342,    4.6918,    6.0510,  ...,    8.3713,    6.6725,
          574.0000]])

In [94]:
train_set.dataset.X_cell

tensor([[4.8045, 5.1454, 7.2901,  ..., 3.8672, 5.5002, 4.8260],
        [7.9063, 4.1811, 7.3739,  ..., 3.0518, 9.1237, 6.2141],
        [6.6293, 3.9615, 6.7334,  ..., 5.8577, 6.5634, 6.5646],
        ...,
        [3.4025, 5.5498, 6.5083,  ..., 3.5405, 8.3329, 7.4294],
        [3.1655, 5.5505, 6.7083,  ..., 3.9112, 9.2375, 6.6810],
        [8.3342, 4.6918, 6.0510,  ..., 3.2445, 8.3713, 6.6725]])

In [95]:
train_set.dataset.X_drug

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 1., 0.,  ..., 0., 0., 1.]])

__Note__: 
- The very last entry per row is the `DRUG_ID`
- It can be accessed by `model_dataset[<cell-line-number>, -1]`

In [184]:
batch_size = 5

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_set, 
                                           batch_size=batch_size, 
                                           shuffle=True,
                                           num_workers=0)

test_loader = torch.utils.data.DataLoader(dataset=test_set, 
                                          batch_size=batch_size, 
                                          shuffle=True,
                                          num_workers=0) 

In [185]:
count = sum(1 for X, X_cell, X_drug, y in train_loader)
print(f"train batch count : {count}") 

count = sum(1 for X, X_cell, X_drug, y in test_loader)
print(f"test batch count  : {count}") 

train batch count : 304
test batch count  : 76


In [186]:
# NOTE: One drug may have multiple DRUG_ID's.
temp = gene_expr[['DRUG_ID', 'DRUG_NAME']]
np.unique(temp[temp.DRUG_NAME=='5-Fluorouracil'].DRUG_ID.values)

array([ 179, 1073])

In [187]:
drug_name_fps_v2.head(5)

Unnamed: 0,drug_name,0,1,2,3,4,5,6,7,8,...,247,248,249,250,251,252,253,254,255,DRUG_ID
0,(5Z)-7-Oxozeaenol,1,0,0,1,1,0,0,0,0,...,0,0,0,0,1,1,0,1,0,1242
912,5-Fluorouracil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,179
1825,5-Fluorouracil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1073
2631,A-443654,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,86
3030,A-770041,1,1,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,55


In [188]:
drug_df_temp = drug_name_fps_v2[drug_name_fps_v2.DRUG_ID.isin([179, 1073])]
drug_df_temp.loc[:, ~drug_df_temp.columns.isin(['drug_name', 'DRUG_ID'])]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [189]:
for i, (X_batch, X_cell, X_drug, y_batch) in tqdm(enumerate(train_loader)):
    print(X_batch[0][-1])
    assert X_batch[0][0] == X_cell[0][0]
    print(X_cell)
    print(X_drug)
    print(y_batch[0])
    break

0it [00:00, ?it/s]

tensor(309.)
tensor([[8.2467, 3.8549, 6.6392,  ..., 4.7128, 6.6924, 5.9467],
        [7.1250, 4.8578, 7.1381,  ..., 3.5800, 3.2323, 7.1039],
        [7.4219, 3.6908, 7.2087,  ..., 6.0082, 7.1204, 5.7271],
        [3.3843, 4.2569, 6.3539,  ..., 4.4878, 8.2691, 6.1728],
        [8.0526, 3.6786, 7.1633,  ..., 3.5155, 8.0239, 6.7621]])
tensor([[0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor(5.2466)





In [190]:
%load_ext autoreload
%autoreload 24

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [285]:
# TODO: some drug rows (fingerprints) are NaN. REMOVE THESE ROWS!!!
# Options to prevent exploding gradients:
# TODO: normalize in the FC layers.
# TODO: lower the learning rate.

In [283]:
%load_ext autoreload
%autoreload 24

from v1_NormalNN import Model
from tqdm import tqdm
import copy

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# -------- #
# TRAINING #
# -------- #
def train_model(model, criterion, optimizer, num_epochs=100):  
    start = time.time()
    loss_values = []
    total_step = len(train_loader)
    it = iter(train_loader)

    for epoch in range(num_epochs): 
        running_loss_train = 0.0
        model.train()
        for i, (X_batch, X_batch_cell, X_batch_drug, y_batch) in tqdm(enumerate(train_loader)):
            
            # X_batch_cell, X_batch_drug
            # X_cell = X_batch_cell.to(device)
            # X_drug = X_batch_drug.to(device)
            X = X_batch.to(device)
            X_cell = X_batch_cell.to(device)
            X_drug = X_batch_drug.to(device)            
            y = y_batch.to(device)

            # X_cell = X_batch[:, :-1].to(device) # Only the genes.
            # X_drug = X_batch[:, -1].to(device) # The DRUG_ID's are in the last entry per row.

            # assert not torch.isnan(X_cell).any(), f"X_cell has NaN in it {i}."
            # assert not torch.isnan(X_drug).any(), f"X_drug has NaN in it {i}."
            # TODO: remove the rows with NaNs.
            # assert not torch.isnan(X_batch).any(), f"X_batch has NaN in it {i}."
            # assert len(X_drug) == batch_size, f"ERROR: len(X_drug) = {len(X_drug)} != {batch_size} = batch_size."
            # assert not torch.isnan(y).any(), f"y has NaN in it {i}."  

            # print(X_cell)  
            # print(len(X_cell[0]))
            # print(X_drug)  
            # print(len(X_drug))
            # print(X_cell) 
            # print("BEFORE")
            # print(f"  X_cell.shape: {X_cell.shape}")
            # print(f"  X_drug.shape: {X_drug.shape}")
            # print("AFTER")
            # print(f"  X_cell.shape: {X_cell.reshape(X_cell.shape[0], 1, X_cell.shape[1]).shape}")
            # print(f"  X_drug.shape: {X_drug.reshape(1, X_drug.shape[0], 1).shape}")
            # print(X_cell.reshape(X_cell.shape[0], 1, X_cell.shape[1]).shape)
            
            # Add the fingerprints to the specific drugs.
            # drug_df_temp = drug_name_fps_v2[drug_name_fps_v2.DRUG_ID.isin(X_drug.tolist())]
            # drug_df_temp = drug_df_temp.loc[:, ~drug_df_temp.columns.isin(['drug_name', 'DRUG_ID'])]
            
            # y_preds = model(X_cell.reshape(X_cell.shape[0], 1, X_cell.shape[1]), X_drug.reshape(X_drug.shape[0], 1, 1))

            print("SHAPES BEFORE")
            print(X_cell.shape) 
            print(X_drug.shape) 
            y_preds = model(X_cell.reshape(X_cell.shape[0], 1, X_cell.shape[1]), 
                            X_drug.reshape(X_drug.shape[0], 1, X_drug.shape[1]))
            print(f"PREDICTIONS\n{100*'='}")
            for i, y_pred in enumerate(y_preds): 
                print(y_pred.shape)
                print(y_pred)
                #print(f"  {i:10.0f} pred : {y_pred:5.2f}")
            print(f"y_preds  : {y_preds}")
            print(f"y_actual : {y}")
            loss = loss_func(y_preds, y)

            print(f"Loss: {loss}")

            running_loss_train += loss.item()

            print(f"Running Loss: {running_loss_train}")

            # Backward and optimize.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print("Optimized!")

            if (i+1) % 1 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'\
                    .format(epoch+1, num_epochs, i+1, total_step, loss.item()))             
                break
            
        loss_values.append(running_loss_train / len(gene_expr))      

# ---------------- # 
# Hyper-parameters #
# ---------------- #
input_size = 910 # which is gene_expr_base.shape[0]-1 = 911 - 1
                 # One value per feature
hidden_size = 100
num_classes = 1
num_epochs = 2
batch_size = batch_size
learning_rate = 0.001                

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Device: cpu


In [284]:
%load_ext autoreload
%autoreload 24
from v1_NormalNN import Model

#learning_rate = 0.01

n_out = np.floor((908-10)/1) + 1
print(n_out)

model = Model().to(device)

loss_func = nn.MSELoss() # = criterion
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

train_model(model=model,
            criterion=loss_func,
            optimizer=optimizer,
            num_epochs=2)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
899.0

CALLED: def __init__(self)
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
self.cell_branch
Sequential(
  (0): Linear(in_features=908, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=128, bias=True)
  (3): ReLU()
)
----------------------------------------------------------------------------------------------------
self.drug_branch
Sequential(
  (0): Linear(in_features=256, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=128, bias=True)
  (3): ReLU()
)
----------------------------------------------------------------------------------------------------

self.fcn
Sequential(
  (0): Linear(in_features=256, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_fea

0it [00:00, ?it/s]


SHAPES BEFORE
torch.Size([5, 908])
torch.Size([5, 256])

CALLED: forward(self, cell, drug)
tensor(0)
tensor(0)

Cell line INPUT shape: torch.Size([5, 1, 908])
Cell line OUTPUT shape: torch.Size([5, 1, 128])


Drug INPUT shape: torch.Size([5, 1, 256])
tensor([[[1., 0., 0.,  ..., 0., 0., 1.]],

        [[0., 0., 1.,  ..., 0., 0., 0.]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[1., 0., 1.,  ..., 0., 0., 0.]]])
tensor(512)
tensor(0)
Drug OUTPUT shape: torch.Size([5, 1, 256])



SUMMARY
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     out_cell.shape: torch.Size([5, 1, 128])
     out_drug.shape: torch.Size([5, 1, 128])
Before reshaping --> concat.shape: torch.Size([5, 2, 128])
After reshaping --> concat.shape: torch.Size([5, 256])
Shape of y_pred : torch.Size([5, 1])
tensor([[0.0578],
        [0.0397],
        [   nan],
        [   nan],
        [0.0474]], grad_fn=<Relu

0it [00:00, ?it/s]

SHAPES BEFORE
torch.Size([5, 908])
torch.Size([5, 256])

CALLED: forward(self, cell, drug)
tensor(0)
tensor(0)

Cell line INPUT shape: torch.Size([5, 1, 908])
Cell line OUTPUT shape: torch.Size([5, 1, 128])


Drug INPUT shape: torch.Size([5, 1, 256])
tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 1.]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[0., 1., 0.,  ..., 0., 1., 1.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]])
tensor(256)
tensor(0)
Drug OUTPUT shape: torch.Size([5, 1, 256])



SUMMARY
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     out_cell.shape: torch.Size([5, 1, 128])
     out_drug.shape: torch.Size([5, 1, 128])
Before reshaping --> concat.shape: torch.Size([5, 2, 128])
After reshaping --> concat.shape: torch.Size([5, 256])
Shape of y_pred : torch.Size([5, 1])
tensor([[nan],
        [nan],
        [nan],
        [nan],
        [nan]], grad_fn=<ReluBackward0>)
Shape of 




In [264]:
x1 = torch.randn(5,1)
print(x1.shape)
print(x1.reshape(5).shape)

torch.Size([5, 1])
torch.Size([5])


In [231]:
x = torch.randn(5, 2, 128)
x.shape

# goal is shape [5, 1, 256]

torch.Size([5, 2, 128])

In [240]:
x[0].squeeze(1).shape

torch.Size([2, 128])

In [250]:
torch.cat([x[0][0], x[0][1]]).shape

torch.Size([256])

In [252]:
x.reshape(5, 1, 256)

tensor([[[ 0.6801,  0.5911,  0.1787,  ..., -0.1986, -0.2094, -1.2084]],

        [[ 0.1185, -1.5728,  0.7682,  ...,  0.8380, -0.0048,  0.3707]],

        [[ 0.1387,  0.5068, -0.5284,  ...,  0.4643, -1.5558,  0.7886]],

        [[-1.0692, -0.2869, -2.9320,  ..., -1.1395,  0.6468,  1.1936]],

        [[ 0.1241, -0.6243,  0.0577,  ..., -1.1470, -1.0864,  0.4662]]])

In [253]:
x[0][1][-3:]

tensor([-0.1986, -0.2094, -1.2084])

In [249]:
x[0][0].shape

torch.Size([128])

In [170]:
x = torch.tensor([[1,2,3], [4,5,6]])
x

tensor([[1, 2, 3],
        [4, 5, 6]])

In [175]:
x.reshape((2,1,3))

tensor([[[1, 2, 3]],

        [[4, 5, 6]]])

In [167]:
x.permute(1, 0, 2)

tensor([[[1, 2, 3]],

        [[4, 5, 6]]])

In [161]:
torch.randn(1,2,3)

tensor([[[ 1.5532, -0.3449, -0.3381],
         [-0.6890,  0.0881, -0.3231]]])

In [202]:
gene_expr_data

tensor([[7.4722, 5.8678, 4.9159,  ..., 6.6141, 8.5520, 3.2697],
        [7.4081, 6.1157, 6.0675,  ..., 3.6499, 6.8504, 3.5596],
        [7.2114, 7.4453, 5.6951,  ..., 3.6498, 8.3988, 3.8715],
        ...,
        [7.0449, 6.8248, 4.8239,  ..., 4.6832, 9.6526, 3.7312],
        [7.1434, 6.8708, 6.0484,  ..., 7.9259, 8.3798, 3.4813],
        [7.5782, 7.3208, 5.1952,  ..., 7.5937, 7.9628, 3.4822]],
       dtype=torch.float64)

In [132]:
np1 = np.array([[1], 
                [2]])
np2 = np.array([[3], 
                [4]])
np3 = np.array([[5], 
                [6]])

"""
1.dim = features
2.dim = cell-lines
3.dim = genes
"""

t = torch.tensor([np1, np2, np3])
print(t.shape)
t

torch.Size([3, 2, 1])


tensor([[[1],
         [2]],

        [[3],
         [4]],

        [[5],
         [6]]])

In [138]:
t2 = torch.flatten(t, start_dim=1)
t2

tensor([[1, 2],
        [3, 4],
        [5, 6]])

In [154]:
np.reshape(t, (2, 3), order='F')

tensor([[1, 5, 4],
        [3, 2, 6]])

In [156]:
np.ravel(t, order='F').reshape((2, t.shape[0] * t.shape[2]))

array([[1, 3, 5],
       [2, 4, 6]])

---

## Drug Features

In [27]:
with open(f'{PATH_TO_SAVED_DRUG_FEATURES}drug_name_fingerprints_dataframe.pkl', 'rb') as f:
    drug_name_fps = pickle.load(f)
print(f"Shape: {drug_name_fps.shape}")
drug_name_fps.head(3)   

Shape: (367, 257)


Unnamed: 0,drug_name,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
0,(5Z)-7-Oxozeaenol,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
1,5-Fluorouracil,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,A-443654,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


The encoded drug feature dataset has 367 unique drug names and per drug name a 256 sized De-Morgan fingerprint.