In [10]:
%pwd

'/Users/cwoest/Documents/Academics/Data_Science_UP/master_thesis/material/GNN-material/notebooks'

In [1]:
import torch
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

sns.set_theme(style="white")

import sys
import os
# Make sure everything is on the path.
sys.path.insert(1, os.path.join(sys.path[0], '../src'))

from src.preprocess.build_drug_response_matrix import (
    get_gdsc_gene_expression,
    cosmic_ids_to_cell_line_names
)
from src.utils.file_helper import (
    compress_pickle,
    decompress_pickle
)

In [2]:
%conda --version

conda 4.10.3

Note: you may need to restart the kernel to use updated packages.


In this notebook we are going to create all the datasets.

- `../../datasets/gdsc/screening_data/GDSC2_fitted_dose_response_25Feb20.xlsx`
- `../../datasets/gdsc/screening_data/GDSC1_fitted_dose_response_25Feb20.xlsx`
- `../../datasets/gdsc/screening_data/GDSC1_public_raw_data_25Feb20.csv`
- `../../datasets/gdsc/screening_data/GDSC2_public_raw_data_25Feb20.csv`

In [2]:
RAW_DATA_PATH = '../data/raw/'
PROCESSED_DATA_PATH = '../data/processed/'

# Drug response matrix

In [3]:
PATH_TO_GDSC_SCREENING_DATA = '../../../datasets/gdsc/screening_data/'

## IC50's

In [4]:
GDSC1_IC50_FILE = 'GDSC1_fitted_dose_response_25Feb20.xlsx'
GDSC2_IC50_FILE = 'GDSC2_fitted_dose_response_25Feb20.xlsx'

# Read the IC50 files.
# GDSC1
start = time.time()
gdsc1_ic50s = pd.read_excel(f'{PATH_TO_GDSC_SCREENING_DATA}{GDSC1_IC50_FILE}', header=0)
print(f"File `{GDSC1_IC50_FILE}` took {time.time()-start:.5f} seconds to import. It has shape {gdsc1_ic50s.shape}")

# GDSC2
start = time.time()
gdsc2_ic50s = pd.read_excel(f'{PATH_TO_GDSC_SCREENING_DATA}{GDSC2_IC50_FILE}', header=0)
print(f"File `{GDSC2_IC50_FILE}` took {time.time()-start:.5f} seconds to import. It has shape {gdsc2_ic50s.shape}")

File `GDSC1_fitted_dose_response_25Feb20.xlsx` took 96.41459 seconds to import. It has shape (310904, 19)
File `GDSC2_fitted_dose_response_25Feb20.xlsx` took 38.82771 seconds to import. It has shape (135242, 19)


In [5]:
print(gdsc1_ic50s.head(3).to_markdown())
print()
print(gdsc2_ic50s.head(3).to_markdown())

|    | DATASET   |   NLME_RESULT_ID |   NLME_CURVE_ID |   COSMIC_ID | CELL_LINE_NAME   | SANGER_MODEL_ID   | TCGA_DESC    |   DRUG_ID | DRUG_NAME   | PUTATIVE_TARGET   | PATHWAY_NAME   |   COMPANY_ID | WEBRELEASE   |   MIN_CONC |   MAX_CONC |   LN_IC50 |      AUC |     RMSE |   Z_SCORE |
|---:|:----------|-----------------:|----------------:|------------:|:-----------------|:------------------|:-------------|----------:|:------------|:------------------|:---------------|-------------:|:-------------|-----------:|-----------:|----------:|---------:|---------:|----------:|
|  0 | GDSC1     |              281 |        12974350 |      683665 | MC-CAR           | SIDM00636         | MM           |         1 | Erlotinib   | EGFR              | EGFR signaling |         1045 | Y            |   0.007813 |          2 |   2.39568 | 0.982114 | 0.022521 | -0.189576 |
|  1 | GDSC1     |              281 |        12975300 |      684055 | ES3              | SIDM00265         | UNCLASSIFIED |         1

In [6]:
# Join both datasets for analysis purposes.
gdsc_ic50s_join = pd.concat([gdsc1_ic50s, gdsc2_ic50s], ignore_index=True)
print(gdsc_ic50s_join.shape)
assert gdsc_ic50s_join[gdsc_ic50s_join.index.duplicated()].shape[0] == 0
assert gdsc_ic50s_join.shape[0] == gdsc1_ic50s.shape[0] + gdsc2_ic50s.shape[0]

(446146, 19)


## Raw data

In [6]:
GDSC1_RAW_FILE = 'GDSC1_public_raw_data_25Feb20.csv'
GDSC2_RAW_FILE = 'GDSC2_public_raw_data_25Feb20.csv'

# Read the raw files.
# GDSC1
start = time.time()
gdsc1_raw = pd.read_csv(f'{PATH_TO_GDSC_SCREENING_DATA}{GDSC1_RAW_FILE}', header=0)
print(f"File `{GDSC1_RAW_FILE}` took {time.time()-start:.5f} seconds to import. It has shape {gdsc1_raw.shape}")

# GDSC2
start = time.time()
gdsc2_raw = pd.read_csv(f'{PATH_TO_GDSC_SCREENING_DATA}{GDSC2_RAW_FILE}', header=0)
print(f"File `{GDSC2_RAW_FILE}` took {time.time()-start:.5f} seconds to import. It has shape {gdsc2_raw.shape}")

  gdsc1_raw = pd.read_csv(f'{PATH_TO_GDSC_SCREENING_DATA}{GDSC1_RAW_FILE}', header=0)


File `GDSC1_public_raw_data_25Feb20.csv` took 16.18649 seconds to import. It has shape (5837703, 18)
File `GDSC2_public_raw_data_25Feb20.csv` took 18.13408 seconds to import. It has shape (6646430, 18)


In [21]:
print(gdsc1_raw.head(3).to_markdown())
print()
print(gdsc2_raw.head(3).to_markdown())

|    | RESEARCH_PROJECT   |   BARCODE |   SCAN_ID | DATE_CREATED         |   SCAN_DATE |   CELL_ID |   MASTER_CELL_ID |   COSMIC_ID | CELL_LINE_NAME   |   SEEDING_DENSITY | DRUGSET_ID   | ASSAY   |   DURATION |   POSITION | TAG     |   DRUG_ID |   CONC |   INTENSITY |
|---:|:-------------------|----------:|----------:|:---------------------|------------:|----------:|-----------------:|------------:|:-----------------|------------------:|:-------------|:--------|-----------:|-----------:|:--------|----------:|-------:|------------:|
|  0 | Sanger_GDSC1       |    100541 |      1765 | 2010-04-18T23:00:00Z |         nan |      2415 |              365 |      924238 | K5               |               250 | 505_a_5      | a       |          3 |          1 | B       |       nan |    nan |       26022 |
|  1 | Sanger_GDSC1       |    100541 |      1765 | 2010-04-18T23:00:00Z |         nan |      2415 |              365 |      924238 | K5               |               250 | 505_a_5      | a    

In [7]:
# Join both datasets for analysis purposes.
gdsc_raw_join = pd.concat([gdsc1_raw, gdsc2_raw], ignore_index=True)
print(gdsc_raw_join.shape)
assert gdsc_raw_join[gdsc_raw_join.index.duplicated()].shape[0] == 0
assert gdsc_raw_join.shape[0] == gdsc1_raw.shape[0] + gdsc2_raw.shape[0]

(12484133, 18)


## Preprocess

In [7]:
print(gdsc_ic50s_join.shape)
print(gdsc_ic50s_join.columns)
gdsc_ic50s_join.head(3)

(446146, 19)
Index(['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID',
       'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'TCGA_DESC', 'DRUG_ID',
       'DRUG_NAME', 'PUTATIVE_TARGET', 'PATHWAY_NAME', 'COMPANY_ID',
       'WEBRELEASE', 'MIN_CONC', 'MAX_CONC', 'LN_IC50', 'AUC', 'RMSE',
       'Z_SCORE'],
      dtype='object')


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC1,281,12974350,683665,MC-CAR,SIDM00636,MM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.395685,0.982114,0.022521,-0.189576
1,GDSC1,281,12975300,684055,ES3,SIDM00265,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.140923,0.984816,0.03184,0.508635
2,GDSC1,281,12975647,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.968757,0.985693,0.026052,1.284229


In [8]:
COLS_TO_KEEP = ['DATASET', 'CELL_LINE_NAME', 'DRUG_NAME', 'DRUG_ID', 'AUC', 'RMSE', 'Z_SCORE', 'LN_IC50']
gdsc_base = gdsc_ic50s_join[COLS_TO_KEEP].drop_duplicates()
print(gdsc_base.shape)
# Percent of NaN values per column
print("Missing rates:")
print("==============")
print(100 * gdsc_base.isna().sum() / gdsc_base.shape[0])
gdsc_base.head(3)

(446146, 8)
Missing rates:
DATASET           0.0
CELL_LINE_NAME    0.0
DRUG_NAME         0.0
DRUG_ID           0.0
AUC               0.0
RMSE              0.0
Z_SCORE           0.0
LN_IC50           0.0
dtype: float64


Unnamed: 0,DATASET,CELL_LINE_NAME,DRUG_NAME,DRUG_ID,AUC,RMSE,Z_SCORE,LN_IC50
0,GDSC1,MC-CAR,Erlotinib,1,0.982114,0.022521,-0.189576,2.395685
1,GDSC1,ES3,Erlotinib,1,0.984816,0.03184,0.508635,3.140923
2,GDSC1,ES5,Erlotinib,1,0.985693,0.026052,1.284229,3.968757


In [9]:
gdsc_base.to_pickle(PROCESSED_DATA_PATH + 'drm_full.pkl')

# Features

## Gene expression

In [11]:
cl_details = pd.read_excel(RAW_DATA_PATH + 'Cell_Lines_Details.xlsx')
print(cl_details.shape)
print(cl_details.head(3).to_markdown())

(1002, 13)
|    | Sample Name   |   COSMIC identifier | Whole Exome Sequencing (WES)   | Copy Number Alterations (CNA)   | Gene Expression   | Methylation   | Drug       | GDSC                  | GDSC           | Cancer Type             | Microsatellite             | Screen Medium   | Growth Properties   |
|    |               |                     |                                |                                 |                   |               | Response   | Tissue descriptor 1   | Tissue         | (matching TCGA label)   | instability Status (MSI)   |                 |                     |
|    |               |                     |                                |                                 |                   |               |            |                       | descriptor 2   |                         |                            |                 |                     |
|---:|:--------------|--------------------:|:-------------------------------|:--------------------

  warn(msg)


In [12]:
gene_expression = pd.read_csv(RAW_DATA_PATH + 'Cell_line_RMA_proc_basalExp.txt', sep="\t")
print(gene_expression.shape)
print(gene_expression.iloc[:2, :5].to_markdown())

(17737, 1020)
|    | GENE_SYMBOLS   | GENE_title                                   |   DATA.906826 |   DATA.687983 |   DATA.910927 |
|---:|:---------------|:---------------------------------------------|--------------:|--------------:|--------------:|
|  0 | TSPAN6         | tetraspanin 6 [Source:HGNC Symbol;Acc:11858] |       7.63202 |       7.54867 |       8.71234 |
|  1 | TNMD           | tenomodulin [Source:HGNC Symbol;Acc:17757]   |       2.96459 |       2.77772 |       2.64351 |


In [13]:
# from src.preprocess.build_drug_response_matrix import (
#     get_gdsc_gene_expression, cosmic_ids_to_cell_line_names
# )

CELL_LINE_DETAILS_FILE = 'Cell_Lines_Details.xlsx'
GENE_EXPRESSION_RAW_FILE = 'Cell_line_RMA_proc_basalExp.txt'

# Return the gene expression dataframe(n_cells x n_genes) for a set of gene symbols for all cell_lines of the GDSC cell line annotation file.
# If the genes are None, return the data for all genes.
gene_expr = get_gdsc_gene_expression(
    path_cell_annotations=RAW_DATA_PATH + CELL_LINE_DETAILS_FILE,
    path_gene_expression=RAW_DATA_PATH + GENE_EXPRESSION_RAW_FILE 
)
print(gene_expr.shape)
gene_expr.head(3)

Note: 50 Cosmic IDs not found in cell annotation data: 
['906815', '1330955', '907284', '1330944', '1330943', '1298232', '1299076', '1299051', '906829', '1290773', '907049', '909730', '910946', '907391', '687509', '1327761', '753536', '1240152', '925342', '687814', '1479994', '1479991', '1298355', '1331026', '1298150', '1479993', '11223344', '1330945', '1479992', '998179', '1331060', '1331028', '1503362.1', '906850', '1240211', '1299049', '908119', '1331031', '907785', '1330983.1', '906867', '1723793', '1723794', '1240156', '1299067', '906803', '1298154', '909976.1', '1659787', '905954.1']
(1018, 17737)


  warn(msg)


Sample Name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,LINC00526,PPY2,nan,nan.1,KRT18P55,nan.2,POLRMTP1,UBL5P2,TBC1D3P5,nan.3
CAL-120,7.632023,2.964585,10.379553,3.614794,3.380681,3.324692,3.56635,8.20453,5.235118,5.369039,...,6.786925,2.997054,3.109774,7.882377,3.331134,2.852537,3.130696,9.986616,3.073724,7.284733
DMS-114,7.548671,2.777716,11.807341,4.066887,3.732485,3.152404,7.827172,6.616972,5.809264,7.209653,...,5.317911,3.263745,3.059424,8.681302,2.992611,2.776771,3.260982,9.002814,3.000182,8.504804
CAL-51,8.712338,2.643508,9.880733,3.95623,3.23662,3.241246,2.931034,8.191246,5.426841,5.120747,...,3.143006,3.112145,2.930254,8.707886,2.886574,2.685307,3.176239,9.113243,2.916274,7.059092


In [14]:
LANDMARK_GENES_FILE = 'landmark_genes.csv' 

landmark_genes = pd.read_csv(RAW_DATA_PATH + LANDMARK_GENES_FILE, sep="\t")
print(landmark_genes.shape)
print(landmark_genes.head(3).to_markdown())


(978, 7)
|    |   Entrez ID | Symbol   | Name                   | Gene Family                | Type     |   RNA-Seq Correlation |   RNA-Seq Correlation Self-Rank |
|---:|------------:|:---------|:-----------------------|:---------------------------|:---------|----------------------:|--------------------------------:|
|  0 |        3638 | INSIG1   | insulin induced gene 1 | nan                        | landmark |                   nan |                             nan |
|  1 |        2309 | FOXO3    | forkhead box O3        | Forkhead boxes             | landmark |                   nan |                             nan |
|  2 |        1001 | CDH3     | cadherin 3             | Type I classical cadherins | landmark |                   nan |                             nan |


In [15]:
# Choose only the cell-line columns of the gene expressions table that are in the landmark gene file.
inter_cols = list(set(gene_expr.columns).intersection(set(landmark_genes.Symbol)))
gene_expr_sparse = gene_expr[inter_cols]
gene_expr_sparse.columns.rename('CELL_LINE_NAME', inplace=True)
print(gene_expr_sparse.shape)
gene_expr_sparse.head(3)

(1018, 908)


CELL_LINE_NAME,PWP1,GFPT1,PAFAH1B1,CYTH1,PLK1,MRPL19,NCOA3,SNX11,MVP,PYGL,...,MYL9,FRS2,BACE2,SLC1A4,TEX10,PXN,FYN,ATMIN,ZNF318,FAH
CAL-120,8.54545,7.748112,7.778578,4.378528,3.853142,9.854163,5.715514,4.767457,7.524185,8.526754,...,6.713753,8.174987,7.793436,3.806608,7.107863,4.330679,7.225757,7.091893,4.877458,5.423768
DMS-114,9.541265,6.838576,8.314279,4.361999,4.218736,10.905396,4.902592,5.246301,3.831505,9.215297,...,5.519753,4.901783,4.981265,3.670515,6.469308,3.635382,5.784534,8.208162,7.561376,5.086421
CAL-51,8.450964,7.699202,7.624126,4.032946,3.694307,9.996235,5.405683,4.6458,5.912042,9.59329,...,7.639868,4.456291,3.747179,5.145114,6.600039,4.071145,5.551893,8.019902,5.761437,5.633228


In [16]:
cols_to_join_on = ['CELL_LINE_NAME']
join_gdsc_geneexpr = gdsc_base.merge(right=gene_expr_sparse,
                                     left_on=['CELL_LINE_NAME'],
                                     right_index=True,
                                     how='left',
                                     suffixes=['_gdsc', '_geneexpr'])
print(join_gdsc_geneexpr.shape)
join_gdsc_geneexpr.head(3)

(446146, 916)


Unnamed: 0,DATASET,CELL_LINE_NAME,DRUG_NAME,DRUG_ID,AUC,RMSE,Z_SCORE,LN_IC50,PWP1,GFPT1,...,MYL9,FRS2,BACE2,SLC1A4,TEX10,PXN,FYN,ATMIN,ZNF318,FAH
0,GDSC1,MC-CAR,Erlotinib,1,0.982114,0.022521,-0.189576,2.395685,9.141991,7.037431,...,4.243679,3.639442,3.773685,5.512912,7.081855,3.202161,7.904451,7.88722,5.284856,4.795894
1,GDSC1,ES3,Erlotinib,1,0.984816,0.03184,0.508635,3.140923,9.529654,8.758251,...,3.497337,4.917604,5.607234,5.294418,7.174595,3.688081,8.867842,8.525991,5.736538,4.8542
2,GDSC1,ES5,Erlotinib,1,0.985693,0.026052,1.284229,3.968757,8.582288,8.474621,...,3.095744,4.694516,6.901761,6.730694,7.215499,3.851262,7.698497,7.38365,5.474494,5.038323


In [None]:
# Save the GDSC table with the gene expression information to a file.
join_gdsc_geneexpr.to_pickle(PROCESSED_DATA_PATH + 'gexpr_full.pkl')

## Copy number variation

## Mutation