In [1]:
%pwd

'/Users/cwoest/Documents/Academics/Data_Science_UP/master_thesis/material/GNN-material/notebooks'

In [2]:
import torch
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

sns.set_theme(style="white")

import sys
import os
# Make sure everything is on the path.
# NOTE(review): this inserts `<sys.path[0]>/../src` at position 1 of the module
# search path, while the import that follows is `from src.preprocess...`, which
# resolves against the directory that *contains* `src` — confirm that this entry
# is really what makes the import work.
sys.path.insert(1, os.path.join(sys.path[0], '../src'))

from src.preprocess.build_drug_response_matrix import (
    get_gdsc_gene_expression,
    cosmic_ids_to_cell_line_names
)

In [2]:
%conda --version

conda 4.10.3

Note: you may need to restart the kernel to use updated packages.


In this notebook we are going to create all the datasets.

Datasets used for the __drug response matrix__ (`../data/processed/drm_full.pkl`):
- `../data/raw/GDSC2_fitted_dose_response_25Feb20.xlsx`
- `../data/raw/GDSC1_fitted_dose_response_25Feb20.xlsx`
- `../data/raw/GDSC1_public_raw_data_25Feb20.csv`
- `../data/raw/GDSC2_public_raw_data_25Feb20.csv`

Datasets used for the __gene expression dataset__ (`../data/processed/gexpr_full.pkl`):
- `../data/raw/Cell_line_RMA_proc_basalExp.txt`
- `../data/raw/Cell_Lines_Details.xlsx`
- `../data/raw/landmark_genes.csv`
- `../data/processed/drm_full.pkl`

Datasets used for the __copy number picnic dataset__ (`../data/processed/cnvp_full.pkl`): 
- `../data/raw/cnv_abs_copy_number_picnic_20191101.csv`
- `../data/raw/landmark_genes.csv`
- `../data/processed/drm_full.pkl`

Datasets used for the __copy number gistic dataset__ (`../data/processed/cnvg_full.pkl`): 
- `../data/raw/cnv_gistic_20191101.csv`
- `../data/raw/landmark_genes.csv`
- `../data/processed/drm_full.pkl`


Datasets used for the __mutation dataset__ (`../data/processed/mut_full.pkl`): 
- `../data/raw/mutations_all_20220315.zip` (the code below reads it as `mutations_all_20220315.csv`)
- `../data/raw/landmark_genes.csv`
- `../data/processed/drm_full.pkl`


In [3]:
# Directory prefixes for the raw input files and for the processed pickles written
# by this notebook. Both end with a slash because file names are appended to them
# by plain string concatenation.
RAW_DATA_PATH = '../../data/raw/'
PROCESSED_DATA_PATH = '../../data/processed/'

# Drug response matrix

## IC50's

In [55]:
GDSC1_IC50_FILE = 'GDSC1_fitted_dose_response_25Feb20.xlsx'
GDSC2_IC50_FILE = 'GDSC2_fitted_dose_response_25Feb20.xlsx'

# Load both fitted dose-response workbooks and report import time and shape for each.
# GDSC1
start = time.time()
gdsc1_ic50s = pd.read_excel(RAW_DATA_PATH + GDSC1_IC50_FILE, header=0)
elapsed = time.time() - start
print(f"File `{GDSC1_IC50_FILE}` took {elapsed:.5f} seconds to import. It has shape {gdsc1_ic50s.shape}")

# GDSC2
start = time.time()
gdsc2_ic50s = pd.read_excel(RAW_DATA_PATH + GDSC2_IC50_FILE, header=0)
elapsed = time.time() - start
print(f"File `{GDSC2_IC50_FILE}` took {elapsed:.5f} seconds to import. It has shape {gdsc2_ic50s.shape}")

File `GDSC1_fitted_dose_response_25Feb20.xlsx` took 84.06795 seconds to import. It has shape (310904, 19)
File `GDSC2_fitted_dose_response_25Feb20.xlsx` took 36.50131 seconds to import. It has shape (135242, 19)


In [56]:
# Preview the first three rows of the GDSC1 IC50 table.
gdsc1_ic50s.head(3)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC1,281,12974350,683665,MC-CAR,SIDM00636,MM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.395685,0.982114,0.022521,-0.189576
1,GDSC1,281,12975300,684055,ES3,SIDM00265,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.140923,0.984816,0.03184,0.508635
2,GDSC1,281,12975647,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.968757,0.985693,0.026052,1.284229


In [5]:
# Print the first three rows of the GDSC1 and GDSC2 IC50 tables as markdown,
# separated by an empty line.
print(gdsc1_ic50s.head(3).to_markdown())
print()
print(gdsc2_ic50s.head(3).to_markdown())

|    | DATASET   |   NLME_RESULT_ID |   NLME_CURVE_ID |   COSMIC_ID | CELL_LINE_NAME   | SANGER_MODEL_ID   | TCGA_DESC    |   DRUG_ID | DRUG_NAME   | PUTATIVE_TARGET   | PATHWAY_NAME   |   COMPANY_ID | WEBRELEASE   |   MIN_CONC |   MAX_CONC |   LN_IC50 |      AUC |     RMSE |   Z_SCORE |
|---:|:----------|-----------------:|----------------:|------------:|:-----------------|:------------------|:-------------|----------:|:------------|:------------------|:---------------|-------------:|:-------------|-----------:|-----------:|----------:|---------:|---------:|----------:|
|  0 | GDSC1     |              281 |        12974350 |      683665 | MC-CAR           | SIDM00636         | MM           |         1 | Erlotinib   | EGFR              | EGFR signaling |         1045 | Y            |   0.007813 |          2 |   2.39568 | 0.982114 | 0.022521 | -0.189576 |
|  1 | GDSC1     |              281 |        12975300 |      684055 | ES3              | SIDM00265         | UNCLASSIFIED |         1

In [57]:
# Stack the GDSC1 and GDSC2 IC50 tables into one frame with a fresh 0..n-1 index.
gdsc_ic50s_join = pd.concat([gdsc1_ic50s, gdsc2_ic50s], ignore_index=True)
print(gdsc_ic50s_join.shape)
# Sanity checks: no index label occurs twice and no row was lost or added.
assert not gdsc_ic50s_join.index.duplicated().any()
assert len(gdsc_ic50s_join) == len(gdsc1_ic50s) + len(gdsc2_ic50s)

(446146, 19)


## Raw data

In [7]:
GDSC1_RAW_FILE = 'GDSC1_public_raw_data_25Feb20.csv'
GDSC2_RAW_FILE = 'GDSC2_public_raw_data_25Feb20.csv'

# Read the raw files.
# `low_memory=False` makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding mixed types.
# NOTE(review): the recorded output of this cell echoes the GDSC1 `read_csv` line the
# way a pandas warning does — presumably a mixed-types DtypeWarning; confirm it no
# longer appears. Both reads use the same option so the two frames are parsed alike.
# GDSC1
start = time.time()
gdsc1_raw = pd.read_csv(f'{RAW_DATA_PATH}{GDSC1_RAW_FILE}', header=0, low_memory=False)
print(f"File `{GDSC1_RAW_FILE}` took {time.time()-start:.5f} seconds to import. It has shape {gdsc1_raw.shape}")

# GDSC2
start = time.time()
gdsc2_raw = pd.read_csv(f'{RAW_DATA_PATH}{GDSC2_RAW_FILE}', header=0, low_memory=False)
print(f"File `{GDSC2_RAW_FILE}` took {time.time()-start:.5f} seconds to import. It has shape {gdsc2_raw.shape}")

  gdsc1_raw = pd.read_csv(f'{RAW_DATA_PATH}{GDSC1_RAW_FILE}', header=0)


File `GDSC1_public_raw_data_25Feb20.csv` took 14.88413 seconds to import. It has shape (5837703, 18)
File `GDSC2_public_raw_data_25Feb20.csv` took 15.84662 seconds to import. It has shape (6646430, 18)


In [8]:
# Print the first three rows of the GDSC1 and GDSC2 raw tables as markdown,
# separated by an empty line.
print(gdsc1_raw.head(3).to_markdown())
print()
print(gdsc2_raw.head(3).to_markdown())

|    | RESEARCH_PROJECT   |   BARCODE |   SCAN_ID | DATE_CREATED         |   SCAN_DATE |   CELL_ID |   MASTER_CELL_ID |   COSMIC_ID | CELL_LINE_NAME   |   SEEDING_DENSITY | DRUGSET_ID   | ASSAY   |   DURATION |   POSITION | TAG     |   DRUG_ID |   CONC |   INTENSITY |
|---:|:-------------------|----------:|----------:|:---------------------|------------:|----------:|-----------------:|------------:|:-----------------|------------------:|:-------------|:--------|-----------:|-----------:|:--------|----------:|-------:|------------:|
|  0 | Sanger_GDSC1       |    100541 |      1765 | 2010-04-18T23:00:00Z |         nan |      2415 |              365 |      924238 | K5               |               250 | 505_a_5      | a       |          3 |          1 | B       |       nan |    nan |       26022 |
|  1 | Sanger_GDSC1       |    100541 |      1765 | 2010-04-18T23:00:00Z |         nan |      2415 |              365 |      924238 | K5               |               250 | 505_a_5      | a    

In [9]:
# Stack the GDSC1 and GDSC2 raw tables into one frame with a fresh 0..n-1 index.
gdsc_raw_join = pd.concat([gdsc1_raw, gdsc2_raw], ignore_index=True)
print(gdsc_raw_join.shape)
# Sanity checks: no index label occurs twice and no row was lost or added.
assert not gdsc_raw_join.index.duplicated().any()
assert len(gdsc_raw_join) == len(gdsc1_raw) + len(gdsc2_raw)

(12484133, 18)


## Preprocess

In [58]:
# Show the shape, the column labels and the first three rows of the joined IC50 table.
print(gdsc_ic50s_join.shape)
print(gdsc_ic50s_join.columns)
gdsc_ic50s_join.head(3)

(446146, 19)
Index(['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID',
       'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'TCGA_DESC', 'DRUG_ID',
       'DRUG_NAME', 'PUTATIVE_TARGET', 'PATHWAY_NAME', 'COMPANY_ID',
       'WEBRELEASE', 'MIN_CONC', 'MAX_CONC', 'LN_IC50', 'AUC', 'RMSE',
       'Z_SCORE'],
      dtype='object')


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC1,281,12974350,683665,MC-CAR,SIDM00636,MM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.395685,0.982114,0.022521,-0.189576
1,GDSC1,281,12975300,684055,ES3,SIDM00265,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.140923,0.984816,0.03184,0.508635
2,GDSC1,281,12975647,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.968757,0.985693,0.026052,1.284229


In [59]:
# Columns of the joined IC50 table that make up the drug response matrix.
COLS_TO_KEEP = [
    'DATASET', 'CELL_LINE_NAME', 'DRUG_NAME', 'DRUG_ID',
    'SANGER_MODEL_ID', 'AUC', 'RMSE', 'Z_SCORE', 'LN_IC50',
]
# Restrict to those columns and drop rows that are exact duplicates on them.
gdsc_base = gdsc_ic50s_join.loc[:, COLS_TO_KEEP].drop_duplicates()
print(gdsc_base.shape)
# Share of missing values per column, in percent.
print("Missing rates:")
print("==============")
missing_counts = gdsc_base.isna().sum()
print(100 * missing_counts / gdsc_base.shape[0])
gdsc_base.head(3)

(446146, 9)
Missing rates:
DATASET            0.0
CELL_LINE_NAME     0.0
DRUG_NAME          0.0
DRUG_ID            0.0
SANGER_MODEL_ID    0.0
AUC                0.0
RMSE               0.0
Z_SCORE            0.0
LN_IC50            0.0
dtype: float64


Unnamed: 0,DATASET,CELL_LINE_NAME,DRUG_NAME,DRUG_ID,SANGER_MODEL_ID,AUC,RMSE,Z_SCORE,LN_IC50
0,GDSC1,MC-CAR,Erlotinib,1,SIDM00636,0.982114,0.022521,-0.189576,2.395685
1,GDSC1,ES3,Erlotinib,1,SIDM00265,0.984816,0.03184,0.508635,3.140923
2,GDSC1,ES5,Erlotinib,1,SIDM00263,0.985693,0.026052,1.284229,3.968757


In [60]:
# Occurrence rate of each GDSC dataset, in percent, rounded to whole numbers.
dataset_counts = gdsc_base['DATASET'].value_counts()
(100 * dataset_counts / len(gdsc_base)).round()

GDSC1    70.0
GDSC2    30.0
Name: DATASET, dtype: float64

In [61]:
# Save the drug response matrix as a pickle; the feature sections of this notebook
# reload it from this file.
gdsc_base.to_pickle(PROCESSED_DATA_PATH + 'drm_full.pkl')

# Features

## Gene expression

In [13]:
# Load the cell line details workbook and show its shape and first three rows.
cl_details = pd.read_excel(RAW_DATA_PATH + 'Cell_Lines_Details.xlsx')
print(cl_details.shape)
print(cl_details.head(3).to_markdown())

(1002, 13)
|    | Sample Name   |   COSMIC identifier | Whole Exome Sequencing (WES)   | Copy Number Alterations (CNA)   | Gene Expression   | Methylation   | Drug       | GDSC                  | GDSC           | Cancer Type             | Microsatellite             | Screen Medium   | Growth Properties   |
|    |               |                     |                                |                                 |                   |               | Response   | Tissue descriptor 1   | Tissue         | (matching TCGA label)   | instability Status (MSI)   |                 |                     |
|    |               |                     |                                |                                 |                   |               |            |                       | descriptor 2   |                         |                            |                 |                     |
|---:|:--------------|--------------------:|:-------------------------------|:--------------------

  warn(msg)


In [33]:
# Load the tab-separated gene expression table; show its shape and a small
# corner of it (first two rows, first ten columns).
gene_expression = pd.read_csv(RAW_DATA_PATH + 'Cell_line_RMA_proc_basalExp.txt', sep="\t")
print(gene_expression.shape)
print(gene_expression.iloc[:2, :10].to_markdown())

(17737, 1020)
|    | GENE_SYMBOLS   | GENE_title                                   |   DATA.906826 |   DATA.687983 |   DATA.910927 |   DATA.1240138 |   DATA.1240139 |   DATA.906792 |   DATA.910688 |   DATA.1240135 |
|---:|:---------------|:---------------------------------------------|--------------:|--------------:|--------------:|---------------:|---------------:|--------------:|--------------:|---------------:|
|  0 | TSPAN6         | tetraspanin 6 [Source:HGNC Symbol;Acc:11858] |       7.63202 |       7.54867 |       8.71234 |        7.79714 |        7.72927 |       7.07453 |        3.2852 |        6.96161 |
|  1 | TNMD           | tenomodulin [Source:HGNC Symbol;Acc:17757]   |       2.96459 |       2.77772 |       2.64351 |        2.81792 |        2.95774 |       2.88968 |        2.8282 |        2.87475 |


In [15]:
CELL_LINE_DETAILS_FILE = 'Cell_Lines_Details.xlsx'
GENE_EXPRESSION_RAW_FILE = 'Cell_line_RMA_proc_basalExp.txt'

# Build the gene expression table with the project helper. As noted for this call, the
# result is laid out as n_cells x n_genes for the cell lines of the GDSC cell line
# annotation file; without a gene selection it holds the data for all genes.
cell_annotations_path = f'{RAW_DATA_PATH}{CELL_LINE_DETAILS_FILE}'
gene_expression_path = f'{RAW_DATA_PATH}{GENE_EXPRESSION_RAW_FILE}'
gene_expr = get_gdsc_gene_expression(
    path_cell_annotations=cell_annotations_path,
    path_gene_expression=gene_expression_path,
)
print(gene_expr.shape)
gene_expr.head(3)

Note: 50 Cosmic IDs not found in cell annotation data: 
['906815', '1330955', '907284', '1330944', '1330943', '1298232', '1299076', '1299051', '906829', '1290773', '907049', '909730', '910946', '907391', '687509', '1327761', '753536', '1240152', '925342', '687814', '1479994', '1479991', '1298355', '1331026', '1298150', '1479993', '11223344', '1330945', '1479992', '998179', '1331060', '1331028', '1503362.1', '906850', '1240211', '1299049', '908119', '1331031', '907785', '1330983.1', '906867', '1723793', '1723794', '1240156', '1299067', '906803', '1298154', '909976.1', '1659787', '905954.1']
(1018, 17737)


  warn(msg)


Sample Name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,LINC00526,PPY2,nan,nan.1,KRT18P55,nan.2,POLRMTP1,UBL5P2,TBC1D3P5,nan.3
CAL-120,7.632023,2.964585,10.379553,3.614794,3.380681,3.324692,3.56635,8.20453,5.235118,5.369039,...,6.786925,2.997054,3.109774,7.882377,3.331134,2.852537,3.130696,9.986616,3.073724,7.284733
DMS-114,7.548671,2.777716,11.807341,4.066887,3.732485,3.152404,7.827172,6.616972,5.809264,7.209653,...,5.317911,3.263745,3.059424,8.681302,2.992611,2.776771,3.260982,9.002814,3.000182,8.504804
CAL-51,8.712338,2.643508,9.880733,3.95623,3.23662,3.241246,2.931034,8.191246,5.426841,5.120747,...,3.143006,3.112145,2.930254,8.707886,2.886574,2.685307,3.176239,9.113243,2.916274,7.059092


In [16]:
LANDMARK_GENES_FILE = 'landmark_genes.csv'

# The landmark gene file is parsed as tab-separated despite its `.csv` extension.
landmark_genes = pd.read_csv(f'{RAW_DATA_PATH}{LANDMARK_GENES_FILE}', sep="\t")
print(landmark_genes.shape)
print(landmark_genes.head(3).to_markdown())


(978, 7)
|    |   Entrez ID | Symbol   | Name                   | Gene Family                | Type     |   RNA-Seq Correlation |   RNA-Seq Correlation Self-Rank |
|---:|------------:|:---------|:-----------------------|:---------------------------|:---------|----------------------:|--------------------------------:|
|  0 |        3638 | INSIG1   | insulin induced gene 1 | nan                        | landmark |                   nan |                             nan |
|  1 |        2309 | FOXO3    | forkhead box O3        | Forkhead boxes             | landmark |                   nan |                             nan |
|  2 |        1001 | CDH3     | cadherin 3             | Type I classical cadherins | landmark |                   nan |                             nan |


In [17]:
# Keep only the gene columns of the gene expression table whose symbol appears in the
# landmark gene file. The boolean mask keeps the columns in their original order, so
# the result (and the pickle built from it) has the same layout on every run; going
# through a Python `set` would yield an arbitrary column order that can change between
# kernel sessions.
is_landmark = gene_expr.columns.isin(landmark_genes.Symbol.dropna())
inter_cols = gene_expr.columns[is_landmark].tolist()
# `rename_axis` returns a new frame instead of renaming the column index in place.
# The columns axis keeps the name 'CELL_LINE_NAME' that it was given so far.
gene_expr_sparse = gene_expr.loc[:, is_landmark].rename_axis('CELL_LINE_NAME', axis='columns')
print(gene_expr_sparse.shape)
gene_expr_sparse.head(3)

(1018, 908)


CELL_LINE_NAME,LPAR2,PEX11A,MAPKAPK2,RSU1,FIS1,PRKACA,BZW2,CD40,TLR4,HOMER2,...,CXCR4,NOTCH1,PLEKHM1,TBX2,RRP1B,MLLT11,UBE3B,VAPB,CDK5R1,STAT5B
CAL-120,5.519982,4.759168,4.300245,8.369014,9.344897,3.317099,9.191766,3.582643,2.993602,3.272433,...,2.835618,3.189415,6.167946,3.106098,7.621849,5.515228,4.08086,4.888738,4.164141,3.608917
DMS-114,6.54149,5.977963,3.612725,9.685341,9.122008,3.529718,6.874194,6.943208,3.053156,4.044992,...,3.50728,3.174291,5.271099,7.758626,8.088948,7.860457,4.212928,4.780611,4.723101,4.345699
CAL-51,7.489398,4.040322,4.8883,8.332596,9.567052,3.27217,9.779323,3.720286,2.802697,3.619598,...,2.971266,3.671769,5.784561,2.860879,8.447288,4.578649,4.108093,4.850897,2.921327,4.064676


In [18]:
gdsc_base = pd.read_pickle(PROCESSED_DATA_PATH + 'drm_full.pkl')

# Attach the landmark gene expression values to each drug response row: left join of
# the response table's CELL_LINE_NAME column against the index of the expression table.
cols_to_join_on = ['CELL_LINE_NAME']
join_gdsc_geneexpr = pd.merge(
    gdsc_base,
    gene_expr_sparse,
    how='left',
    left_on=cols_to_join_on,
    right_index=True,
    suffixes=['_gdsc', '_geneexpr'],
)
print(join_gdsc_geneexpr.shape)
join_gdsc_geneexpr.head(3)

(446146, 916)


Unnamed: 0,DATASET,CELL_LINE_NAME,DRUG_NAME,DRUG_ID,AUC,RMSE,Z_SCORE,LN_IC50,LPAR2,PEX11A,...,CXCR4,NOTCH1,PLEKHM1,TBX2,RRP1B,MLLT11,UBE3B,VAPB,CDK5R1,STAT5B
0,GDSC1,MC-CAR,Erlotinib,1,0.982114,0.022521,-0.189576,2.395685,5.377255,4.000075,...,9.345843,3.631735,6.312806,3.16956,8.001636,5.106525,3.911568,5.059999,6.212202,5.440448
1,GDSC1,ES3,Erlotinib,1,0.984816,0.03184,0.508635,3.140923,6.288406,5.435063,...,3.00556,3.435098,6.188079,2.935917,8.226306,7.938137,4.489525,5.751381,4.490407,3.992091
2,GDSC1,ES5,Erlotinib,1,0.985693,0.026052,1.284229,3.968757,6.166101,6.351613,...,3.852866,3.490747,6.34509,3.02521,8.219939,8.871089,4.315447,5.074429,5.300401,4.583204


In [19]:
# Save the GDSC table with the gene expression information as a pickle under the
# processed data path.
join_gdsc_geneexpr.to_pickle(PROCESSED_DATA_PATH + 'gexpr_full.pkl')

## Copy number variation - picnic

In [29]:
start = time.time()
# Column labels come from row 1 (0-based) of the file. The first two columns are
# relabelled to GENE_ID / GENE_SYMBOL and the first data row is dropped.
cnv_picnic_v0 = (
    pd.read_csv(RAW_DATA_PATH + 'cnv_abs_copy_number_picnic_20191101.csv', sep=",", header=1)
    .rename(columns={'model_name': 'GENE_ID', 'Unnamed: 1': 'GENE_SYMBOL'})
    .iloc[1:, :]
)
elapsed = time.time() - start
print(f"File `cnv_abs_copy_number_picnic_20191101.csv` took {elapsed:.5f} seconds to import. \nShape: {cnv_picnic_v0.shape}")
cnv_picnic_v0.head(3)

File `cnv_abs_copy_number_picnic_20191101.csv` took 3.31045 seconds to import. 
Shape: (24502, 988)


Unnamed: 0,GENE_ID,GENE_SYMBOL,M14,TE-12,TMK-1,STS-0421,PL4,PCI-4B,PCI-30,HSC-39,...,451Lu,MMAc-SF,BE-13,MC-IXC,Ramos-2G6-4C10,CGTH-W-1,H9,GR-ST,YMB-1-E,MM1S
1,SIDG00001,A1BG,3.0,3.0,3.0,4.0,6.0,3.0,2.0,3.0,...,4.0,3.0,4.0,2.0,2.0,4.0,4.0,2.0,3.0,2.0
2,SIDG00003,A1CF,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,...,4.0,2.0,3.0,1.0,2.0,3.0,4.0,2.0,3.0,2.0
3,SIDG00004,A2M,3.0,3.0,2.0,4.0,9.0,3.0,3.0,3.0,...,5.0,3.0,4.0,2.0,2.0,4.0,3.0,2.0,5.0,2.0


In [32]:
# Print a small corner of the CNV picnic table (first three rows, first ten columns) as markdown.
print(cnv_picnic_v0.iloc[:3, :10].to_markdown())

|    | GENE_ID   | GENE_SYMBOL   |   M14 |   TE-12 |   TMK-1 |   STS-0421 |   PL4 |   PCI-4B |   PCI-30 |   HSC-39 |
|---:|:----------|:--------------|------:|--------:|--------:|-----------:|------:|---------:|---------:|---------:|
|  1 | SIDG00001 | A1BG          |     3 |       3 |       3 |          4 |     6 |        3 |        2 |        3 |
|  2 | SIDG00003 | A1CF          |     3 |       3 |       3 |          4 |     3 |        3 |        3 |        3 |
|  3 | SIDG00004 | A2M           |     3 |       3 |       2 |          4 |     9 |        3 |        3 |        3 |


Note that here the columns are the cell-line names and the rows are identified by the genes.

In [27]:
# Turn the table around so that cell lines become rows and genes become columns:
# leave out the first column (GENE_ID), transpose, use the first transposed row (the
# GENE_SYMBOL values) as the new column labels and then remove that row from the data.
# NOTE(review): the transpose includes the string-valued GENE_SYMBOL row, so the
# resulting columns are presumably of dtype object rather than numeric — confirm.
cnv_picnic_v1 = cnv_picnic_v0.iloc[:, 1:].T
cnv_picnic_v1 = cnv_picnic_v1.rename(columns=cnv_picnic_v1.iloc[0]).drop(cnv_picnic_v1.index[0])
print(cnv_picnic_v1.shape)
cnv_picnic_v1.head(3)

(986, 24502)


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A2ML1-AS1,A2ML1-AS2,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11AP1,ZYG11B,ZYX,ZZEF1,ZZZ3
M14,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,...,3.0,2.0,2.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0
TE-12,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,...,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
TMK-1,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,...,3.0,2.0,2.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0


In [35]:
# ------------------------------------------------------------------- #
# Sparse down the gene column dimension by only using landmark genes. #
# ------------------------------------------------------------------- #
LANDMARK_GENES_FILE = 'landmark_genes.csv'

landmark_genes = pd.read_csv(RAW_DATA_PATH + LANDMARK_GENES_FILE, sep="\t")
print(landmark_genes.shape)

# Select the gene columns of the CNV picnic table whose symbol is a landmark gene.
# The symbols are collected into a set once, so each membership test is a hash lookup
# instead of rebuilding and scanning the full symbol list for every column.
landmark_symbols = set(landmark_genes.Symbol.dropna())
candidate_cols = cnv_picnic_v1.columns[cnv_picnic_v1.columns != 'nan']
cols_to_keep = [c for c in candidate_cols if c in landmark_symbols]
count = len(cols_to_keep)

cnv_picnic_v2 = cnv_picnic_v1[cols_to_keep]
# Selecting by label returns every column carrying that label, so this also guards
# against duplicated gene symbols among the kept columns.
assert cnv_picnic_v2.shape[1] == len(cols_to_keep)
print(cnv_picnic_v2.shape)
cnv_picnic_v2.head(3)

(978, 7)
(986, 966)


Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
M14,3.0,3.0,4.0,3.0,4.0,2.0,3.0,3.0,3.0,2.0,...,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0
TE-12,4.0,3.0,3.0,2.0,3.0,4.0,2.0,4.0,2.0,2.0,...,3.0,4.0,5.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0
TMK-1,3.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,...,2.0,2.0,5.0,3.0,3.0,2.0,3.0,3.0,2.0,4.0


In [37]:
gdsc_base = pd.read_pickle(PROCESSED_DATA_PATH + 'drm_full.pkl')

# Attach the landmark-gene CNV picnic values to each drug response row: left join of
# the response table's CELL_LINE_NAME column against the index of the CNV table.
cols_to_join_on = ['CELL_LINE_NAME']
join_gdsc_cnv_picnic = pd.merge(
    gdsc_base,
    cnv_picnic_v2,
    how='left',
    left_on=cols_to_join_on,
    right_index=True,
    suffixes=['_gdsc', '_cnvp'],
)
print(join_gdsc_cnv_picnic.shape)
join_gdsc_cnv_picnic.head(3)

(446146, 974)


Unnamed: 0,DATASET,CELL_LINE_NAME,DRUG_NAME,DRUG_ID,AUC,RMSE,Z_SCORE,LN_IC50,AARS,ABCB6,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,GDSC1,MC-CAR,Erlotinib,1,0.982114,0.022521,-0.189576,2.395685,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
1,GDSC1,ES3,Erlotinib,1,0.984816,0.03184,0.508635,3.140923,2.0,2.0,...,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0
2,GDSC1,ES5,Erlotinib,1,0.985693,0.026052,1.284229,3.968757,3.0,4.0,...,2.0,3.0,3.0,3.0,4.0,6.0,4.0,3.0,4.0,3.0


In [38]:
# Save the GDSC table with the copy number picnic information as a pickle under the
# processed data path.
join_gdsc_cnv_picnic.to_pickle(PROCESSED_DATA_PATH + 'cnvp_full.pkl')

## Copy number variation - gistic

In [39]:
start = time.time()
# Column labels come from row 1 (0-based) of the file. The first two columns are
# relabelled to GENE_ID / GENE_SYMBOL and the first data row is dropped.
cnv_gistic_v0 = (
    pd.read_csv(RAW_DATA_PATH + 'cnv_gistic_20191101.csv', sep=",", header=1)
    .rename(columns={'model_name': 'GENE_ID', 'Unnamed: 1': 'GENE_SYMBOL'})
    .iloc[1:, :]
)
elapsed = time.time() - start
print(f"File `cnv_gistic_20191101.csv` took {elapsed:.5f} seconds to import. \nShape: {cnv_gistic_v0.shape}")
cnv_gistic_v0.head(3)

File `cnv_gistic_20191101.csv` took 3.35268 seconds to import. 
Shape: (20669, 980)


Unnamed: 0,GENE_ID,GENE_SYMBOL,M14,TE-12,TMK-1,STS-0421,PL4,PCI-4B,PCI-30,HSC-39,...,451Lu,MMAc-SF,BE-13,MC-IXC,Ramos-2G6-4C10,CGTH-W-1,H9,GR-ST,YMB-1-E,MM1S
1,SIDG00001,A1BG,0.0,-1.0,0.0,0.0,1.0,-1.0,0.0,0.0,...,0.0,-1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,SIDG00002,A1BG-AS1,0.0,-1.0,0.0,0.0,1.0,-1.0,0.0,0.0,...,0.0,-1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,SIDG00003,A1CF,-1.0,0.0,0.0,1.0,-1.0,0.0,1.0,0.0,...,0.0,-1.0,-1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [40]:
# Print a small corner of the CNV gistic table (first three rows, first ten columns) as markdown.
print(cnv_gistic_v0.iloc[:3, :10].to_markdown())

|    | GENE_ID   | GENE_SYMBOL   |   M14 |   TE-12 |   TMK-1 |   STS-0421 |   PL4 |   PCI-4B |   PCI-30 |   HSC-39 |
|---:|:----------|:--------------|------:|--------:|--------:|-----------:|------:|---------:|---------:|---------:|
|  1 | SIDG00001 | A1BG          |     0 |      -1 |       0 |          0 |     1 |       -1 |        0 |        0 |
|  2 | SIDG00002 | A1BG-AS1      |     0 |      -1 |       0 |          0 |     1 |       -1 |        0 |        0 |
|  3 | SIDG00003 | A1CF          |    -1 |       0 |       0 |          1 |    -1 |        0 |        1 |        0 |


Note that here the columns are the cell-line names and the rows are identified by the genes.

In [41]:
# Turn the table around so that cell lines become rows and genes become columns:
# leave out the first column (GENE_ID), transpose, use the first transposed row (the
# GENE_SYMBOL values) as the new column labels and then remove that row from the data.
# NOTE(review): the transpose includes the string-valued GENE_SYMBOL row, so the
# resulting columns are presumably of dtype object rather than numeric — confirm.
cnv_gistic_v1 = cnv_gistic_v0.iloc[:, 1:].T
cnv_gistic_v1 = cnv_gistic_v1.rename(columns=cnv_gistic_v1.iloc[0]).drop(cnv_gistic_v1.index[0])
print(cnv_gistic_v1.shape)
cnv_gistic_v1.head(3)

(978, 20669)


Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2ML1,A2MP1,A4GALT,A4GNT,AAAS,AACS,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
M14,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,-1.0,0.0,1.0,-1.0,1.0,1.0,1.0,2.0,2.0,2.0
TE-12,-1.0,-1.0,0.0,1.0,1.0,1.0,-1.0,0.0,0.0,0.0,...,-1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
TMK-1,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,...,-1.0,1.0,0.0,1.0,-2.0,0.0,0.0,2.0,-1.0,1.0


In [42]:
# ------------------------------------------------------------------- #
# Sparse down the gene column dimension by only using landmark genes. #
# ------------------------------------------------------------------- #
LANDMARK_GENES_FILE = 'landmark_genes.csv'

landmark_genes = pd.read_csv(RAW_DATA_PATH + LANDMARK_GENES_FILE, sep="\t")
print(landmark_genes.shape)

# Select the gene columns of the CNV gistic table whose symbol is a landmark gene.
# The symbols are collected into a set once, so each membership test is a hash lookup
# instead of rebuilding and scanning the full symbol list for every column.
landmark_symbols = set(landmark_genes.Symbol.dropna())
candidate_cols = cnv_gistic_v1.columns[cnv_gistic_v1.columns != 'nan']
cols_to_keep = [c for c in candidate_cols if c in landmark_symbols]
count = len(cols_to_keep)

cnv_gistic_v2 = cnv_gistic_v1[cols_to_keep]
# Selecting by label returns every column carrying that label, so this also guards
# against duplicated gene symbols among the kept columns.
assert cnv_gistic_v2.shape[1] == len(cols_to_keep)
print(cnv_gistic_v2.shape)
cnv_gistic_v2.head(3)

(978, 7)
(978, 938)


Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
M14,0.0,0.0,1.0,0.0,1.0,-1.0,0.0,0.0,0.0,-1.0,...,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
TE-12,1.0,-1.0,0.0,-1.0,0.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,1.0,1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,1.0
TMK-1,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,...,-1.0,-1.0,1.0,0.0,0.0,-1.0,1.0,0.0,-1.0,1.0


In [43]:
gdsc_base = pd.read_pickle(PROCESSED_DATA_PATH + 'drm_full.pkl')

# Attach the landmark-gene CNV gistic values to each drug response row: left join of
# the response table's CELL_LINE_NAME column against the index of the CNV table.
cols_to_join_on = ['CELL_LINE_NAME']
join_gdsc_cnv_gistic = pd.merge(
    gdsc_base,
    cnv_gistic_v2,
    how='left',
    left_on=cols_to_join_on,
    right_index=True,
    suffixes=['_gdsc', '_cnvg'],
)
print(join_gdsc_cnv_gistic.shape)
join_gdsc_cnv_gistic.head(3)

(446146, 946)


Unnamed: 0,DATASET,CELL_LINE_NAME,DRUG_NAME,DRUG_ID,AUC,RMSE,Z_SCORE,LN_IC50,AARS,ABCB6,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,GDSC1,MC-CAR,Erlotinib,1,0.982114,0.022521,-0.189576,2.395685,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
1,GDSC1,ES3,Erlotinib,1,0.984816,0.03184,0.508635,3.140923,0.0,0.0,...,-1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,GDSC1,ES5,Erlotinib,1,0.985693,0.026052,1.284229,3.968757,-1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,-1.0,0.0,-1.0


In [44]:
# Save the GDSC table with the copy number gistic information as a pickle under the
# processed data path.
join_gdsc_cnv_gistic.to_pickle(PROCESSED_DATA_PATH + 'cnvg_full.pkl')

## Mutation

In [52]:
start = time.time()
mutations_v0 = pd.read_csv(RAW_DATA_PATH + 'mutations_all_20220315.csv', sep=",", header=0)
print(f"File `mutations_all_20220315.csv` took {time.time()-start:.5f} seconds to import. \nShape: {mutations_v0.shape}")
mutations_v0.head(5)

File `mutations_all_20220315.csv` took 20.57474 seconds to import. 
Shape: (8322616, 13)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name
0,SIDG36104,SOX30,SIDM02021,-,r.?,-,False,False,intronic,0.2703,False,Sanger,HCM-SANG-1082-C15
1,SIDG37647,THG1L,SIDM02021,-,r.?,-,False,False,intronic,0.75,False,Sanger,HCM-SANG-1082-C15
2,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,intronic,0.6452,False,Sanger,HCM-SANG-1082-C15
3,SIDG16920,LSM11,SIDM02021,p.?,r.3207delU,c.?,False,False,3prime_UTR_variant,0.7692,False,Sanger,HCM-SANG-1082-C15
4,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,downstream,0.3571,False,Sanger,HCM-SANG-1082-C15


In [53]:
# Print the first three rows of the mutations table as markdown.
print(mutations_v0.head(3).to_markdown())

|    | gene_id   | gene_symbol   | model_id   | protein_mutation   | rna_mutation   | cdna_mutation   | cancer_driver   | cancer_predisposition_variant   | effect   |    vaf | coding   | source   | model_name        |
|---:|:----------|:--------------|:-----------|:-------------------|:---------------|:----------------|:----------------|:--------------------------------|:---------|-------:|:---------|:---------|:------------------|
|  0 | SIDG36104 | SOX30         | SIDM02021  | -                  | r.?            | -               | False           | False                           | intronic | 0.2703 | False    | Sanger   | HCM-SANG-1082-C15 |
|  1 | SIDG37647 | THG1L         | SIDM02021  | -                  | r.?            | -               | False           | False                           | intronic | 0.75   | False    | Sanger   | HCM-SANG-1082-C15 |
|  2 | SIDG16920 | LSM11         | SIDM02021  | -                  | r.?            | -               | False           | False 

In [63]:
gdsc_base = pd.read_pickle(PROCESSED_DATA_PATH + 'drm_full.pkl')

# Count the distinct CELL_LINE_NAME's per SANGER_MODEL_ID.
celllines_per_sangermodelid = gdsc_base.groupby('SANGER_MODEL_ID')['CELL_LINE_NAME'].nunique()
counts_per_sangermodelid = celllines_per_sangermodelid.values

# The mapping built below is only meaningful if every SANGER_MODEL_ID belongs to exactly
# one CELL_LINE_NAME. One check with a message replaces the two equivalent assertions.
ambiguous_ids = celllines_per_sangermodelid[celllines_per_sangermodelid != 1]
assert ambiguous_ids.empty, (
    f"{len(ambiguous_ids)} SANGER_MODEL_ID(s) do not map to exactly one CELL_LINE_NAME, "
    f"e.g. {ambiguous_ids.index.tolist()[:5]}"
)

# Only take the columns needed for the mapping and reduce them to one row per
# SANGER_MODEL_ID (safe because of the 1-to-1 check above).
gdsc_mapping_subset = gdsc_base[['SANGER_MODEL_ID', 'CELL_LINE_NAME']]
gdsc_mapping_subset = gdsc_mapping_subset.groupby('SANGER_MODEL_ID').first().reset_index(level=0)

# Join the CELL_LINE_NAME's onto the mutations_all dataset, based on the model_id.
# `validate='many_to_one'` makes pandas raise if the mapping ever contains a
# SANGER_MODEL_ID twice, which would silently duplicate mutation rows.
mutations_v1 = mutations_v0.merge(right=gdsc_mapping_subset,
                                  left_on='model_id',
                                  right_on='SANGER_MODEL_ID',
                                  how='left',
                                  validate='many_to_one')
print("We joined the mutations dataset with the GDSC table to get the SANGER_MODEL_ID & CELL_LINE_NAME for each row in the mutations dataset.")
print(f"Shape: {mutations_v1.shape}")
mutations_v1.head(5)

We joined the mutations dataset with the GDSC table to get the SANGER_MODEL_ID & CELL_LINE_NAME for each row in the mutations dataset.
Shape: (8322616, 15)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name,SANGER_MODEL_ID,CELL_LINE_NAME
0,SIDG36104,SOX30,SIDM02021,-,r.?,-,False,False,intronic,0.2703,False,Sanger,HCM-SANG-1082-C15,,
1,SIDG37647,THG1L,SIDM02021,-,r.?,-,False,False,intronic,0.75,False,Sanger,HCM-SANG-1082-C15,,
2,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,intronic,0.6452,False,Sanger,HCM-SANG-1082-C15,,
3,SIDG16920,LSM11,SIDM02021,p.?,r.3207delU,c.?,False,False,3prime_UTR_variant,0.7692,False,Sanger,HCM-SANG-1082-C15,,
4,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,downstream,0.3571,False,Sanger,HCM-SANG-1082-C15,,


Note: Each `SANGER_MODEL_ID` maps to exactly one `CELL_LINE_NAME`; this is what the assertions in the cell above check.

In [64]:
 # Only taking the ones with a CELL_LINE_NAME.
# Boolean mask of the rows for which the join found a CELL_LINE_NAME.
has_cell_line = mutations_v1['CELL_LINE_NAME'].notna()
mutations_v2 = mutations_v1.loc[has_cell_line]
print("We excluded all rows which have CELL_LINE_NAME as NaN.")
n_gene_symbols = len(np.unique(mutations_v2.gene_symbol))
print(f"Number of unique `gene_symbol`s in the mutations dataset: {n_gene_symbols}")
print(f"Shape: {mutations_v2.shape}")
mutations_v2.head(5)

We excluded all rows which have CELL_LINE_NAME as NaN.
Number of unique `gene_symbol`s in the mutations dataset: 20450
Shape: (4574963, 15)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name,SANGER_MODEL_ID,CELL_LINE_NAME
19,SIDG40054,UAP1,SIDM00615,-,r.?,-,False,False,intronic,0.2857,False,Sanger,JHH-6,SIDM00615,JHH-6
20,SIDG13175,IPO11,SIDM01119,-,r.?,-,False,False,intronic,0.3511,False,Sanger,NCI-H727,SIDM01119,NCI-H727
21,SIDG14638,LIN9,SIDM00933,-,r.?,-,False,False,downstream,1.0,False,Sanger,CAL-51,SIDM00933,CAL-51
22,SIDG07173,DYM,SIDM00933,-,r.?,-,False,False,intronic,0.5294,False,Sanger,CAL-51,SIDM00933,CAL-51
23,SIDG08926,FCAMR,SIDM00620,-,r.?,-,False,False,intronic,0.625,False,Sanger,D-336MG,SIDM00620,D-336MG


In [71]:
LANDMARK_GENES_FILE = 'landmark_genes.csv'

# The landmark genes file is tab-separated despite its `.csv` extension.
landmark_genes = pd.read_csv(RAW_DATA_PATH + LANDMARK_GENES_FILE, sep="\t")
print("We use the landmark genes to sparse down the dimension.")

# Inner join on the gene symbol: only mutation rows whose `gene_symbol`
# also occurs in the landmark genes' `Symbol` column are kept.
landmark_symbols = landmark_genes['Symbol']
mutations_v3 = pd.merge(
    mutations_v2,
    landmark_symbols,
    left_on='gene_symbol',
    right_on='Symbol',
)
print("We merged the most recent dataset mutations_v2 with the landmark genes to sparse it down.")
print(f"Shape: {mutations_v3.shape}")
mutations_v3.head(3)

We use the landmark genes to sparse down the dimension.
We merged the most recent dataset mutations_v2 with the landmark genes to sparse it down.
Shape: (235906, 16)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name,SANGER_MODEL_ID,CELL_LINE_NAME,Symbol
0,SIDG40702,VGLL4,SIDM00330,-,r.?,-,False,False,intronic,0.9926,False,Sanger,LAN-6,SIDM00330,LAN-6,VGLL4
1,SIDG40702,VGLL4,SIDM00944,p.?,r.407-7c>u,c.?,False,False,splice_region,0.619,False,Broad,CL-11,SIDM00944,CL-11,VGLL4
2,SIDG40702,VGLL4,SIDM00373,p.?,r.65c>u,c.?,False,False,5prime_UTR_variant,0.4118,False,Broad,SNG-M,SIDM00373,SNG-M,VGLL4


In [72]:
# Restrict the merged mutations table to the columns of interest.
selected_columns = [
    'CELL_LINE_NAME',
    'gene_symbol',
    'model_id',
    'protein_mutation',
    'rna_mutation',
    'cdna_mutation',
    'cancer_driver',
    'vaf',
]
mutations_v4 = mutations_v3[selected_columns]
print("We only took a chosen set of columns.")

# Total row count, used as the denominator of the percentages below.
n_v4 = mutations_v4.shape[0]

# Number of distinct values per identifier-like column.
uniq_cl, uniq_gs, uniq_pm, uniq_rna, uniq_cdna = (
    len(np.unique(mutations_v4[column]))
    for column in (
        'CELL_LINE_NAME',
        'gene_symbol',
        'protein_mutation',
        'rna_mutation',
        'cdna_mutation',
    )
)
# Distinct values of the `cancer_driver` flag; both the count and the values are reported.
driver_values = np.unique(mutations_v4.cancer_driver)

print(f"""Number of unique ...
    CELL_LINE_NAME's   : {uniq_cl:6.0f} ({100*uniq_cl/n_v4:2.2f}% out of all rows)
    gene_symbol's      : {uniq_gs:6.0f} ({100*uniq_gs/n_v4:2.2f}% out of all rows)
    protein_mutation's : {uniq_pm:6.0f} ({100*uniq_pm/n_v4:2.2f}% out of all rows)
    rna_mutation's     : {uniq_rna:6.0f} ({100*uniq_rna/n_v4:2.2f}% out of all rows)
    cdna_mutation's    : {uniq_cdna:6.0f} ({100*uniq_cdna/n_v4:2.2f}% out of all rows)
    cancer_driver's    : {len(driver_values):6.0f} ({driver_values})
""")
print(f"Shape: {mutations_v4.shape}")
mutations_v4.head(5)

We only took a chosen set of columns.
Number of unique ...
    CELL_LINE_NAME's   :    983 (0.42% out of all rows)
    gene_symbol's      :    956 (0.41% out of all rows)
    protein_mutation's :  25113 (10.65% out of all rows)
    rna_mutation's     :  28392 (12.04% out of all rows)
    cdna_mutation's    :  22196 (9.41% out of all rows)
    cancer_driver's    :      2 ([False  True])

Shape: (235906, 8)


Unnamed: 0,CELL_LINE_NAME,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,vaf
0,LAN-6,VGLL4,SIDM00330,-,r.?,-,False,0.9926
1,CL-11,VGLL4,SIDM00944,p.?,r.407-7c>u,c.?,False,0.619
2,SNG-M,VGLL4,SIDM00373,p.?,r.65c>u,c.?,False,0.4118
3,JHOS-2,VGLL4,SIDM00305,-,r.?,-,False,0.5698
4,NCI-H1436,VGLL4,SIDM00697,-,r.?,-,False,0.9677


In [83]:
# Pivot to one row per cell line and one column per gene symbol.
# The aggregated value (sum of the boolean `cancer_driver` flag) is only used
# to detect whether a (cell line, gene) pair occurs at all: pairs without any
# row in `mutations_v4` come out as NaN, pairs with at least one row are non-NaN.
# The string 'sum' is used instead of `np.sum`, which recent pandas versions
# warn about when passed as `aggfunc`.
driver_sums = pd.pivot_table(
    data    = mutations_v4,
    values  = 'cancer_driver',
    index   = ['CELL_LINE_NAME'],
    columns = ['gene_symbol'],
    aggfunc = 'sum',
    dropna  = False
)

# Set mutation values: 1.0=mutation, 0.0=no_mutation
# Deriving the indicator from presence (non-NaN) keeps it strictly binary.
# The previous approach only mapped 0.0 -> 1.0 and NaN -> 0.0, so pairs with
# several driver mutations kept their raw sum (2.0, 3.0, ...).
mutations_v5 = driver_sums.notna().astype(float)

# Turn the CELL_LINE_NAME index into the first regular column.
mutations_v5 = mutations_v5.reset_index()
mutations_v5.columns.name = 'GENE_SYMBOL'

print(f"Shape: {mutations_v5.shape}")
mutations_v5.head(5)

Shape: (983, 957)


GENE_SYMBOL,CELL_LINE_NAME,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,22RV1,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,23132-87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,42-MG-BA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,451Lu,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,5637,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
# Load the processed drug response matrix that serves as the base table.
gdsc_base = pd.read_pickle(PROCESSED_DATA_PATH + 'drm_full.pkl')

# Left-join the pivoted mutation table (one column per gene symbol) onto the
# GDSC table by cell line; a left join keeps every row of the GDSC table.
join_gdsc_mut = pd.merge(
    left=gdsc_base,
    right=mutations_v5,
    how='left',
    on=['CELL_LINE_NAME'],
    suffixes=['_gdsc', '_mut'],
)
print(join_gdsc_mut.shape)
join_gdsc_mut.head(3)

(446146, 965)


Unnamed: 0,DATASET,CELL_LINE_NAME,DRUG_NAME,DRUG_ID,SANGER_MODEL_ID,AUC,RMSE,Z_SCORE,LN_IC50,ABCB6,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,GDSC1,MC-CAR,Erlotinib,1,SIDM00636,0.982114,0.022521,-0.189576,2.395685,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,GDSC1,ES3,Erlotinib,1,SIDM00265,0.984816,0.03184,0.508635,3.140923,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GDSC1,ES5,Erlotinib,1,SIDM00263,0.985693,0.026052,1.284229,3.968757,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
# Persist the GDSC table enriched with the mutation columns as a pickle file.
mut_output_file = PROCESSED_DATA_PATH + 'mut_full.pkl'
join_gdsc_mut.to_pickle(mut_output_file)

# TODO 

- [x] Write generalized methods for all datasets and integrate in separate `.py` scripts
- [ ] Later on: create a separate table/dataset holding the drug concentration for each row in the final drug-response matrix (use the raw GDSC dataset for that).