In [2]:
import sys
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

sns.set_theme(style="white")

# Copy Number Variation Insights

The objective of this notebook is to map copy number variation (CNV) information to the GDSC database. In folder `knowledge_base/table_contents.md` it is described from where the CNV and GDSC data is originating from. In the previous notebook `02_GDSC_map_GenExpr.ipynb` we mapped the GDSC data to gene-expression information. The resulting table is saved in `temp_datasets/joined_gdsc_geneexpr.pkl` which will be the starting point for this notebook.

In [3]:
!pwd
!find ../../datasets/gdsc/cell_line_genomic_data -name '*cnv*.csv' -ls

/Users/cwoest/Documents/Academics/Data_Science_UP/master_thesis/material/GNN-material
37667852   189544 -rw-------    1 cwoest           staff            97043230 Nov  1  2019 ../../datasets/gdsc/cell_line_genomic_data/cnv_abs_copy_number_picnic_20191101.csv
37667853   169408 -rw-------    1 cwoest           staff            86734917 Nov  1  2019 ../../datasets/gdsc/cell_line_genomic_data/cnv_gistic_20191101.csv


The files we will use for mapping are
- `cnv_abs_copy_number_picnic_20191101.csv`
- `cnv_gistic_20191101.csv`

In [4]:
PATH_TO_CNV_DATA = '../../datasets/gdsc/cell_line_genomic_data/'
PATH_TO_SAVE_DATA_TO = '../../datasets/gdsc/my_datasets/'

CNV_PICNIC = 'cnv_abs_copy_number_picnic_20191101.csv'
CNV_GISTIC = 'cnv_gistic_20191101.csv'

In [5]:
# First, we load the GDSC dataset from the previous notebook.
GDSC = pd.read_pickle(f'{PATH_TO_SAVE_DATA_TO}joined_gdsc_geneexpr.pkl')
print(GDSC.shape)
GDSC.head(3)

(446521, 922)


Unnamed: 0,MASTER_CELL_ID,CELL_ID,CONC,CELL_LINE_NAME,RMSE,AUC,INTENSITY,COSMIC_ID,DRUG_NAME,POSITION,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
0,49,3137,2.0,MC-CAR,0.022521,0.982114,544404,683665,Erlotinib,14,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
9,1342,2366,2.0,ES3,0.03184,0.984816,404197,684055,Erlotinib,14,...,5.99576,9.337588,7.468226,3.71627,5.363887,6.188079,12.281947,4.794624,3.588528,6.785201
27,610,2368,2.0,ES5,0.026052,0.985693,797378,684057,Erlotinib,14,...,6.939741,8.688176,7.085349,3.688222,4.572119,6.34509,12.276166,4.114092,5.768098,7.505155


--- 

## CNV PICNIC

In this subsection we will investigate and potentially map the `cnv_abs_copy_number_picnic_20191101.csv` content to the `GDSC` dataset.

In [6]:
CNV_PICNIC

'cnv_abs_copy_number_picnic_20191101.csv'

In [7]:
start = time.time()
cnv_picnic_data = pd.read_csv(f'{PATH_TO_CNV_DATA}{CNV_PICNIC}', sep=",", header=1)
print(f"File `{CNV_PICNIC}` took {time.time()-start:.5f} seconds to import. \nShape: {cnv_picnic_data.shape}")
cnv_picnic_data.head(3)

File `cnv_abs_copy_number_picnic_20191101.csv` took 2.47200 seconds to import. 
Shape: (24503, 988)


Unnamed: 0,model_name,Unnamed: 1,M14,TE-12,TMK-1,STS-0421,PL4,PCI-4B,PCI-30,HSC-39,...,451Lu,MMAc-SF,BE-13,MC-IXC,Ramos-2G6-4C10,CGTH-W-1,H9,GR-ST,YMB-1-E,MM1S
0,gene_id,symbol,,,,,,,,,...,,,,,,,,,,
1,SIDG00001,A1BG,3.0,3.0,3.0,4.0,6.0,3.0,2.0,3.0,...,4.0,3.0,4.0,2.0,2.0,4.0,4.0,2.0,3.0,2.0
2,SIDG00003,A1CF,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,...,4.0,2.0,3.0,1.0,2.0,3.0,4.0,2.0,3.0,2.0


In [8]:
l = []
for c in GDSC.CELL_LINE_NAME.values: 
    if c in cnv_picnic_data.columns:
        l.append(c)
print(len(l))

441777


In [9]:
l = []
for c in GDSC.columns: 
    if c in cnv_picnic_data['Unnamed: 1'].values:
        l.append(c)
print(len(l))

900


`cnv_picnic_data` contains 
- As columns `CELL_LINES_NAMES`
- Under column `Unnamed: 1` the gene symbols which are columns of the `GDSC` dataset.

A little tweaking needs to be done to clean the `cnv_picnic_data` dataset.

In [10]:
cnv_picnic_data_v2 = cnv_picnic_data.copy()
# Cleaner names.
cnv_picnic_data_v2.rename(columns={
    'Unnamed: 1': 'GENE_SYMBOL',
    'model_name': 'GENE_ID'}, inplace=True)
# Drop 1st row.
cnv_picnic_data_v2 = cnv_picnic_data_v2.iloc[1:, :]
print(cnv_picnic_data_v2.shape)
cnv_picnic_data_v2.head(3)

(24502, 988)


Unnamed: 0,GENE_ID,GENE_SYMBOL,M14,TE-12,TMK-1,STS-0421,PL4,PCI-4B,PCI-30,HSC-39,...,451Lu,MMAc-SF,BE-13,MC-IXC,Ramos-2G6-4C10,CGTH-W-1,H9,GR-ST,YMB-1-E,MM1S
1,SIDG00001,A1BG,3.0,3.0,3.0,4.0,6.0,3.0,2.0,3.0,...,4.0,3.0,4.0,2.0,2.0,4.0,4.0,2.0,3.0,2.0
2,SIDG00003,A1CF,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,...,4.0,2.0,3.0,1.0,2.0,3.0,4.0,2.0,3.0,2.0
3,SIDG00004,A2M,3.0,3.0,2.0,4.0,9.0,3.0,3.0,3.0,...,5.0,3.0,4.0,2.0,2.0,4.0,3.0,2.0,5.0,2.0


`cnv_picnic_data_v2`: 
- Dimension: n_GENE_SYMBOLs x (2 + n_CELL_LINE_NAMEs)

In [11]:
# How many of the 986 (=988-2) CELL_LINE_NAME's in cnv_picnic_data_v2 are also in the GDSC dataset?
cnv_cell_line_names = list(np.unique(cnv_picnic_data_v2.iloc[:, 2:].columns))
gdsc_cell_line_names = list(np.unique(GDSC.CELL_LINE_NAME.values))

print(f"""CELL_LINE_NAME insights:
    CNV PICNIC has {len(cnv_cell_line_names):6.0f}  unique cell line names as columns.
    GDSC       has {len(gdsc_cell_line_names):6.0f} unique cell lines names under the column CELL_LINE_NAME.
""")

# How many of the genes (GENE_SYMBOL) in cnv_picnic_data_v2 are also in the GDSC dataset?
cnv_gene_symbols = list(np.unique(cnv_picnic_data_v2.GENE_SYMBOL.values))
gdsc_gene_symbols = list(np.unique(GDSC.iloc[:, 14:].columns))

print(f"""GENE_SYMBOL insights:
    CNV PICNIC has {len(cnv_gene_symbols):6.0f}  unique gene symbols under the column GENE_SYMBOL.
    GDSC       has {len(gdsc_gene_symbols):6.0f} unique gene_symbols as columns.
""")

CELL_LINE_NAME insights:
    CNV PICNIC has    986  unique cell line names as columns.
    GDSC       has    988 unique cell lines names under the column CELL_LINE_NAME.

GENE_SYMBOL insights:
    CNV PICNIC has  24502  unique gene symbols under the column GENE_SYMBOL.
    GDSC       has    908 unique gene_symbols as columns.



If we are adding again more then 900 columns now to the GDSC table, these columns will also be `GENE_SYMBOL` column, just with different information per cell, namely copy number variations per cell instead of gene expressions as before (see previous notebook). Due to that reason we will now create a new GDSC dataframe. Then, in future steps, we add all dataframes on top to create a tensor with `CELL_LINE_NAME x GENE_SYMBOL x FEATURE_INFORMATION`, where `FEATURE_INFORMATION` can be gene expression, copy number variation or mutation information.

In [12]:
# Ignore the GENE_SYMBOL columns from the GDSC dataset.
gdsc_base = GDSC.iloc[:, :14]
gdsc_base.head(3)

Unnamed: 0,MASTER_CELL_ID,CELL_ID,CONC,CELL_LINE_NAME,RMSE,AUC,INTENSITY,COSMIC_ID,DRUG_NAME,POSITION,DATASET,Z_SCORE,LN_IC50,DRUG_ID
0,49,3137,2.0,MC-CAR,0.022521,0.982114,544404,683665,Erlotinib,14,GDSC1,-0.189576,2.395685,1
9,1342,2366,2.0,ES3,0.03184,0.984816,404197,684055,Erlotinib,14,GDSC1,0.508635,3.140923,1
27,610,2368,2.0,ES5,0.026052,0.985693,797378,684057,Erlotinib,14,GDSC1,1.284229,3.968757,1


In [22]:
# Transpose the CNV table such that the GENE_SYMBOLs are the columns and the CELL_LINE_NAMES are the indexes of the rows.
# Copy CNV dataset without GENE_ID column and transpose.
cnv_picnic_data_v3 = cnv_picnic_data_v2.iloc[:, 1:].copy().T
# Ignore the first row and make the GEN_SYMBOL names as the new columns. Also make the previous CELL_LINE_NAMES columns as the new row index.
cnv_picnic_data_v3 = cnv_picnic_data_v3.rename(columns=cnv_picnic_data_v3.iloc[0]).drop(cnv_picnic_data_v3.index[0])
print(cnv_picnic_data_v3.shape)
cnv_picnic_data_v3.head(3)


(986, 24502)


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A2ML1-AS1,A2ML1-AS2,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11AP1,ZYG11B,ZYX,ZZEF1,ZZZ3
M14,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,...,3.0,2.0,2.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0
TE-12,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,...,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
TMK-1,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,...,3.0,2.0,2.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0


This dataset `cnv_picnic_data_v3` could now be added onto the `gdsc_base` to have the CNV values per cell-line and gene. However, over 24,000 columns (genes/gene symbols) is way too much. Analogous to the previous notebook we will sparse this feature space by using a list of landmark_genes. 

### Sparsing the Feature Space

Since the `cnv_picnic_data_v3` table holds $24,502$ genes (columns) we are trying to sparse down this set of columns by using _LINCS landmark gene symbols_.

- LINCS landmark gene symbols are in file `landmark_genes.csv`

In [23]:
FILENAME_LANDMARK_GENES = 'landmark_genes.csv' 

In [25]:
start = time.time()
landmark_genes = pd.read_csv(f'../../datasets/gdsc/{FILENAME_LANDMARK_GENES}', sep="\t")
print(f"File `{FILENAME_LANDMARK_GENES}` took {time.time()-start:.5f} seconds to import. It has shape {landmark_genes.shape}")
landmark_genes.head(3)

File `landmark_genes.csv` took 0.00743 seconds to import. It has shape (978, 7)


Unnamed: 0,Entrez ID,Symbol,Name,Gene Family,Type,RNA-Seq Correlation,RNA-Seq Correlation Self-Rank
0,3638,INSIG1,insulin induced gene 1,,landmark,,
1,2309,FOXO3,forkhead box O3,Forkhead boxes,landmark,,
2,1001,CDH3,cadherin 3,Type I classical cadherins,landmark,,


In [27]:
cnv_picnic_data_v3.head(3)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A2ML1-AS1,A2ML1-AS2,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11AP1,ZYG11B,ZYX,ZZEF1,ZZZ3
M14,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,...,3.0,2.0,2.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0
TE-12,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,...,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
TMK-1,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,...,3.0,2.0,2.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0


In [28]:
# Check how many cell line columns of the gene expressions table are in the landmark gene file.

count, cols_to_keep = 0, []
for c in cnv_picnic_data_v3.columns[cnv_picnic_data_v3.columns != 'nan']:
    if c in landmark_genes.Symbol.tolist(): 
        count += 1
        cols_to_keep.append(c)
        
print(f"""
    Out of {len(cnv_picnic_data_v3.columns[cnv_picnic_data_v3.columns != 'nan'])} non-nan columns in the gene expression file (`cnv_picnic_data_v3`) {count} columns are respresented  in the landmark_genes.csv file.
    Thus, {100*(1-count/len(cnv_picnic_data_v3.columns[cnv_picnic_data_v3.columns != 'nan'])):2.2f}% will get removed.
""")



    Out of 24502 non-nan columns in the gene expression file (`gene_expr`) 966 columns are respresented  in the landmark_genes.csv file.
    Thus, 96.06% will get removed.



Now we sparse down the columns in `cnv_picnic_data_v3` from $24,502$ to all the ones we found in the `landmark_genes.csv` file und the `Symbol` column. This leaves us with $966$ columns. The remaining $96.06% of all columns in `cnv_picnic_data_v3` will get removed. 

In [30]:
cnv_picnic_data_v4 = cnv_picnic_data_v3[cols_to_keep]
assert cnv_picnic_data_v4.shape[1] == len(cols_to_keep)
print(cnv_picnic_data_v4.shape)
cnv_picnic_data_v4.head(5)

(986, 966)


Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
M14,3.0,3.0,4.0,3.0,4.0,2.0,3.0,3.0,3.0,2.0,...,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0
TE-12,4.0,3.0,3.0,2.0,3.0,4.0,2.0,4.0,2.0,2.0,...,3.0,4.0,5.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0
TMK-1,3.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,...,2.0,2.0,5.0,3.0,3.0,2.0,3.0,3.0,2.0,4.0
STS-0421,4.0,3.0,5.0,3.0,5.0,4.0,4.0,3.0,4.0,3.0,...,5.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0
PL4,4.0,6.0,6.0,5.0,6.0,6.0,2.0,4.0,5.0,3.0,...,4.0,8.0,6.0,6.0,5.0,4.0,5.0,6.0,4.0,5.0


__Summary__:
> `cnv_picnic_data_v4` - Cell lines as index column and genes as columns, where the genes got sparsed down by using landmark gene information.

### Map CNV Picnic Data to GDSC

In [31]:
print(gdsc_base.shape)
gdsc_base.head(3)

(446521, 14)


Unnamed: 0,MASTER_CELL_ID,CELL_ID,CONC,CELL_LINE_NAME,RMSE,AUC,INTENSITY,COSMIC_ID,DRUG_NAME,POSITION,DATASET,Z_SCORE,LN_IC50,DRUG_ID
0,49,3137,2.0,MC-CAR,0.022521,0.982114,544404,683665,Erlotinib,14,GDSC1,-0.189576,2.395685,1
9,1342,2366,2.0,ES3,0.03184,0.984816,404197,684055,Erlotinib,14,GDSC1,0.508635,3.140923,1
27,610,2368,2.0,ES5,0.026052,0.985693,797378,684057,Erlotinib,14,GDSC1,1.284229,3.968757,1


In [32]:
print(cnv_picnic_data_v4.shape)
cnv_picnic_data_v4.head(5)

(986, 966)


Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
M14,3.0,3.0,4.0,3.0,4.0,2.0,3.0,3.0,3.0,2.0,...,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0
TE-12,4.0,3.0,3.0,2.0,3.0,4.0,2.0,4.0,2.0,2.0,...,3.0,4.0,5.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0
TMK-1,3.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,...,2.0,2.0,5.0,3.0,3.0,2.0,3.0,3.0,2.0,4.0
STS-0421,4.0,3.0,5.0,3.0,5.0,4.0,4.0,3.0,4.0,3.0,...,5.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0
PL4,4.0,6.0,6.0,5.0,6.0,6.0,2.0,4.0,5.0,3.0,...,4.0,8.0,6.0,6.0,5.0,4.0,5.0,6.0,4.0,5.0


In [33]:
col = 'CELL_LINE_NAME'

uniq_gdsc, uniq_cnv_picnic = np.unique(gdsc_base[col]), np.unique(cnv_picnic_data_v4.index)
print(f"""
    Number of unique {col}'s 
        in the GDSC db            : {len(uniq_gdsc)}
        in the CNV Picnic dataset : {len(uniq_cnv_picnic)}

    Number of cell-lines in the GDSC which are not in the CNV Picnic dataset: {len(set(uniq_gdsc) - set(uniq_cnv_picnic))}. 
    Thus, there will be no CNV information for these cell-lines.
""")


    Number of unique CELL_LINE_NAME's 
        in the GDSC db            : 988
        in the CNV Picnic dataset : 986

    Number of cell-lines in the GDSC which are not in the CNV Picnic dataset: 12. 
    Thus, there will be no CNV information for these cell-lines.



In [35]:
# Join the sparsed CNV data to the GDSC table.
cols_to_join_on = ['CELL_LINE_NAME']
join_gdsc_cnv_picnic = gdsc_base.merge(right       = cnv_picnic_data_v4,
                                       left_on     = cols_to_join_on,
                                       right_index = True,
                                       how         = 'left',
                                       suffixes    = ['_gdsc', '_cnv'])
print(join_gdsc_cnv_picnic.shape)
join_gdsc_cnv_picnic.head(3)

(446521, 980)


Unnamed: 0,MASTER_CELL_ID,CELL_ID,CONC,CELL_LINE_NAME,RMSE,AUC,INTENSITY,COSMIC_ID,DRUG_NAME,POSITION,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,49,3137,2.0,MC-CAR,0.022521,0.982114,544404,683665,Erlotinib,14,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
9,1342,2366,2.0,ES3,0.03184,0.984816,404197,684055,Erlotinib,14,...,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0
27,610,2368,2.0,ES5,0.026052,0.985693,797378,684057,Erlotinib,14,...,2.0,3.0,3.0,3.0,4.0,6.0,4.0,3.0,4.0,3.0


- `join_gdsc_cnv_picnic` now contains the IC50 values for cell-line, drug combinations and in addition the copy number variation (CNV) values of the specific cell-line to multiple genes. 

In [43]:
# Save the GDSC table with the copy number variation information to a file.
join_gdsc_cnv_picnic.to_pickle(f'{PATH_TO_SAVE_DATA_TO}joined_gdsc_cnv_picnic.pkl')

--- 

## CNV GISTIC 

In this subsection we will investigate and potentially map the `cnv_gistic_20191101.csv` content to the `GDSC` dataset.