In [104]:
%load_ext autoreload
%autoreload 2

import time
import re
import copy
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

sns.set_theme(style="white")

PATH_TO_CNV_DATA = '../../datasets/gdsc/cell_line_genomic_data/'
PATH_TO_SAVE_DATA_TO = '../../datasets/gdsc/my_datasets/'

MUTATION_FILE = 'mutations_all_20220315.csv'

PATH_TO_GDSC_DATASETS = '../../datasets/gdsc/'
FILENAME_LANDMARK_GENES = 'landmark_genes.csv' 

from config import (
    # Drug features.
    PATH_TO_SAVED_DRUG_FEATURES,
    DRUG_FPS_FINAL_FILE_NAME,
    # Cell line features.
    PATH_TO_SAVED_CL_FEATURES,
    GENE_EXPR_FINAL_FILE_NAME,
    CNV_GISTIC_FINAL_FILE_NAME,
    CNV_PICNIC_FINAL_FILE_NAME,
    PATH_TO_FEATURES    
)

ImportError: cannot import name 'PATH_TO_FEATURES' from 'config' (/Users/cwoest/Documents/Academics/Data_Science_UP/master_thesis/material/GNN-material/config.py)

---

# Create Mutations Dataset


In [2]:
# Get GDSC dataset.
GDSC = pd.read_pickle(f'{PATH_TO_SAVE_DATA_TO}joined_gdsc_geneexpr.pkl')
print(GDSC.shape)
print(GDSC.columns[:20])
GDSC.head(3)

(446521, 922)
Index(['DRUG_ID', 'CELL_LINE_NAME', 'AUC', 'CELL_ID', 'LN_IC50', 'CONC',
       'MASTER_CELL_ID', 'INTENSITY', 'DATASET', 'Z_SCORE', 'DRUG_NAME',
       'RMSE', 'COSMIC_ID', 'POSITION', 'TSPAN6', 'SCYL3', 'BAD', 'LAP3',
       'SNX11', 'CASP10'],
      dtype='object')


Unnamed: 0,DRUG_ID,CELL_LINE_NAME,AUC,CELL_ID,LN_IC50,CONC,MASTER_CELL_ID,INTENSITY,DATASET,Z_SCORE,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
0,1,MC-CAR,0.982114,3137,2.395685,2.0,49,544404,GDSC1,-0.189576,...,8.355826,8.95168,7.20559,3.277948,3.465672,6.312806,12.112498,3.010237,8.750848,6.199366
9,1,ES3,0.984816,2366,3.140923,2.0,1342,404197,GDSC1,0.508635,...,5.99576,9.337588,7.468226,3.71627,5.363887,6.188079,12.281947,4.794624,3.588528,6.785201
27,1,ES5,0.985693,2368,3.968757,2.0,610,797378,GDSC1,1.284229,...,6.939741,8.688176,7.085349,3.688222,4.572119,6.34509,12.276166,4.114092,5.768098,7.505155


In [3]:
# Get mutations dataset.
start = time.time()
mutations_v1 = pd.read_csv(f'{PATH_TO_CNV_DATA}{MUTATION_FILE}', sep=",", header=0)
print(f"File `{MUTATION_FILE}` took {time.time()-start:.5f} seconds to import. \nShape: {mutations_v1.shape}")
mutations_v1.head(5)

File `mutations_all_20220315.csv` took 15.78615 seconds to import. 
Shape: (8322616, 13)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name
0,SIDG36104,SOX30,SIDM02021,-,r.?,-,False,False,intronic,0.2703,False,Sanger,HCM-SANG-1082-C15
1,SIDG37647,THG1L,SIDM02021,-,r.?,-,False,False,intronic,0.75,False,Sanger,HCM-SANG-1082-C15
2,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,intronic,0.6452,False,Sanger,HCM-SANG-1082-C15
3,SIDG16920,LSM11,SIDM02021,p.?,r.3207delU,c.?,False,False,3prime_UTR_variant,0.7692,False,Sanger,HCM-SANG-1082-C15
4,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,downstream,0.3571,False,Sanger,HCM-SANG-1082-C15


This is now all the mutations information with the `model_id` which can be mapped to the `SANGER_MODEL_ID` from the build GDSC table.

In [4]:
gdsc_base = pd.read_pickle(f'{PATH_TO_SAVE_DATA_TO}gdsc_base_v2.pkl')
print(f"GDSC base shape: {gdsc_base.shape}")
gdsc_base.head(5)

GDSC base shape: (446521, 15)


Unnamed: 0,CONC,SANGER_MODEL_ID,CELL_LINE_NAME,Z_SCORE,MASTER_CELL_ID,DRUG_ID,AUC,LN_IC50,RMSE,CELL_ID,INTENSITY,POSITION,DRUG_NAME,COSMIC_ID,DATASET
0,2.0,SIDM00636,MC-CAR,-0.189576,49,1,0.982114,2.395685,0.022521,3137,544404,14,Erlotinib,683665,GDSC1
9,2.0,SIDM00265,ES3,0.508635,1342,1,0.984816,3.140923,0.03184,2366,404197,14,Erlotinib,684055,GDSC1
27,2.0,SIDM00263,ES5,1.284229,610,1,0.985693,3.968757,0.026052,2368,797378,14,Erlotinib,684057,GDSC1
45,2.0,SIDM00269,ES7,0.08876,71,1,0.972699,2.692768,0.110056,2371,377574,14,Erlotinib,684059,GDSC1
54,2.0,SIDM00203,EW-11,-0.11182,231,1,0.944462,2.478678,0.087011,2375,638065,14,Erlotinib,684062,GDSC1


`gdsc_base_v2.pkl` was build in notebook `01_gdsc_base_table.ipynb`

In [5]:
# Find the CELL_LINE_NAME's per SANGER_MODEL_ID.
celllines_per_sangermodelid = gdsc_base[['SANGER_MODEL_ID', 'CELL_LINE_NAME']].groupby('SANGER_MODEL_ID')['CELL_LINE_NAME'].nunique()
counts_per_sangermodelid = celllines_per_sangermodelid.values

assert (counts_per_sangermodelid != 1).any() == False
assert (counts_per_sangermodelid == 1).all()

Each `CELL_LINE_NAME` has only exactly one `SANGER_MODEL_ID`

In [6]:
# Only take the interested columns for the mapping.
gdsc_mapping_subset = gdsc_base[['SANGER_MODEL_ID', 'CELL_LINE_NAME']]

# Only take the unique SANGER_MODEL_ID's, since these have a 1-to-1 relationship to the CELL_LINE_NAME's anyways.
gdsc_mapping_subset = gdsc_mapping_subset.groupby('SANGER_MODEL_ID').first().reset_index(level=0)
gdsc_mapping_subset.head(3)

Unnamed: 0,SANGER_MODEL_ID,CELL_LINE_NAME
0,SIDM00003,M14
1,SIDM00023,TE-12
2,SIDM00040,TMK-1


In [7]:
# Join the CELL_LINE_NAME's onto the mutations_all dataset, based on the model_id.
mutations_v2 = mutations_v1.merge(right    = gdsc_mapping_subset,
                                  left_on  = 'model_id',
                                  right_on = 'SANGER_MODEL_ID',
                                  how      = 'left')
print("We joined the mutations dataset with the GDSC table to get the SANGER_MODEL_ID & CELL_LINE_NAME for each row in the mutations dataset.")                                  
print(f"mutations_v2 Shape: {mutations_v2.shape}")
mutations_v2.head(5)

We joined the mutations dataset with the GDSC table to get the SANGER_MODEL_ID & CELL_LINE_NAME for each row in the mutations dataset.
mutations_v2 Shape: (8322616, 15)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name,SANGER_MODEL_ID,CELL_LINE_NAME
0,SIDG36104,SOX30,SIDM02021,-,r.?,-,False,False,intronic,0.2703,False,Sanger,HCM-SANG-1082-C15,,
1,SIDG37647,THG1L,SIDM02021,-,r.?,-,False,False,intronic,0.75,False,Sanger,HCM-SANG-1082-C15,,
2,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,intronic,0.6452,False,Sanger,HCM-SANG-1082-C15,,
3,SIDG16920,LSM11,SIDM02021,p.?,r.3207delU,c.?,False,False,3prime_UTR_variant,0.7692,False,Sanger,HCM-SANG-1082-C15,,
4,SIDG16920,LSM11,SIDM02021,-,r.?,-,False,False,downstream,0.3571,False,Sanger,HCM-SANG-1082-C15,,


In [8]:
 # Only taking the once with a CELL_LINE_NAME.
mutations_v3 = mutations_v2[mutations_v2['CELL_LINE_NAME'].notna()]
print("We excluded all rows which have CELL_LINE_NAME as NaN.")
print(f"Number of unique `gene_symbol`s in the mutations dataset: {len(np.unique(mutations_v3.gene_symbol))}")
print(f"mutations_v3 Shape: {mutations_v3.shape}")
mutations_v3.head(5)

We excluded all rows which have CELL_LINE_NAME as NaN.
Number of unique `gene_symbol`s in the mutations dataset: 20450
mutations_v3 Shape: (4574963, 15)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name,SANGER_MODEL_ID,CELL_LINE_NAME
19,SIDG40054,UAP1,SIDM00615,-,r.?,-,False,False,intronic,0.2857,False,Sanger,JHH-6,SIDM00615,JHH-6
20,SIDG13175,IPO11,SIDM01119,-,r.?,-,False,False,intronic,0.3511,False,Sanger,NCI-H727,SIDM01119,NCI-H727
21,SIDG14638,LIN9,SIDM00933,-,r.?,-,False,False,downstream,1.0,False,Sanger,CAL-51,SIDM00933,CAL-51
22,SIDG07173,DYM,SIDM00933,-,r.?,-,False,False,intronic,0.5294,False,Sanger,CAL-51,SIDM00933,CAL-51
23,SIDG08926,FCAMR,SIDM00620,-,r.?,-,False,False,intronic,0.625,False,Sanger,D-336MG,SIDM00620,D-336MG


In [9]:
start = time.time()
landmark_genes = pd.read_csv(f'{PATH_TO_GDSC_DATASETS}{FILENAME_LANDMARK_GENES}', sep="\t")
print("We use the landmark genes to sparse down the dimension.")
print(f"File `{FILENAME_LANDMARK_GENES}` took {time.time()-start:.5f} seconds to import. It has shape {landmark_genes.shape}")
landmark_genes.head(3)

We use the landmark genes to sparse down the dimension.
File `landmark_genes.csv` took 0.01575 seconds to import. It has shape (978, 7)


Unnamed: 0,Entrez ID,Symbol,Name,Gene Family,Type,RNA-Seq Correlation,RNA-Seq Correlation Self-Rank
0,3638,INSIG1,insulin induced gene 1,,landmark,,
1,2309,FOXO3,forkhead box O3,Forkhead boxes,landmark,,
2,1001,CDH3,cadherin 3,Type I classical cadherins,landmark,,


In [10]:
landmark_genes_v2 = landmark_genes['Symbol']

# Take only the rows which have a `gene_symbol` which is also present in the landmark genes table.
mutations_v4 = mutations_v3.merge(right    = landmark_genes_v2,
                                  left_on  = 'gene_symbol',
                                  right_on = 'Symbol')
print("We merged the most mutations_v3 dataset with the landmark genes to sparse it down.")                                  
print(f"mutations_v4 Shape: {mutations_v4.shape}")
mutations_v4.head(3)  

We merged the most mutations_v3 dataset with the landmark genes to sparse it down.
mutations_v4 Shape: (235906, 16)


Unnamed: 0,gene_id,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,cancer_predisposition_variant,effect,vaf,coding,source,model_name,SANGER_MODEL_ID,CELL_LINE_NAME,Symbol
0,SIDG40702,VGLL4,SIDM00330,-,r.?,-,False,False,intronic,0.9926,False,Sanger,LAN-6,SIDM00330,LAN-6,VGLL4
1,SIDG40702,VGLL4,SIDM00944,p.?,r.407-7c>u,c.?,False,False,splice_region,0.619,False,Broad,CL-11,SIDM00944,CL-11,VGLL4
2,SIDG40702,VGLL4,SIDM00373,p.?,r.65c>u,c.?,False,False,5prime_UTR_variant,0.4118,False,Broad,SNG-M,SIDM00373,SNG-M,VGLL4


In [11]:
mutations_v5 = mutations_v4[[
    'CELL_LINE_NAME',
    'gene_symbol',
    'model_id',
    'protein_mutation',
    'rna_mutation',
    'cdna_mutation',
    'cancer_driver',
    'vaf'
]]
print("We only took a chosen set of columns.")
n_v5 = mutations_v5.shape[0]
uniq_cl = len(np.unique(mutations_v5.CELL_LINE_NAME))
uniq_gs = len(np.unique(mutations_v5.gene_symbol))
uniq_pm = len(np.unique(mutations_v5.protein_mutation))
uniq_rna = len(np.unique(mutations_v5.rna_mutation))
uniq_cdna = len(np.unique(mutations_v5.cdna_mutation))

print(f"""Number of unique ...
    CELL_LINE_NAME's   : {uniq_cl:6.0f} ({100*uniq_cl/n_v5:2.2f}% out of all rows)
    gene_symbol's      : {uniq_gs:6.0f} ({100*uniq_gs/n_v5:2.2f}% out of all rows)
    protein_mutation's : {uniq_pm:6.0f} ({100*uniq_pm/n_v5:2.2f}% out of all rows)
    rna_mutation's     : {uniq_rna:6.0f} ({100*uniq_rna/n_v5:2.2f}% out of all rows)
    cdna_mutation's    : {uniq_cdna:6.0f} ({100*uniq_cdna/n_v5:2.2f}% out of all rows)
    cancer_driver's    : {len(np.unique(mutations_v5.cancer_driver)):6.0f} ({np.unique(mutations_v5.cancer_driver)})
""")
print(f"mutations_v5 Shape: {mutations_v5.shape}")
mutations_v5.head(5)

We only took a chosen set of columns.
Number of unique ...
    CELL_LINE_NAME's   :    983 (0.42% out of all rows)
    gene_symbol's      :    956 (0.41% out of all rows)
    protein_mutation's :  25113 (10.65% out of all rows)
    rna_mutation's     :  28392 (12.04% out of all rows)
    cdna_mutation's    :  22196 (9.41% out of all rows)
    cancer_driver's    :      2 ([False  True])

mutations_v5 Shape: (235906, 8)


Unnamed: 0,CELL_LINE_NAME,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,vaf
0,LAN-6,VGLL4,SIDM00330,-,r.?,-,False,0.9926
1,CL-11,VGLL4,SIDM00944,p.?,r.407-7c>u,c.?,False,0.619
2,SNG-M,VGLL4,SIDM00373,p.?,r.65c>u,c.?,False,0.4118
3,JHOS-2,VGLL4,SIDM00305,-,r.?,-,False,0.5698
4,NCI-H1436,VGLL4,SIDM00697,-,r.?,-,False,0.9677


In [12]:
def retrieve_mutation_positions(mut):
    positions = re.findall(r'\d+', mut)

    # No positional information was found.
    if len(positions) == 0: 
        return (None, None)
    # Only one position was found.
    elif len(positions) == 1:
        return (int(positions[0]), int(positions[0]))
    # Two positions have been found.
    elif len(positions) == 2:
        return (int(positions[0]), int(positions[1]))
    else:
        return ValueError(f"The mutations cell should contain <=2 values, however, it contains {len(positions)}.")

In [13]:
mutations_v6 = copy.deepcopy(mutations_v5)
mutations_v6['protein_mutation_pos'] = mutations_v6.protein_mutation.apply(retrieve_mutation_positions)
mutations_v6['rna_mutation_pos'] = mutations_v6.rna_mutation.apply(retrieve_mutation_positions)
mutations_v6['cdna_mutation_pos'] = mutations_v6.cdna_mutation.apply(retrieve_mutation_positions)
print("We retrieve the positional information for each mutation columns and added them as new columns.")
print(f"Shape when only taking protein_mutations which have a positional information: {mutations_v6[mutations_v6.protein_mutation_pos != (None, None)].shape}")
print(f"Shape when only taking cancer_drivers = True: {mutations_v6[mutations_v6.cancer_driver==True].shape}")
print(f"mutations_v6 Shape: {mutations_v6.shape}")
mutations_v6.head(10)

We retrieve the positional information for each mutation columns and added them as new columns.
Shape when only taking protein_mutations which have a positional information: (44952, 11)
Shape when only taking cancer_drivers = True: (2216, 11)
mutations_v6 Shape: (235906, 11)


Unnamed: 0,CELL_LINE_NAME,gene_symbol,model_id,protein_mutation,rna_mutation,cdna_mutation,cancer_driver,vaf,protein_mutation_pos,rna_mutation_pos,cdna_mutation_pos
0,LAN-6,VGLL4,SIDM00330,-,r.?,-,False,0.9926,"(None, None)","(None, None)","(None, None)"
1,CL-11,VGLL4,SIDM00944,p.?,r.407-7c>u,c.?,False,0.619,"(None, None)","(407, 7)","(None, None)"
2,SNG-M,VGLL4,SIDM00373,p.?,r.65c>u,c.?,False,0.4118,"(None, None)","(65, 65)","(None, None)"
3,JHOS-2,VGLL4,SIDM00305,-,r.?,-,False,0.5698,"(None, None)","(None, None)","(None, None)"
4,NCI-H1436,VGLL4,SIDM00697,-,r.?,-,False,0.9677,"(None, None)","(None, None)","(None, None)"
5,HCC1395,VGLL4,SIDM00884,-,r.?,-,False,1.0,"(None, None)","(None, None)","(None, None)"
6,SNU-175,VGLL4,SIDM00216,-,r.?,-,False,1.0,"(None, None)","(None, None)","(None, None)"
7,CL-40,VGLL4,SIDM00960,p.?,r.480-4g>a,c.83-4G>A,False,0.4615,"(None, None)","(480, 4)","(83, 4)"
8,TOV-21G,VGLL4,SIDM01169,p.?,r.406c>u,c.?,False,0.561,"(None, None)","(406, 406)","(None, None)"
9,Hep3B2-1-7,VGLL4,SIDM00672,-,r.?,-,False,0.4,"(None, None)","(None, None)","(None, None)"


In [14]:
def set_mutation_value(df):
    # Set all values wit ha 

SyntaxError: incomplete input (3441924051.py, line 2)

In [14]:
mutations_all_final_v1 = pd.pivot_table(
    data    = mutations_v6,
    values  = 'cancer_driver',
    index   = ['CELL_LINE_NAME'],
    columns = ['gene_symbol'],
    aggfunc = np.sum,
    dropna  = False
)

"""

"""
# Set mutation values: 1.0=mutation, 0.0=no_mutation
mutations_all_final_v1[mutations_all_final_v1==0.0] = 1.0
mutations_all_final_v1[np.isnan(mutations_all_final_v1)] = 0.0

mutations_all_final_v1['CELL_LINE_NAME'] = mutations_all_final_v1.index
mutations_all_final_v1.insert(0, 'CELL_LINE_NAME', mutations_all_final_v1.pop('CELL_LINE_NAME'))
mutations_all_final_v1.reset_index(drop=True, inplace=True)

print(f"Shape: {mutations_all_final_v1.shape}")
mutations_all_final_v1.head(5)


Shape: (983, 957)


gene_symbol,CELL_LINE_NAME,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,22RV1,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,23132-87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,42-MG-BA,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,451Lu,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,5637,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Compare mutations dataset with the other features

In [15]:
# Read other features.
gene_expr = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{GENE_EXPR_FINAL_FILE_NAME}')
cnv_gistic = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{CNV_GISTIC_FINAL_FILE_NAME}')
cnv_picnic = pd.read_pickle(f'{PATH_TO_SAVED_CL_FEATURES}{CNV_PICNIC_FINAL_FILE_NAME}')
print(f"""Shapes after by the cell-line column:
    gene expr  : {gene_expr.shape}
    cnv gistic : {cnv_gistic.shape}
    cnv picnic : {cnv_picnic.shape}
""")


Shapes after by the cell-line column:
    gene expr  : (446521, 922)
    cnv gistic : (446521, 952)
    cnv picnic : (446521, 980)



In [40]:
print(len(np.unique(gene_expr[gene_expr.CELL_LINE_NAME=='22RV1'].DRUG_ID.values)))
print(len(np.unique(gene_expr[gene_expr.CELL_LINE_NAME=='22RV1'].FIS1.values)))
print(gene_expr[gene_expr.CELL_LINE_NAME=='22RV1'].shape)
print(gene_expr[gene_expr.CELL_LINE_NAME=='22RV1'].drop_duplicates().shape)
gene_expr[gene_expr.CELL_LINE_NAME=='22RV1'].head(10)

469
1
(505, 922)
(505, 922)


Unnamed: 0,DRUG_ID,CELL_LINE_NAME,AUC,CELL_ID,LN_IC50,CONC,MASTER_CELL_ID,INTENSITY,DATASET,Z_SCORE,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
188514,133,22RV1,0.386285,3399,-3.5203,1.024,1027,26702,GDSC1,-1.052514,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
197226,134,22RV1,0.53671,3399,0.2465,16.0,1027,138738,GDSC1,-0.671777,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
205830,135,22RV1,0.395485,3399,-3.456605,1.024,1027,157079,GDSC1,-0.392251,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
214596,136,22RV1,0.223736,3399,-2.005449,16.0,1027,20116,GDSC1,-0.782996,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
223308,140,22RV1,0.634219,3399,-4.606677,0.064,1027,64405,GDSC1,-0.471168,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
231975,147,22RV1,0.979699,3399,5.466523,16.0,1027,685810,GDSC1,0.666688,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
240795,150,22RV1,0.971586,3399,2.298556,1.024,1027,451990,GDSC1,-0.547672,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
249642,151,22RV1,0.863996,3399,2.43271,10.24,1027,233360,GDSC1,-0.604453,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
259929,152,22RV1,0.792572,3399,2.149573,16.0,1027,150953,GDSC1,-0.184076,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
269676,153,22RV1,0.958318,3399,1.115831,0.512,1027,492218,GDSC1,0.942878,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826


- There are multiple rows which contain the same `DRUG_ID`-`CELL_LINE_NAME` tuple

In [37]:
gene_expr[gene_expr.CELL_LINE_NAME=='22RV1'].DRUG_ID.value_counts().head(5)

1010    2
1053    2
1049    2
1048    2
1047    2
Name: DRUG_ID, dtype: int64

In [39]:
gene_expr[(gene_expr.CELL_LINE_NAME=='22RV1') & (gene_expr.DRUG_ID==1010)]

Unnamed: 0,DRUG_ID,CELL_LINE_NAME,AUC,CELL_ID,LN_IC50,CONC,MASTER_CELL_ID,INTENSITY,DATASET,Z_SCORE,...,MYCBP,FIS1,IFRD2,NPEPL1,CEBPD,PLEKHM1,MIF,PRAF2,LYN,POLG2
1970540,1010,22RV1,0.98131,3399,2.102708,2.0,1027,37462,GDSC1,1.169802,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826
3551362,1010,22RV1,0.974411,3399,4.032555,2.0,1027,37462,GDSC2,0.542607,...,7.98971,10.102478,7.531275,3.18628,7.831092,5.561236,11.995057,3.86721,5.500169,4.826


__Note__:
- There can be the same row for the same `DRUG_ID`-`CELL_LINE_NAME` tuple with the same `CONC` and `INTENSITY`
- However, this will differ only in the `DATASET` column and one will come from GDSC1, while the other comes from GDSC2

In [41]:
#TODO: Which one do I choose, GDSC1 or GDSC2? 

In [42]:
print(gene_expr[gene_expr.DATASET=='GDSC1'].shape)
print(gene_expr[gene_expr.DATASET=='GDSC2'].shape)

(310946, 922)
(135575, 922)


In [47]:
gene_expr.columns[14:]

Index(['TSPAN6', 'SCYL3', 'BAD', 'LAP3', 'SNX11', 'CASP10', 'CFLAR', 'FKBP4',
       'RBM6', 'SLC25A13',
       ...
       'MYCBP', 'FIS1', 'IFRD2', 'NPEPL1', 'CEBPD', 'PLEKHM1', 'MIF', 'PRAF2',
       'LYN', 'POLG2'],
      dtype='object', length=908)

In [64]:
mut__uniq_cl = np.unique(mutations_all_final_v1.CELL_LINE_NAME.values)
mut__uni_gs = np.unique(mutations_all_final_v1.columns[1:])
gexp__uniq_cl = np.unique(gene_expr.CELL_LINE_NAME.values)
gexp__uniq_gs = np.unique(gene_expr.columns[14:])
cnvg__uniq_cl = np.unique(cnv_gistic.CELL_LINE_NAME.values)
cnvg__uniq_gs = np.unique(cnv_gistic.columns[14:])
cnvp__uniq_cl = np.unique(cnv_picnic.CELL_LINE_NAME.values)
cnvp__uniq_gs = np.unique(cnv_picnic.columns[14:])

print(f"""Unique rows of column
    CELL_LINE_NAME
        mutation   : {len(mut__uniq_cl)}
        gene-expr  : {len(gexp__uniq_cl)}
        cnv-gistic : {len(cnvg__uniq_cl)}
        cnv-picnic : {len(cnvp__uniq_cl)}
    GENE_SYMBOL
        mutation   : {len(mut__uni_gs)}
        gene-expr  : {len(gexp__uniq_gs)}
        cnv-gistic : {len(cnvg__uniq_gs)}
        cnv-picnic : {len(cnvp__uniq_gs)}
""")

# Find intersection of the cell-lines.
intersection_cl = set(mut__uniq_cl).intersection(\
    set(gexp__uniq_cl)).intersection(\
    set(cnvg__uniq_cl)).intersection(\
    set(cnvp__uniq_cl))
# Find intersection of the gene-symbol.
intersection_gs = set(mut__uni_gs).intersection(\
    set(gexp__uniq_gs)).intersection(\
    set(cnvg__uniq_gs)).intersection(\
    set(cnvp__uniq_gs))

print(f"""Size of intersection set for
    CELL_LINE_NAME : {len(intersection_cl):5.0f}
    GENE_SYMBOL    : {len(intersection_gs):5.0f}
""")

Unique rows of column
    CELL_LINE_NAME
        mutation   : 983
        gene-expr  : 988
        cnv-gistic : 988
        cnv-picnic : 988
    GENE_SYMBOL
        mutation   : 956
        gene-expr  : 908
        cnv-gistic : 938
        cnv-picnic : 966

Size of intersection set for
    CELL_LINE_NAME :   983
    GENE_SYMBOL    :   858



In [67]:
list(intersection_gs)[:10]

['FBXL12',
 'PIN1',
 'PAK4',
 'GNA15',
 'ARPP19',
 'EAPP',
 'MOK',
 'MTHFD2',
 'TIPARP',
 'CASP3']

In [76]:
# Create datasets with intersecting CELL_LINE_NAME's and GENE_SYMBOL's only.
print(f"Gene Expr\n{10*'-'}")
gexpr_inter = gene_expr[['DRUG_ID', 'DATASET', 'CELL_LINE_NAME'] + list(intersection_gs)]
gexpr_inter = gexpr_inter[gexpr_inter.CELL_LINE_NAME.isin(list(intersection_cl))]
print(f"Shape total: {gexpr_inter.shape}")
print(f"Shape GDSC1: {gexpr_inter[gexpr_inter.DATASET=='GDSC1'].shape}")
print(f"Shape GDSC2: {gexpr_inter[gexpr_inter.DATASET=='GDSC2'].shape}")

print(f"CNV Gistic\n{10*'-'}")
cnvg_inter = cnv_gistic[['DRUG_ID', 'DATASET', 'CELL_LINE_NAME'] + list(intersection_gs)]
cnvg_inter = cnvg_inter[cnvg_inter.CELL_LINE_NAME.isin(list(intersection_cl))]
print(f"Shape total: {cnvg_inter.shape}")
print(f"Shape GDSC1: {cnvg_inter[cnvg_inter.DATASET=='GDSC1'].shape}")
print(f"Shape GDSC2: {cnvg_inter[cnvg_inter.DATASET=='GDSC2'].shape}")

print(f"CNV Picnic\n{10*'-'}")
cnvp_inter = cnv_picnic[['DRUG_ID', 'DATASET', 'CELL_LINE_NAME'] + list(intersection_gs)]
cnvp_inter = cnvp_inter[cnvp_inter.CELL_LINE_NAME.isin(list(intersection_cl))]
print(f"Shape total: {cnvp_inter.shape}")
print(f"Shape GDSC1: {cnvp_inter[cnvp_inter.DATASET=='GDSC1'].shape}")
print(f"Shape GDSC2: {cnvp_inter[cnvp_inter.DATASET=='GDSC2'].shape}")

print(f"Mutation\n{10*'-'}")
mut_inter = mutations_all_final_v1[['CELL_LINE_NAME'] + list(intersection_gs)]
mut_inter = mut_inter[mut_inter.CELL_LINE_NAME.isin(list(intersection_cl))]
print(f"Shape total: {mut_inter.shape}")

Gene Expr
----------
Shape total: (444580, 861)
Shape GDSC1: (309541, 861)
Shape GDSC2: (135039, 861)
CNV Gistic
----------
Shape total: (444580, 861)
Shape GDSC1: (309541, 861)
Shape GDSC2: (135039, 861)
CNV Picnic
----------
Shape total: (444580, 861)
Shape GDSC1: (309541, 861)
Shape GDSC2: (135039, 861)
Mutation
----------
Shape total: (983, 859)


We don't have `DRUG_ID` information in the mutations dataset. Which we also don't need. We can sparse down the other features (gene expr, cnv) datasets in the sense that we only have the unique intersecting `CELL_LINE_NAME`'s as rows, the unique intersecting `GENE_SYMBOL`'s as columns, and as cells the value per cell-line-gene tuple. Thus, it should be the same shape as for the mutations dataset.

## Preprocess feature datasets

- [ ] Sparse down the gene expression, cnv gistic and cnv picnic datasets to the same shape as the final mutations dataset from above
- [ ] Save the drug-cell-line tuple dataset in a separate file

### Feature dataset

#### Sparse feature sets

In [77]:
cnvp_inter.head(5)

Unnamed: 0,DRUG_ID,DATASET,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,1,GDSC1,MC-CAR,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
9,1,GDSC1,ES3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,2.0,3.0
27,1,GDSC1,ES5,4.0,4.0,3.0,4.0,4.0,2.0,2.0,...,2.0,4.0,3.0,3.0,5.0,4.0,5.0,3.0,3.0,4.0
45,1,GDSC1,ES7,3.0,3.0,3.0,3.0,3.0,2.0,3.0,...,3.0,3.0,3.0,2.0,3.0,4.0,3.0,4.0,3.0,3.0
54,1,GDSC1,EW-11,3.0,3.0,3.0,3.0,3.0,2.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,4.0


In [87]:
gexpr_sparse = gexpr_inter.groupby('CELL_LINE_NAME').first().reset_index()
gexpr_sparse = gexpr_sparse.loc[:, ~gexpr_sparse.columns.isin(['DRUG_ID', 'DATASET'])]

cnvg_sparse = cnvg_inter.groupby('CELL_LINE_NAME').first().reset_index()
cnvg_sparse = cnvg_sparse.loc[:, ~cnvg_sparse.columns.isin(['DRUG_ID', 'DATASET'])]

cnvp_sparse = cnvp_inter.groupby('CELL_LINE_NAME').first().reset_index()
cnvp_sparse = cnvp_sparse.loc[:, ~cnvp_sparse.columns.isin(['DRUG_ID', 'DATASET'])]

gexpr_sparse.head(5)

Unnamed: 0,CELL_LINE_NAME,FBXL12,PIN1,PAK4,GNA15,ARPP19,EAPP,MOK,MTHFD2,TIPARP,...,PDHX,DFFB,FOSL1,ETS1,EBNA1BP2,MYL9,MLLT11,PFKL,FGFR4,SDHB
0,22RV1,7.023759,6.067534,4.31875,3.261427,6.297582,8.313991,5.514912,10.594112,5.222366,...,7.821536,3.601622,3.225596,3.651201,7.895763,3.953414,4.059382,4.376822,3.215209,9.267565
1,23132-87,6.714387,5.695096,4.536146,3.295886,7.021037,8.50008,4.862145,10.609245,6.528668,...,8.094289,3.596762,3.486299,3.127452,7.852436,3.869411,4.248318,4.989945,4.328643,9.51587
2,42-MG-BA,7.752402,5.475753,4.033714,3.176525,7.279671,8.013367,4.957332,11.266705,7.445954,...,7.984052,3.317746,5.106906,5.305024,6.508066,7.840349,8.632889,4.792137,3.078971,8.495921
3,451Lu,6.518083,5.46252,4.436039,3.241076,6.657767,7.085021,8.199066,10.550334,5.83756,...,6.970153,3.69546,4.947532,5.198087,7.361412,3.687171,5.965388,4.885217,3.849932,9.726323
4,5637,6.855088,5.980778,4.382524,6.086206,7.423409,8.12018,5.212472,10.329122,7.212325,...,7.068021,3.386432,6.835862,6.190242,7.431739,3.652418,7.088577,4.76185,3.163742,9.091481


In [90]:
assert gexpr_sparse.shape == cnvg_sparse.shape == cnvp_sparse.shape == mut_inter.shape
print(f"""Feature dataset shapes
    Gene Expression : {gexpr_sparse.shape}
    CNV Gistic      : {cnvg_sparse.shape}
    CNV Picnic      : {cnvp_sparse.shape}
    Mutation        : {mut_inter.shape}
""")

Feature dataset shapes
    Gene Expression : (983, 859)
    CNV Gistic      : (983, 859)
    CNV Picnic      : (983, 859)
    Mutation        : (983, 859)



- We have now a final dataset per feature with cell-lines as rows and gene symbols as columns which are existent in all datasets.
- Also only one row per cell-line was kept, since for the same cell-line the rows only different because of different drugs
- We can save these datasets and use them for model building 
  - However, we need to save also the `LN_IC50` values for the cell-line - drug tuples
    - NOTE that the gene_symbols are only a feature of the genes and independent of the drug, thus they always have the same value for a cell-line, even for different drugs
- NOTE that some cell-line-gene-symbol tuples are `NaN`. That just means that the database had no feature value for the specific gene in the specific cell-line 

#### Save sparsed features

In [102]:
%mkdir ../../datasets/datasets_for_model_building

In [105]:
PATH_TO_FEATURES = '../../datasets/datasets_for_model_building/'

gexpr_sparse.to_pickle(f'{PATH_TO_FEATURES}gexpr_sparse.pkl')
cnvg_sparse.to_pickle(f'{PATH_TO_FEATURES}cnvg_sparse.pkl')
cnvp_sparse.to_pickle(f'{PATH_TO_FEATURES}cnvp_sparse.pkl')
mut_inter.to_pickle(f'{PATH_TO_FEATURES}mut_sparse.pkl')

In [None]:
READ = False

if READ:
    with open(f'{PATH_TO_FEATURES}gexpr_sparse.pkl', 'rb') as f: 
        gexpr_sparse = pickle.load(f)
    with open(f'{PATH_TO_FEATURES}cnvg_sparse.pkl', 'rb') as f: 
        cnvg_sparse = pickle.load(f)
    with open(f'{PATH_TO_FEATURES}cnvp_sparse.pkl', 'rb') as f: 
        cnvp_sparse = pickle.load(f)
    with open(f'{PATH_TO_FEATURES}mut_sparse.pkl', 'rb') as f: 
        mut_sparse = pickle.load(f)                       

### Drug dataset

#### Get drug dataset

- We need drug, cell-line and IC50 information.

In [140]:
drug_cl_v1 = gene_expr[['CELL_LINE_NAME', 'DRUG_ID', 'DRUG_NAME', 'DATASET', 'LN_IC50']].sort_values(['CELL_LINE_NAME', 'DRUG_ID'])
print(f"Shape: {drug_cl_v1.shape}")
drug_cl_v1.head(50)

Shape: (446521, 5)


Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
190089,201T,133,Doxorubicin,GDSC1,-3.770673
198783,201T,134,Etoposide,GDSC1,-0.81418
207405,201T,135,Gemcitabine,GDSC1,-0.29805
216171,201T,136,Mitomycin-C,GDSC1,-4.472378
224883,201T,140,Vinorelbine,GDSC1,-5.332884
233550,201T,147,NSC-87877,GDSC1,4.680281
242370,201T,150,Bicalutamide,GDSC1,2.754322
251217,201T,151,QS11,GDSC1,1.99259
261873,201T,152,CP466722,GDSC1,2.29966
271251,201T,153,Midostaurin,GDSC1,-1.83753


In [141]:
drug_cl_v1.groupby(['CELL_LINE_NAME', 'DRUG_ID', 'DRUG_NAME']).size().reset_index()\
    .rename(columns={0: 'count'}).sort_values(['count'], ascending=False).head(10)

Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,count
118673,HCC1937,1192,GSK269962A,6
40457,CAMA-1,1192,GSK269962A,4
389675,T-T,1022,AZD7762,4
12105,A549,1017,Olaparib,4
28933,BT-483,1038,NU7441,4
208495,MDA-MB-330,1017,Olaparib,4
208021,MDA-MB-231,1011,Navitoclax,4
308862,OVKATE,1046,Wee1 Inhibitor,4
108257,GRANTA-519,1862,MG-132,4
76330,Daudi,1032,Afatinib,4


In [142]:
drug_cl_v1[(drug_cl_v1.CELL_LINE_NAME=='HCC1937') & (drug_cl_v1.DRUG_ID==1192)]

Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
2846209,HCC1937,1192,GSK269962A,GDSC1,3.810152
2846216,HCC1937,1192,GSK269962A,GDSC1,3.810152
2846244,HCC1937,1192,GSK269962A,GDSC1,3.810152
4344253,HCC1937,1192,GSK269962A,GDSC2,3.781019
4344260,HCC1937,1192,GSK269962A,GDSC2,3.781019
4344288,HCC1937,1192,GSK269962A,GDSC2,3.781019


In [143]:
print(drug_cl_v1[drug_cl_v1.DATASET=='GDSC1'].shape)
print(drug_cl_v1[drug_cl_v1.DATASET=='GDSC2'].shape)

(310946, 5)
(135575, 5)


- NOTE: if there is a same cell-line drug combination the ic50 values only differ for different datasets
  - if for the same cell-line drug tuple the dataset is e.g. GDSC1, then the ic50 values will also be the same
- Thus, we need to decide which dataset we choose
- Since GDSC1 provides more data points, we will go on with GDSC1

In [144]:
drug_cl_GDSC1 = drug_cl_v1[drug_cl_v1.DATASET=='GDSC1'].drop_duplicates()
print(drug_cl_GDSC1.shape)

drug_cl_GDSC2 = drug_cl_v1[drug_cl_v1.DATASET=='GDSC2'].drop_duplicates()
print(drug_cl_GDSC2.shape)

(310904, 5)
(135242, 5)


In [133]:
# Check if the duplicated rows from above got removed.
drug_cl_GDSC1[(drug_cl_GDSC1.CELL_LINE_NAME=='HCC1937') & (drug_cl_GDSC1.DRUG_ID==1192)]

Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DATASET,LN_IC50
2846209,HCC1937,1192,GDSC1,3.810152


In [145]:
drug_cl_GDSC2[(drug_cl_GDSC2.CELL_LINE_NAME=='HCC1937') & (drug_cl_GDSC2.DRUG_ID==1192)]

Unnamed: 0,CELL_LINE_NAME,DRUG_ID,DRUG_NAME,DATASET,LN_IC50
4344253,HCC1937,1192,GSK269962A,GDSC2,3.781019


In [150]:
drug_cl_GDSC2[['DRUG_ID', 'DRUG_NAME']].groupby(['DRUG_ID']).nunique().sort_values(['DRUG_NAME'], ascending=False).head(10)

Unnamed: 0_level_0,DRUG_NAME
DRUG_ID,Unnamed: 1_level_1
1,1
1039,1
1037,1
1036,1
1033,1
1032,1
1031,1
1030,1
1029,1
1028,1


In [152]:
drug_cl_GDSC2[['DRUG_ID', 'DRUG_NAME']].groupby(['DRUG_NAME']).nunique().sort_values(['DRUG_ID'], ascending=False).head(10)

Unnamed: 0_level_0,DRUG_ID
DRUG_NAME,Unnamed: 1_level_1
Dactinomycin,2
Fulvestrant,2
Oxaliplatin,2
Docetaxel,2
Ulixertinib,2
Uprosertib,2
PD173074,1
Olaparib,1
Osimertinib,1
P22077,1


These duplicates need to be removed. We need a 1-to-1 mapping from the `DRUG_NAME`'s to the `DRUG_ID`'s. 

In [161]:
all_non_uniqs = drug_cl_GDSC2[['DRUG_ID', 'DRUG_NAME']].groupby(['DRUG_NAME']).nunique().sort_values(['DRUG_ID'], ascending=False)
all_non_uniqs = all_non_uniqs[all_non_uniqs.DRUG_ID>1]

for non_unique in all_non_uniqs.index:
    print(non_unique)
    print(drug_cl_GDSC2[drug_cl_GDSC2.DRUG_NAME==non_unique]['DRUG_ID'].value_counts())

Dactinomycin
1911    740
1811    728
Name: DRUG_ID, dtype: int64
Fulvestrant
1200    764
1816    728
Name: DRUG_ID, dtype: int64
Oxaliplatin
1089    802
1806    728
Name: DRUG_ID, dtype: int64
Docetaxel
1007    766
1819    669
Name: DRUG_ID, dtype: int64
Ulixertinib
1908    752
2047    749
Name: DRUG_ID, dtype: int64
Uprosertib
2106    745
1553    735
Name: DRUG_ID, dtype: int64


- This is now the drug dataset which can be encoded by the drug encode side of the bi-modal network

#### Save drug dataset

In [149]:
PATH_TO_FEATURES = '../../datasets/datasets_for_model_building/'

drug_cl_GDSC1.to_pickle(f'{PATH_TO_FEATURES}drugs_sparse_gdsc1.pkl')
drug_cl_GDSC2.to_pickle(f'{PATH_TO_FEATURES}drugs_sparse_gdsc2.pkl')

In [147]:
READ = False

if READ:
    with open(f'{PATH_TO_FEATURES}drugs_sparse_gdsc1.pkl', 'rb') as f: 
        drug_cl_GDSC1 = pickle.load(f) 
    with open(f'{PATH_TO_FEATURES}drugs_sparse_gdsc2.pkl', 'rb') as f: 
        drug_cl_GDSC2 = pickle.load(f)         

---

## Classifying the `protein_mutation` column

This will be a different approach compared to checking the `cancer_driver` column. Now we will classify the `protein_mutation` column.

In [60]:
def classify_protein_mutation_col(protein_mut):
    # This method is adapted from MOLI.
    # https://academic.oup.com/bioinformatics/article/35/14/i501/5529255
    mut = protein_mut.split('p.')[1]
    if bool(re.match(re.compile("^[A-Z][0-9]+[A-Z]$"), mut)): return "missense"
    elif bool(re.match(re.compile("^[A-Z]+[0-9]+[A-Z]+$"), mut)): return "inframe_indel"
    elif "*" in mut: return "stop_gain_or_fs"
    elif "?" in mut: return "ess_splice"
    elif bool(re.match(re.compile("^\-[0-9]+[A-Z]+$"), mut)): return "unrecognized_point"
    elif bool(re.match(re.compile("^\-[0-9]+$"), mut)): return "unrecognized_truncating"
    elif bool(re.match(re.compile("^[A-Z]+[0-9]+\-$"), mut)): return "unrecognized_point2"
    else: return "unrecognized"    

In [62]:
mutations_v7 = mutations_v6[mutations_v6.protein_mutation_pos != (None, None)][
    ['CELL_LINE_NAME', 'gene_symbol', 'protein_mutation', 'cancer_driver', 'vaf', 'protein_mutation_pos']
]

mutations_v7['protein_mutation_group'] = mutations_v7['protein_mutation'].apply(classify_protein_mutation_col)
mutations_v7.groupby(['protein_mutation_group']).size().sort_values(ascending=False)

print("We removed the rows which have a missing protein_mutation_position and classified the protein_mutation column.")
print(f"mutations_v7 Shape: {mutations_v7.shape}")
mutations_v7.head(5)

We removed the rows which have a missing protein_mutation_position and classified the protein_mutation column.
mutations_v7 Shape: (44952, 7)


Unnamed: 0,CELL_LINE_NAME,gene_symbol,protein_mutation,cancer_driver,vaf,protein_mutation_pos,protein_mutation_group
10,NUGC-4,VGLL4,p.A95S,False,0.4138,"(95, 95)",missense
19,Daudi,VGLL4,p.S282T,False,0.1212,"(282, 282)",missense
63,NY,VGLL4,p.V80I,False,0.4706,"(80, 80)",missense
75,HSC-3,VGLL4,p.E105D,False,0.9444,"(105, 105)",missense
81,CW-2,VGLL4,p.G276fs*99,False,0.6444,"(276, 99)",stop_gain_or_fs


In [63]:
mutations_v7[mutations_v7.CELL_LINE_NAME == 'NUGC-4']

Unnamed: 0,CELL_LINE_NAME,gene_symbol,protein_mutation,cancer_driver,vaf,protein_mutation_pos,protein_mutation_group
10,NUGC-4,VGLL4,p.A95S,False,0.4138,"(95, 95)",missense
243,NUGC-4,VGLL4,p.A95S,False,0.2308,"(95, 95)",missense
423,NUGC-4,PLA2G4A,p.Q513K,False,0.3427,"(513, 513)",missense
789,NUGC-4,PLA2G4A,p.Q513K,False,0.5354,"(513, 513)",missense
9359,NUGC-4,BRCA1,p.I456T,False,0.8032,"(456, 456)",missense
...,...,...,...,...,...,...,...
224245,NUGC-4,SPR,p.N154Y,False,0.1677,"(154, 154)",missense
227101,NUGC-4,KIF20A,p.K827Q,False,0.5000,"(827, 827)",missense
227167,NUGC-4,KIF20A,p.K827Q,False,0.4468,"(827, 827)",missense
231058,NUGC-4,TMEM109,p.R37Q,False,0.6159,"(37, 37)",missense
