In [3]:
import sys, os
sys.path.append("../SVLDRP")

In [4]:
import pandas as pd
from GDSC_cell_line_similarity_extractor import get_gdsc_methylation_similarity, get_gdsc_copy_number_var_similarity, get_gdsc_mutation_similarity, get_gdsc_gene_expression_similarity, cosmic_ids_to_cell_line_names


In [5]:
# Load the three omics tables through the project extractor functions.
# NOTE(review): kernel=None presumably makes the extractors return the raw
# feature table instead of a similarity kernel -- confirm in
# GDSC_cell_line_similarity_extractor.
copy_number_by_feature = get_gdsc_copy_number_var_similarity(
    kernel=None,
    path_cnv='../SVLDRP/data/GDSC/cnv_gistic_20191101.csv',
)
# Only the copy-number table is transposed; later cells read the `.index`
# of every table as the cell-line labels.
copy_number_data = copy_number_by_feature.T

mutation_data = get_gdsc_mutation_similarity(
    kernel=None,
    path_mutations="../SVLDRP/data/GDSC/mutations_20191101.csv",
)

methylation_data = get_gdsc_methylation_similarity(
    kernel=None,
    path_methylation="../SVLDRP/data/GDSC/F2_METH_CELL_DATA.txt",
    path_methylation_annotations="../SVLDRP/data/GDSC/methSampleId_2_cosmicIds.xlsx",
)



In [6]:

path_cell_annotations = "../SVLDRP/data/GDSC/Cell_Lines_Details.csv"
path_gene_expression = "../SVLDRP/data/GDSC/Cell_line_RMA_proc_basalExp.txt"

# Read the tab-separated expression table, discard the gene title column and
# index the rows by gene symbol (stored as strings).
gene_expression = (
    pd.read_csv(path_gene_expression, sep="\t")
    .drop(columns=["GENE_title"])
    .set_index("GENE_SYMBOLS")
)
gene_expression.index = gene_expression.index.astype(str)

# Every remaining column is named "DATA.<cosmic id>": keep the part after the
# prefix, then translate the COSMIC ids into cell-line names.
cosmic_ids = [column.split("DATA.")[1] for column in gene_expression.columns]
ge_columns = cosmic_ids_to_cell_line_names(
    cosmic_ids, path_cell_annotations=path_cell_annotations
)
gene_expression.columns = ge_columns.astype(str)

# Flip the table so that cell lines become the row index.
gene_expression = gene_expression.T

Note: 50 Cosmic IDs not found in cell annotation data: 
['906815', '1330955', '907284', '1330944', '1330943', '1298232', '1299076', '1299051', '906829', '1290773', '907049', '909730', '910946', '907391', '687509', '1327761', '753536', '1240152', '925342', '687814', '1479994', '1479991', '1298355', '1331026', '1298150', '1479993', '11223344', '1330945', '1479992', '998179', '1331060', '1331028', '1503362.1', '906850', '1240211', '1299049', '908119', '1331031', '907785', '1330983.1', '906867', '1723793', '1723794', '1240156', '1299067', '906803', '1298154', '909976.1', '1659787', '905954.1']


In [7]:
# Keep only the rows whose cell-line label does not start with "unknown".
is_known_cell_line = [
    not label.startswith("unknown") for label in gene_expression.index
]
gene_expression_data = gene_expression.loc[is_known_cell_line]

In [8]:
# Cell-line details sheet, indexed by sample name; the "TOTAL:" row is removed.
cell_line_annotations = (
    pd.read_excel('../SVLDRP/data/GDSC/Cell_Lines_Details.xlsx', engine='openpyxl')
    .set_index("Sample Name")
    .drop("TOTAL:")
)

# Lookup tables from stringified sample name to each of the two GDSC tissue
# descriptor columns.
sample_names = [str(sample_name) for sample_name in cell_line_annotations.index]
cell_line_to_tissue1 = dict(
    zip(sample_names, cell_line_annotations['GDSC\nTissue descriptor 1'])
)
cell_line_to_tissue2 = dict(
    zip(sample_names, cell_line_annotations['GDSC\nTissue\ndescriptor 2'])
)
def robust_cell_line_to_tissue_map(cl, cell_line_to_tissue):
    """Look up the tissue of a cell line, falling back to "unknown".

    Parameters
    ----------
    cl
        Cell-line key to look up.
    cell_line_to_tissue
        Mapping from cell-line key to tissue label.

    Returns
    -------
    The mapped tissue when ``cl`` is a key of the mapping, otherwise the
    string ``"unknown"``.

    Raises
    ------
    AssertionError
        If ``cl`` itself is missing but ``str(cl)`` is a key, i.e. the lookup
        only failed because the key was not passed as a string.
    """
    if cl not in cell_line_to_tissue:
        # A hit on the stringified key means the caller passed the wrong type.
        assert str(cl) not in cell_line_to_tissue, f"{cl} is int, should be str"
        return "unknown"
    return cell_line_to_tissue[cl]
    


In [7]:
# Union of the cell-line labels found in any of the four omics tables.
# The labels are stringified here because the tables' indexes are only cast to
# str in a later cell, while the tissue lookup dicts and the response tables
# are keyed by string names: a non-string label would otherwise trip the
# assertion in robust_cell_line_to_tissue_map or silently fail to match.
# Sorting makes the order reproducible across kernel restarts (plain set
# iteration order of strings is not).
omics_tables = (gene_expression_data, copy_number_data, mutation_data, methylation_data)
all_cells = sorted(
    {str(cell_line) for table in omics_tables for cell_line in table.index}
)

# Tissue descriptors aligned position-by-position with all_cells; cell lines
# missing from the lookup dicts get "unknown".
all_tissues1 = [robust_cell_line_to_tissue_map(cl, cell_line_to_tissue1) for cl in all_cells]
all_tissues2 = [robust_cell_line_to_tissue_map(cl, cell_line_to_tissue2) for cl in all_cells]

In [8]:
# One row per cell line: its name plus both tissue descriptors.
annotation_columns = {
    "cell_line_name": all_cells,
    "tissue_descriptor1": all_tissues1,
    "tissue_descriptor2": all_tissues2,
}
cell_line_annotations = pd.DataFrame(annotation_columns)

In [9]:
# Make every row and column label of the four omics tables a string.
for omics_table in (
    mutation_data,
    methylation_data,
    copy_number_data,
    gene_expression_data,
):
    omics_table.index = omics_table.index.astype(str)
    omics_table.columns = omics_table.columns.astype(str)

In [10]:
def _load_gdsc_response(path):
    """Read one fitted dose-response CSV and keep the columns used downstream.

    Parameters
    ----------
    path
        Location of the GDSC fitted dose-response CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``LN_IC50``, ``CELL_LINE_NAME``, ``DRUG_NAME`` and
        ``DRUG_TARGET_PATHWAY`` (the file's ``PATHWAY_NAME``, renamed), in
        that order.
    """
    kept_columns = ["LN_IC50", "CELL_LINE_NAME", "DRUG_NAME", "PATHWAY_NAME"]
    response = pd.read_csv(path)
    return response.loc[:, kept_columns].rename(
        {"PATHWAY_NAME": "DRUG_TARGET_PATHWAY"}, axis=1
    )


# Both screens go through the same loader so their schemas cannot drift apart.
data_gdsc1 = _load_gdsc_response('../SVLDRP/data/GDSC/GDSC1_fitted_dose_response_25Feb20.csv')
data_gdsc2 = _load_gdsc_response('../SVLDRP/data/GDSC/GDSC2_fitted_dose_response_25Feb20.csv')

In [11]:
def _attach_tissue_annotations(response, annotations):
    """Add the two tissue descriptor columns to a drug-response table.

    Parameters
    ----------
    response
        Response table with a ``CELL_LINE_NAME`` column.
    annotations
        Table with ``cell_line_name``, ``tissue_descriptor1`` and
        ``tissue_descriptor2`` columns.

    Returns
    -------
    pandas.DataFrame
        ``response`` left-joined with ``annotations`` on the cell-line name.
        The helper key column is dropped and the descriptors are renamed to
        ``CELL_LINE_TISSUE_1`` / ``CELL_LINE_TISSUE_2``. Because the join is a
        left join, response rows without a matching annotation are kept and
        get missing values in the tissue columns.
    """
    merged = pd.merge(
        left=response,
        right=annotations,
        left_on="CELL_LINE_NAME",
        right_on="cell_line_name",
        how="left",
    )
    return merged.drop("cell_line_name", axis=1).rename(
        {
            "tissue_descriptor1": "CELL_LINE_TISSUE_1",
            "tissue_descriptor2": "CELL_LINE_TISSUE_2",
        },
        axis=1,
    )


# Same annotation step for both screens, expressed once instead of copy-pasted.
data_gdsc1 = _attach_tissue_annotations(data_gdsc1, cell_line_annotations)
data_gdsc2 = _attach_tissue_annotations(data_gdsc2, cell_line_annotations)

In [12]:
# get GDSC drug properties
#drug_data = pd.read_csv("../SVLDRP/data/GDSC/screened_compunds_rel_8.2.csv")


In [15]:
# Display the methylation table as a visual sanity check. In the rendered
# output below the rows are labelled by cell-line name and the columns by
# genomic region (chr:start-end).
methylation_data

Unnamed: 0,chr1:10003165-10003585,chr1:100315420-100316009,chr1:100435297-100436070,chr1:100503482-100504404,chr1:10057121-10058108,chr1:100598130-100598754,chr1:100731632-100731999,chr1:100817709-100818899,chr1:10092532-10093404,chr1:101004471-101005885,...,chr9:99179479-99181723,chr9:99212164-99213507,chr9:99328810-99329764,chr9:99380814-99382236,chr9:99416904-99417592,chr9:99540025-99540472,chr9:99616401-99616940,chr9:99775098-99776102,chr9:99800856-99801724,chr9:99962136-99962426
MC-CAR,0.192212,0.203820,0.277914,0.190930,0.544059,0.273826,0.246309,0.132243,0.265601,0.552313,...,0.161748,0.140332,0.171117,0.504871,0.593335,0.295142,0.396794,0.264053,0.301407,0.208931
PFSK-1,0.187603,0.207652,0.400146,0.195871,0.764898,0.265925,0.245317,0.081613,0.246907,0.782697,...,0.156671,0.179353,0.665262,0.202245,0.553815,0.223422,0.401027,0.311693,0.290512,0.155249
A673,0.210185,0.222116,0.264730,0.243298,0.415485,0.308657,0.258123,0.163548,0.281540,0.459978,...,0.161361,0.197860,0.216423,0.431386,0.546775,0.242035,0.446645,0.333607,0.319258,0.307825
ES3,0.205614,0.227570,0.303641,0.250454,0.599275,0.284652,0.255257,0.183600,0.249813,0.346111,...,0.139215,0.200593,0.204809,0.360808,0.537972,0.255180,0.513135,0.352037,0.317108,0.267645
ES5,0.204037,0.204111,0.243705,0.227564,0.389168,0.279121,0.235306,0.144075,0.249098,0.300132,...,0.147075,0.205460,0.449984,0.340790,0.469317,0.257921,0.528488,0.337319,0.280758,0.192772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MO,0.202672,0.240531,0.293938,0.195461,0.623589,0.285769,0.249944,0.143821,0.246995,0.490227,...,0.129995,0.125894,0.099382,0.348599,0.524072,0.634188,0.432816,0.284111,0.429171,0.467794
SNU-5,0.196493,0.187575,0.148768,0.187911,0.427652,0.248952,0.264237,0.126252,0.181884,0.705789,...,0.102420,0.190474,0.119517,0.224722,0.572469,0.257184,0.315729,0.302024,0.284147,0.308154
MINO,0.201091,0.171479,0.236005,0.203405,0.462792,0.280228,0.246454,0.130319,0.230951,0.534987,...,0.158141,0.092234,0.104812,0.260832,0.518584,0.292160,0.287092,0.245871,0.643805,0.314371
SNU-1,0.210533,0.187861,0.278650,0.256763,0.892354,0.284736,0.253735,0.154238,0.254106,0.704193,...,0.180994,0.183813,0.604159,0.427760,0.572392,0.275785,0.512064,0.331516,0.353752,0.513197


In [14]:
# Write every prepared table to ./data as CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails on the first write.
os.makedirs("data", exist_ok=True)

tables_to_export = {
    "data/response_GDSC1.csv": data_gdsc1,
    "data/response_GDSC2.csv": data_gdsc2,
    "data/copy_number_variation_gistic.csv": copy_number_data,
    "data/gene_expression.csv": gene_expression_data,
    "data/mutations.csv": mutation_data,
    "data/methylation.csv": methylation_data,
}
for csv_path, table in tables_to_export.items():
    table.to_csv(csv_path)

