In [1]:
import pandas as pd
import numpy as np
import pickle

# 处理CCLE数据

In [2]:
# Load the drug SMILES table; its 'name' column becomes 'Drug name' so it matches the IC50 table.
drug_smiles = pd.read_csv('./data/drug_smiles.csv').rename(columns={'name': 'Drug name'})

ic_50_df = pd.read_csv("./data/PANCANCER_IC.csv")
# Normalise cell line names (remove '-', ' ' and '.', then upper-case) before joining them
# against the 'stripped_cell_line_name' column of sample_info.csv.
ic_50_df['Cell line name'] = ic_50_df['Cell line name'].map(
    lambda raw: raw.replace("-", '').replace(' ', '').replace('.', '').upper()
)

cell_info = pd.read_csv("./data/sample_info.csv").rename(
    columns={'stripped_cell_line_name': 'Cell line name'}
)
ic_50_df = pd.merge(ic_50_df, cell_info[['DepMap_ID', 'Cell line name']], how='left', on='Cell line name')
# Keep only measurements that have both a DepMap ID and a drug name.
ic_50_df = ic_50_df[ic_50_df['DepMap_ID'].notnull() & ic_50_df['Drug name'].notnull()]

# Sorted, de-duplicated node tables for drugs and cell lines.
node_drug = ic_50_df[['Drug name']].drop_duplicates().sort_values(by='Drug name')
node_cell = ic_50_df[['DepMap_ID']].drop_duplicates().sort_values(by='DepMap_ID')

In [3]:
cell_exp = pd.read_csv("./data/CCLE_expression.csv", index_col=0)
# Each column label is reduced to its second space-separated token with the surrounding
# parentheses stripped (presumably "<symbol> (<Entrez id>)" -> "<Entrez id>"; confirm against the CSV).
cell_exp.columns = [label.split(' ')[1].strip('()') for label in cell_exp.columns]

cell_mu = pd.read_csv("./data/CCLE_mutations.csv")
# Gene ids are compared as strings later on (against the expression column labels).
cell_mu['Entrez_Gene_Id'] = cell_mu['Entrez_Gene_Id'].astype(str)

  interactivity=interactivity, compiler=compiler, result=result)


# 突变方案

In [5]:
# Turn the long mutation table into a DepMap_ID x Entrez_Gene_Id matrix.
# Every record contributes is_muted == 1, so the mean is 1 wherever at least one record
# exists; (cell line, gene) pairs without a record are filled with 0.
cell_mu['is_muted'] = 1
cell_mu = pd.pivot_table(
    cell_mu,
    values=['is_muted'],
    index=['DepMap_ID'],
    columns=['Entrez_Gene_Id'],
    aggfunc=np.mean,
    fill_value=0,
)
# Drop the outer 'is_muted' column level and clear the remaining level name.
cell_mu.columns = cell_mu.columns.droplevel(0)
cell_mu.columns.name = None

# Give both matrices the same index name and order them by it.
cell_exp.index.name = 'DepMap_ID'
cell_mu.index.name = 'DepMap_ID'
cell_exp = cell_exp.sort_values(by='DepMap_ID')
cell_mu = cell_mu.sort_values(by='DepMap_ID')

# GDSC

In [6]:
# Restrict every table to cell lines that appear in the expression matrix, the mutation
# matrix and the response data, and to drugs that have a SMILES entry.
# Index.intersection is used explicitly: `Index & Index` as a set operation is deprecated
# in pandas 1.x and is no longer a set operation in pandas 2.x.
cell_index = list(
    cell_exp.index
    .intersection(cell_mu.index)
    .intersection(pd.Index(node_cell['DepMap_ID']))
)
drug_index = drug_smiles['Drug name']
node_cell = node_cell[node_cell['DepMap_ID'].isin(cell_index)]
node_drug = node_drug[node_drug['Drug name'].isin(drug_index)]
cell_exp = cell_exp.loc[cell_index]
cell_mu = cell_mu.loc[cell_index]
ic_50_df = ic_50_df[ic_50_df['Drug name'].isin(drug_index) & ic_50_df['DepMap_ID'].isin(cell_index)]

In [7]:
# Number drugs and cell lines consecutively (0..n-1) in their current, sorted order.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning raised when a
# column is written through `.loc` on a frame that was produced by boolean filtering.
node_drug = node_drug.assign(drug_node_id=range(len(node_drug)))
node_cell = node_cell.assign(cell_node_id=range(len(node_cell)))
# Attach both node ids to every IC50 measurement.
ic_50_df = (
    ic_50_df
    .merge(node_drug, on='Drug name', how='left')
    .merge(node_cell, on='DepMap_ID', how='left')
)

In [8]:
# Wide response matrix: one row per DepMap_ID, one column per drug, values taken from 'IC50'.
# (cell line, drug) pairs without a measurement stay NaN because fill_value is None.
# NOTE(review): aggfunc=np.sum adds up IC50 values when a (DepMap_ID, Drug name) pair occurs
# more than once in ic_50_df (e.g. if the earlier left-merge duplicated rows) — confirm that
# pairs are unique or that summing is really intended.
multi_task_df = pd.pivot_table(ic_50_df, columns=['Drug name'], index=['DepMap_ID'], values=['IC50'], aggfunc=np.sum, fill_value=None)
# Drop the outer 'IC50' column level and clear the remaining level name.
multi_task_df.columns = multi_task_df.columns.get_level_values(1)
multi_task_df.columns.name = None

In [9]:
# Prepend DepMap_ID / cell_node_id to both feature matrices. The matrices were given the
# index name 'DepMap_ID' earlier, which is what `on='DepMap_ID'` joins against.
cell_exp = pd.merge(node_cell, cell_exp, how='left', on='DepMap_ID')
cell_mu = pd.merge(node_cell, cell_mu, how='left', on='DepMap_ID')

In [10]:
# Genes to keep: entries of Census_all.csv that have an Entrez id and are present in both
# the expression and the mutation matrix.
census = pd.read_csv("./data/Census_all.csv")
# .copy() so the column assignment below writes to an independent frame instead of a
# filtered slice (avoids SettingWithCopyWarning).
census = census[census['Entrez GeneId'].notna()].copy()
# The ids are cast int -> str so they compare equal to the string column labels.
census['Entrez GeneId'] = census['Entrez GeneId'].astype('int').astype('str')
# sorted() makes the gene order reproducible; list(set(...)) order can vary between runs.
gene_list = sorted(set(census['Entrez GeneId']) & set(cell_exp.columns) & set(cell_mu.columns))

In [11]:
# Keep the two id columns plus the selected genes.
index = ['DepMap_ID', 'cell_node_id'] + gene_list
cell_exp = cell_exp[index]
# If a label occurs more than once among the expression columns, keep the first occurrence.
# A boolean column mask does this directly; the previous `.T[mask].T` round trip transposed
# the whole frame twice and left every column with object dtype.
cell_exp = cell_exp.loc[:, ~cell_exp.columns.duplicated(keep='first')]
cell_mu = cell_mu[index]

In [12]:
# Re-bind cell_info to the classification table (it previously held sample_info.csv).
cell_info = pd.read_csv("./data/CCLE_classification.csv")

In [13]:
# Same column naming as used for sample_info.csv above.
cell_info = cell_info.rename(columns={'stripped_cell_line_name': 'Cell line name'})

In [14]:
# The rest of the notebook refers to this column as 'cancer_type'.
cell_info = cell_info.rename(columns={'cell_type': 'cancer_type'})

In [15]:
# Attach the cancer type of every cell line (right join keeps all rows of the feature matrices).
cell_exp = cell_info[['DepMap_ID', 'cancer_type']].merge(cell_exp, how='right', on='DepMap_ID')
cell_mu = cell_info[['DepMap_ID', 'cancer_type']].merge(cell_mu, how='right', on='DepMap_ID')

# Keep only cancer types that have at least 10 rows.
cell_exp = cell_exp[cell_exp.groupby('cancer_type')['cancer_type'].transform('size') >= 10]
cell_mu = cell_mu[cell_mu.groupby('cancer_type')['cancer_type'].transform('size') >= 10]

# Number the remaining cancer types 0..k-1 in alphabetical order.
cancer_type_idx = cell_exp[['cancer_type']].dropna().drop_duplicates().sort_values(by='cancer_type')
cancer_type_idx = cancer_type_idx.assign(cancer_type_idx=range(len(cancer_type_idx)))

# Resulting column order: cancer_type, cancer_type_idx, then the columns of the feature matrix.
cell_exp = cancer_type_idx.merge(cell_exp, how='right', on='cancer_type')
cell_mu = cancer_type_idx.merge(cell_mu, how='right', on='cancer_type')

In [16]:
# Bring both response tables in line with the cell lines that survived the filtering above.
# `.loc` raises KeyError if a DepMap_ID of cell_exp has no row in multi_task_df.
multi_task_df = multi_task_df.loc[cell_exp['DepMap_ID']]
ic_50_df = ic_50_df[ic_50_df['DepMap_ID'].isin(cell_exp['DepMap_ID'])]

In [19]:
# Keep one SMILES row per drug that occurs in the response matrix.
# reset_index() is called without drop=True, so the previous index is kept as an 'index' column.
drug_smiles = (
    drug_smiles[drug_smiles['Drug name'].isin(multi_task_df.columns)]
    .drop_duplicates(subset='Drug name')
    .reset_index()
)

In [38]:
# Everything after the first four columns is treated as a gene column
# (the leading four are presumably cancer_type, cancer_type_idx, DepMap_ID, cell_node_id).
gdsc_gene_list = list(cell_exp.columns[4:])

In [77]:
# `tcga_gene_list` is only assigned further down in the TCGA section, so on a fresh
# top-to-bottom run it does not exist yet at this point. Read just the header row of the
# TCGA expression table to derive it here; this gives the same labels as
# `TCGA_exp.columns.tolist()[2:]` computed right after that file is loaded.
tcga_gene_list = pd.read_csv('data/TCGA_exp.csv', nrows=0).columns.tolist()[2:]
# Genes shared by the GDSC/CCLE and TCGA feature tables, in sorted order.
final_gene_list = sorted(set(gdsc_gene_list) & set(tcga_gene_list))

In [79]:
# Metadata columns first, then the shared genes.
gdsc_final_gene_list = ['cancer_type', 'cancer_type_idx', 'DepMap_ID', 'cell_node_id', *final_gene_list]

In [80]:
# Reduce both GDSC feature tables to the metadata columns plus the shared genes.
cell_exp = cell_exp.loc[:, gdsc_final_gene_list]
cell_mu = cell_mu.loc[:, gdsc_final_gene_list]

In [81]:
# Persist all GDSC tables together as one pickled dict.
with open("./data/GDSC_dataset.dict", "wb") as f:
    pickle.dump(
        {
            'ic50': ic_50_df,
            'multi_task_df': multi_task_df,
            'cell_exp': cell_exp,
            'cell_mu': cell_mu,
            'drug_smiles': drug_smiles,
        },
        f,
    )

# TCGA

In [28]:
# TCGA expression and mutation tables. Later cells select the 'sample' and 'cancer_type'
# columns plus the shared gene columns from both frames.
TCGA_exp = pd.read_csv('data/TCGA_exp.csv')
TCGA_mu = pd.read_csv('data/TCGA_mu.csv')

In [40]:
# Everything after the first two columns is treated as a gene column
# (the leading two are presumably 'sample' and 'cancer_type').
tcga_gene_list = list(TCGA_exp.columns[2:])

In [43]:
# Metadata columns first, then the genes shared with GDSC.
tcga_final_gene_list = ['sample', 'cancer_type', *final_gene_list]

In [44]:
# Reduce both TCGA tables to the metadata columns plus the shared genes.
TCGA_exp = TCGA_exp.loc[:, tcga_final_gene_list]
TCGA_mu = TCGA_mu.loc[:, tcga_final_gene_list]

In [45]:
# Keep only the first occurrence of every column label.
TCGA_exp = TCGA_exp.loc[:,~TCGA_exp.columns.duplicated()]
TCGA_mu = TCGA_mu.loc[:,~TCGA_mu.columns.duplicated()]

In [47]:
# Drop every row that contains at least one missing value. The two tables are filtered
# independently, so they may lose different rows.
TCGA_exp = TCGA_exp.dropna(axis='index')
TCGA_mu = TCGA_mu.dropna(axis='index')

# 处理TCGA临床数据集

# 联合用药的拆开

In [49]:
# Clinical drug-response records; later cells use its 'sample', 'Drug name' and
# 'measure_of_response' columns.
sample = pd.read_excel('data/TCGA_DRP.xlsx', engine='openpyxl')

In [50]:
# Keep only records whose response is 'Complete Response' or 'Partial Response';
# all other records are discarded.
sample = sample[sample['measure_of_response'].isin(['Complete Response','Partial Response'])]

In [51]:
def make_drug_list(df):
    """Return the entries of ``df['Drug name']`` as a plain Python list.

    Args:
        df: Any object that supports ``df['Drug name']`` and yields an iterable
            (in this notebook: the rows of one sample as a DataFrame).

    Returns:
        list: The drug names in their original order, duplicates included.
    """
    return [name for name in df['Drug name']]

In [52]:
# Collapse to one row per sample whose 'drug' column holds the list of drugs recorded for it.
sample = (
    sample.groupby('sample')
    .apply(make_drug_list)
    .reset_index()
    .rename(columns={0: 'drug'})
)

In [53]:
# Save the per-sample drug lists; the row index is written as the first column.
sample.to_csv('data/TCGA_DRP.csv')

In [54]:
# Select samples whose identifier has '0' as its second-to-last character
# (presumably TCGA tumour sample-type codes 01-09 — confirm), using the expression table
# as the reference list for both tables.
TCGA_Sample_Name = [x for x in TCGA_exp['sample'].tolist() if x[-2] == '0']

# .copy() so the 'sample' column can be rewritten without assigning into a filtered
# slice (which triggers SettingWithCopyWarning).
TCGA_exp_01 = TCGA_exp[TCGA_exp['sample'].isin(TCGA_Sample_Name)].copy()
# Strip the last three characters of the identifier.
TCGA_exp_01['sample'] = TCGA_exp_01['sample'].str[:-3]

TCGA_mu_01 = TCGA_mu[TCGA_mu['sample'].isin(TCGA_Sample_Name)].copy()
TCGA_mu_01['sample'] = TCGA_mu_01['sample'].str[:-3]

In [55]:
# Samples that have both a response record and an expression row.
# NOTE(review): TCGA_mu_01 is not part of this intersection; if it lacks some of these
# samples, the mutation table ends up with fewer rows than the expression table — confirm.
sample_name = list(set(sample['sample'])&set(TCGA_exp_01['sample']))
sample = sample[sample['sample'].isin(sample_name)]

In [56]:
# Restrict both feature tables to the shared samples and order the response table by sample.
TCGA_exp_01 = TCGA_exp_01.loc[TCGA_exp_01['sample'].isin(sample_name)]
TCGA_mu_01 = TCGA_mu_01.loc[TCGA_mu_01['sample'].isin(sample_name)]
sample = sample.sort_values(by='sample')

In [57]:
# Order by sample and keep the first row of every sample.
TCGA_exp_01 = TCGA_exp_01.sort_values(by='sample').drop_duplicates(subset='sample')
TCGA_mu_01 = TCGA_mu_01.sort_values(by='sample').drop_duplicates(subset='sample')

In [58]:
# Replace the indices with fresh 0..n-1 ranges; the next cell relies on this when it
# copies 'cancer_type' from TCGA_exp_01 into sample by index.
TCGA_exp_01 = TCGA_exp_01.reset_index(drop=True)
TCGA_mu_01 = TCGA_mu_01.reset_index(drop=True)
sample = sample.reset_index(drop=True)

In [59]:
# Column assignment aligns on the index, not on the 'sample' key. Both frames were filtered
# to sample_name, sorted by 'sample' and re-indexed in the preceding cells, so row i should
# describe the same sample in both.
# NOTE(review): this only holds if those cells ran in order on unmodified frames.
sample['cancer_type'] = TCGA_exp_01['cancer_type']

In [60]:
# Keep only cancer types with at least 10 rows, then renumber the rows.
TCGA_exp_01 = TCGA_exp_01[
    TCGA_exp_01.groupby('cancer_type')['cancer_type'].transform('size') >= 10
].reset_index(drop=True)

In [61]:
# Same filter for the mutation table: cancer types with at least 10 rows, rows renumbered.
TCGA_mu_01 = TCGA_mu_01[
    TCGA_mu_01.groupby('cancer_type')['cancer_type'].transform('size') >= 10
].reset_index(drop=True)

In [62]:
# Same filter for the response table: keep cancer types with at least 10 rows.
sample = sample[sample.groupby('cancer_type')['cancer_type'].transform('size')>=10]

In [63]:
# Renumber the remaining rows 0..n-1, discarding the old index.
sample = sample.reset_index(drop=True)

In [64]:
from itertools import chain
# All distinct drug names that occur in any sample's drug list. Despite its name this is
# the list of names, not a count. Sorting fixes the order of the response columns created
# from it; a bare list(set(...)) can come out in a different order on every run.
# key=str keeps the sort working should a non-string value (e.g. NaN) be present.
num_drug = sorted(set(chain.from_iterable(sample['drug'].values)), key=str)

In [65]:
# One multi-hot row per sample: 1 where the drug is in the sample's drug list, else 0,
# with columns in the order of num_drug.
response = [
    [int(x in drugs) for x in num_drug]
    for drugs in sample['drug'].tolist()
]

In [66]:
# Add one 0/1 column per drug name; rows of `response` follow the row order of `sample`.
sample[num_drug] = response

In [67]:
# The list-valued column is no longer needed once the per-drug columns exist.
del sample['drug']

In [71]:
# Persist the TCGA response table and both feature tables together as one pickled dict.
with open("./data/TCGA_dataset.dict", "wb") as f:
    pickle.dump(
        {
            'TCGA_DRP': sample,
            'cell_exp': TCGA_exp_01,
            'cell_mu': TCGA_mu_01,
        },
        f,
    )

# 处理GDSC药物

In [None]:
import csv
from pubchempy import *
# Directory prefix for every file read or written by the helper functions below.
# NOTE(review): the loading cell at the top reads './data/drug_smiles.csv', whereas these
# helpers write to folder + 'drug_smiles.csv' — confirm which location is intended.
folder = './'
def write_drug_cid():
    """Resolve a PubChem CID for every drug in ``node_drug`` and write the results to CSV.

    Two files are written under ``folder``:

    * ``pychem_cid.csv`` - one ``name,cid`` row per drug that could be resolved.
    * ``unknow_drug_by_pychem.csv`` - a single row holding the names for which the
      name lookup returned no compound.

    A drug name made up only of digits is used as the CID directly, without a lookup.
    Reads the module-level ``node_drug`` and ``folder``; returns nothing.
    """
    drugs = node_drug['Drug name'].tolist()
    unknow_drug = []
    # newline='' is what the csv module expects for files it writes to; the `with`
    # blocks guarantee both files are flushed and closed before anyone reads them.
    with open(folder + 'pychem_cid.csv', 'w', newline='') as outputfile:
        wr = csv.writer(outputfile)
        for drug in drugs:
            if drug.isdigit():
                # Already a numeric id: no need to query PubChem.
                cid = int(drug)
            else:
                c = get_compounds(drug, 'name')
                if len(c) == 0:
                    unknow_drug.append(drug)
                    continue
                # Use the first match returned for the name.
                cid = c[0].cid
            print(drug, cid)
            wr.writerow([drug, str(cid)])
    with open(folder + "unknow_drug_by_pychem.csv", 'w', newline='') as outputfile:
        csv.writer(outputfile).writerow(unknow_drug)

def _is_float(text):
    """Return True when ``text`` can be parsed by ``float()``."""
    try:
        float(text)
    except (TypeError, ValueError):
        return False
    return True


def cid_from_other_source():
    """Look up CIDs for drugs that the PubChem name search could not resolve.

    Reads ``small_molecule.csv`` (downloaded from http://lincs.hms.harvard.edu/db/sm/)
    and ``unknow_drug_by_pychem.csv`` from ``folder``. Column 1 of the former is used as
    the drug name and column 4 as its CID; for a repeated name the first row wins.

    Returns:
        dict: ``{drug name: cid string}`` for every unresolved drug that has a numeric
        CID in ``small_molecule.csv``.
    """
    cid_dict = {}
    with open(folder + "small_molecule.csv", 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for item in reader:
            if len(item) < 5:
                continue  # blank or truncated row: no name/CID columns to read
            cid_dict.setdefault(item[1], str(item[4]))

    # Parse the row with the csv module rather than readline().split(","): the latter
    # leaves the line terminator glued to the last name (so it never matches) and breaks
    # names that contain a quoted comma.
    with open(folder + "unknow_drug_by_pychem.csv", newline='') as f:
        unknow_drug = set(next(csv.reader(f), []))

    # The numeric check is done with a local helper because `is_not_float` is not
    # defined anywhere in the visible notebook.
    drug_cid_dict = {k: v for k, v in cid_dict.items() if k in unknow_drug and _is_float(v)}
    return drug_cid_dict

def load_cid_dict():
    """Build the complete ``{drug name: cid}`` mapping.

    Reads ``pychem_cid.csv`` from ``folder`` (column 0 = name, column 1 = CID) and then
    merges in the result of ``cid_from_other_source()``, whose entries take precedence
    for names present in both.

    Returns:
        dict: Drug name mapped to its CID as a string.
    """
    pychem_dict = {}
    with open(folder + "pychem_cid.csv", newline='') as f:
        for item in csv.reader(f):
            if len(item) < 2:
                continue  # skip blank or incomplete rows instead of raising IndexError
            pychem_dict[item[0]] = item[1]
    pychem_dict.update(cid_from_other_source())
    return pychem_dict


def download_smiles():
    """Download SMILES strings for all known CIDs and prefix each row with the drug name.

    Calls ``download`` to fetch the ``CanonicalSMILES`` and ``IsomericSMILES`` properties
    as CSV into ``folder + 'drug_smiles.csv'``, then rewrites that same file with an
    extra leading ``name`` column.

    Raises:
        KeyError: If the first field of a downloaded row is not one of the requested CIDs.
    """
    cids_dict = load_cid_dict()
    cids = list(cids_dict.values())
    # Reverse lookup CID -> name; if two names share a CID, the later one wins.
    inv_cids_dict = {v: k for k, v in cids_dict.items()}
    download('CSV', folder + 'drug_smiles.csv', cids, operation='property/CanonicalSMILES,IsomericSMILES', overwrite=True)

    # Read everything into memory first because the same path is overwritten below.
    with open(folder + 'drug_smiles.csv', newline='') as f:
        reader = csv.reader(f)
        header = ['name'] + next(reader)
        # `if line` skips blank rows, which would otherwise fail on line[0].
        content = [[inv_cids_dict[line[0]]] + line for line in reader if line]

    with open(folder + "drug_smiles.csv", "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(content)

In [None]:
# Resolve CIDs first; download_smiles() reads the CSV files that write_drug_cid() produces.
write_drug_cid()
download_smiles()

# 检查csv是否错误

In [None]:
# Report every line of the CSV that is not valid UTF-8, together with its 1-based line number.
# NOTE(review): this is an absolute, machine-specific path — consider a path relative to the data directory.
with open('/data/ouyangzhenqiu/project/cloud_ecg/cwb/TCGA/drug_SMILES.csv', "rb") as f:  # read the file as raw bytes
    for i, line in enumerate(f, start=1):
        try:
            line.decode('utf8')
        except UnicodeDecodeError:
            # Only undecodable lines are printed; a counter for every line would flood the output.
            print(i, str(line))