In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
import pubchempy as pcp

Read data

In [13]:
genetic_features = pd.read_csv('data/Cell_Line_Features_PANCAN_simple_MOBEM.tsv', delimiter='\t',index_col=None)
COSMIC_CancerType = pd.read_csv('data/Cell_Lines_Details.csv')
DrugID_Synonyms_TargetPathway = pd.read_csv('data/Drug_Features.csv')
COSMICID_DrugID_CCL = pd.read_csv('data/filteredResponses.csv')

In [14]:
vioxx = c = pcp.Compound.from_cid(479503)
print(vioxx.molecular_formula)
print(vioxx.molecular_weight)
print(vioxx.xlogp)

C16H16O5
288.29
3


rename columns with naming inconsistency

In [15]:
# COSMIC_CancerType.columns
COSMIC_CancerType.rename(columns={'COSMIC identifier': 'COSMIC_ID'}, inplace=True)
DrugID_Synonyms_TargetPathway.rename(columns={'Drug ID': 'DRUG_ID'}, inplace=True)
genetic_features_T=genetic_features.transpose()
genetic_features_T.to_csv('data/genetic_features_T.csv', index=True)
genetic_features_T= pd.read_csv('data/genetic_features_T.csv', skiprows=1)
genetic_features_T.rename(columns={"Unnamed: 0": 'COSMIC_ID'}, inplace=True)
genetic_features_T

Unnamed: 0,COSMIC_ID,ABCB1_mut,ABL2_mut,ACACA_mut,ACVR1B_mut,ACVR2A_mut,ADCY1_mut,AFF4_mut,AHCTF1_mut,AHNAK_mut,...,chr9:104248247-104249501(C9orf125)_HypMET,"chr9:115875199-115875738(C9orf109, C9orf110)_HypMET",chr9:123555399-123555899(FBXW2)_HypMET,chr9:140310894-140312457(EXD3)_HypMET,chr9:21974578-21975306(CDKN2A)_HypMET,chr9:35756948-35757339(MSMP)_HypMET,chr9:35791584-35791924(NPR2)_HypMET,chr9:4984543-4985630(JAK2)_HypMET,chr9:86571047-86572027(C9orf64)_HypMET,chr9:98783216-98784364(NCRNA00092)_HypMET
0,1287381,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,924100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,910924,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,687561,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1287706,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,909701,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,753620,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,905965,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999,1299061,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Merge data to the CCL table, dropped if no matching drug/cosmic id

In [16]:
COSMICID_DrugID_CCL_CancerType = pd.merge(COSMICID_DrugID_CCL, COSMIC_CancerType, on='COSMIC_ID')
COSMICID_DrugID_CCL_CancerType_Synonyms_TargetPathway = pd.merge(COSMICID_DrugID_CCL_CancerType, DrugID_Synonyms_TargetPathway, on='DRUG_ID')
# COSMICID_DrugID_CCL_CancerType_Synonyms_TargetPathway.columns

In [17]:
All_data=pd.merge(COSMICID_DrugID_CCL_CancerType_Synonyms_TargetPathway, genetic_features_T, on='COSMIC_ID')
All_data.to_csv('data/All_data.csv', index=True)

In [18]:




Shikonin_subset= All_data[All_data['DRUG_ID'] == 170]
Shikonin_subset.to_csv('data/Shikonin_subset.csv', index=True)
CAY10603_subset= All_data[All_data['DRUG_ID'] == 276]
CAY10603_subset.to_csv('data/CAY10603_subset.csv', index=True)


In [19]:
def train_val_test_split(dataframe,train_size=0.8,test_size=0.1,random_seed=42):
    train_val_df, test_df = train_test_split(dataframe, test_size=0.1, random_state=random_seed)
    train_df, val_df = train_test_split(train_val_df, test_size=1-train_size/(1-test_size), random_state=random_seed)  
    return train_df,val_df,test_df

def extract_domain_embedding(df):
    domain_dict_collection=[]
    for column in df.select_dtypes(include='object').columns:
        df.loc[df[column] == '', column] = '<empty>'
        labels, unique_values = pd.factorize(df[column])
        labels = labels + 1
        labels[df[column] == '<empty>'] = 0
        df.loc[:, f'encoded_{column}'] = labels
        domain_dict_collection.append(unique_values)
    return df,domain_dict_collection

Shikonin_subset_embedded,dict_list=extract_domain_embedding(Shikonin_subset)
Shikonin_subset_embedded.to_csv('data/Shikonin_subset_embedded.csv', index=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'encoded_{column}'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'encoded_{column}'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, f'encoded_{column}'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [20]:

column_types = Shikonin_subset_embedded.dtypes
condition_met = Shikonin_subset_embedded.columns.str.contains('HypMET')
condition_cna = Shikonin_subset_embedded.columns.str.contains('cna')
condition_mut = Shikonin_subset_embedded.columns.str.contains('mut')
# condition_dc = All_datas.columns.str.contains('dc') 

condition_encoded_Cancer_Type = Shikonin_subset_embedded.columns.str.contains('encoded_Cancer Type')

#TODO: find out what is dc
condition_fd_num = Shikonin_subset_embedded.columns.str.contains('fd_num')
condition_MAX_CONC = Shikonin_subset_embedded.columns.str.contains('MAX_CONC')
condition_norm_cells = Shikonin_subset_embedded.columns.str.contains('norm_cells')



In [21]:

conditions=[condition_met,condition_cna,condition_mut,condition_fd_num,condition_MAX_CONC,condition_norm_cells]
masks = [Shikonin_subset_embedded.loc[:, condition] for condition in conditions]

no_intersection = True
for i in range(len(masks)):
    for j in range(i + 1, len(masks)):
        intersection = masks[i].columns.intersection(masks[j].columns)
        if not intersection.empty:
            no_intersection = False
            print(f"Conditions {i+1} and {j+1} have intersection in columns: {list(intersection)}")

if no_intersection:
    print("All conditions are mutually exclusive (no intersection).")
else:
    print("Some conditions have intersections.")

All conditions are mutually exclusive (no intersection).


In [22]:
condition_all_X=condition_met | condition_cna|condition_mut|condition_fd_num|condition_MAX_CONC
condition_all_domain_X=condition_encoded_Cancer_Type
condition_all_y=condition_norm_cells

X_df=Shikonin_subset_embedded.loc[:, condition_all_X]
X_domain_info=Shikonin_subset_embedded.loc[:, condition_all_domain_X]
y_df=Shikonin_subset_embedded.loc[:, condition_all_y]

X_df = X_df.astype({col: 'float64' for col in X_df.columns})
X_domain_info = X_domain_info.astype({col: 'int' for col in X_domain_info.columns})

y_df = y_df.astype({col: 'float64' for col in y_df.columns})

with open('X_df.pkl', 'wb') as f:
    pickle.dump(X_df, f)
with open('X_domain_info.pkl', 'wb') as f:
    pickle.dump(X_domain_info, f)
with open('y_df.pkl', 'wb') as f:
    pickle.dump(y_df, f)