In [2]:
import pandas as pd
import os
from os.path import isfile, join, isdir, exists
import shutil
from tqdm import tqdm
import seaborn as sns
import seaborn.objects as so
import matplotlib.pyplot as plt

In [18]:
# Base data directory: the combined PIVOT/HCMI input sheets are read from
# sub-folders of this path and the per-provider output folders are created
# directly beneath it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
out_path = "/Users/tushar/pdx/pdxfinder-data/data/UPDOG/"

def get_files(path, files):
    """Recursively collect the paths of all regular files below ``path``.

    Args:
        path: Directory to scan.
        files: List that every discovered file path is appended to
            (it is mutated in place).

    Returns:
        The same ``files`` list that was passed in.

    Raises:
        OSError: If ``path`` or one of its sub-directories cannot be listed.
    """
    for entry in os.listdir(path):
        entry_path = join(path, entry)
        if isfile(entry_path):
            files.append(entry_path)
        elif isdir(entry_path):
            get_files(entry_path, files)
        # Entries that are neither a regular file nor a directory (dangling
        # symlinks, sockets, ...) are skipped; recursing into them would make
        # os.listdir() raise.
    return files

def get_models(path, model_list):
    """Return the rows of a tab-separated sheet that belong to the given models.

    The first four data rows of the sheet are always placed at the top of the
    result (presumably the template's descriptive rows - confirm against the
    sheet format), followed by every row whose ``model_id`` is in
    ``model_list``.

    Args:
        path: Path of the TSV sheet; it must contain a ``model_id`` column.
        model_list: Model ids to keep.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    sheet = pd.read_csv(path, sep='\t')
    leading_rows = sheet.iloc[0:4, :]
    wanted_rows = sheet.loc[sheet['model_id'].isin(model_list)]
    combined = pd.concat([leading_rows, wanted_rows])
    return combined.reset_index(drop=True)

def get_patients(path_p, path_ps, out_path, provider_short, model_list):
    """Write the patient and patient-sample sheets restricted to the given models.

    The patient-sample sheet is filtered on ``model_id``; the patient sheet is
    then filtered on the ``patient_id`` values found in those sample rows.
    Both outputs start with the first four data rows of their source sheet.

    Args:
        path_p: Path of the source patient TSV (needs a ``patient_id`` column).
        path_ps: Path of the source patient-sample TSV (needs ``model_id`` and
            ``patient_id`` columns).
        out_path: Existing directory the two output files are written to.
        provider_short: Prefix used for the output file names.
        model_list: Model ids to keep.

    Returns:
        None. Writes ``<provider_short>_metadata-patient.tsv`` and
        ``<provider_short>_metadata-patient_sample.tsv`` into ``out_path``.
    """
    patients = pd.read_csv(path_p, sep='\t')
    samples = pd.read_csv(path_ps, sep='\t')
    selected_samples = samples.loc[samples['model_id'].isin(model_list)]

    # Patient sheet: keep only the patients referenced by the selected samples.
    patient_ids = list(selected_samples['patient_id'])
    selected_patients = patients.loc[patients['patient_id'].isin(patient_ids)]
    patient_out = pd.concat([patients.iloc[0:4, :], selected_patients])
    patient_out = patient_out.reset_index(drop=True)
    patient_target = join(out_path, provider_short+'_metadata-patient.tsv')
    patient_out.to_csv(patient_target, sep='\t', index=False)

    # Patient sample sheet
    sample_out = pd.concat([samples.iloc[0:4, :], selected_samples])
    sample_out = sample_out.reset_index(drop=True)
    sample_target = join(out_path, provider_short+'_metadata-patient_sample.tsv')
    sample_out.to_csv(sample_target, sep='\t', index=False)

def get_molecular_files(path, out_path, project, provider_short, model_list, data_type):
    """Copy the per-model files of one molecular data type into a provider folder.

    For every model the file ``<path>/<data_type>/<project>_<data_type>_<model>.tsv``
    is copied to ``<out_path>/<data_type>/<provider_short>_<data_type>_<model>.tsv``.
    Copying is best-effort: a model whose source file is missing or unreadable
    is reported on stdout and skipped, the remaining models are still processed.

    Args:
        path: Root folder of the source dataset.
        out_path: Root folder of the provider output; created if necessary.
        project: File-name prefix of the source dataset.
        provider_short: File-name prefix used for the copies.
        model_list: Model ids (strings) whose files should be copied.
        data_type: Sub-folder / file-name token, e.g. ``'cna'``.

    Returns:
        None.
    """
    target_dir = join(out_path, data_type)
    # makedirs also creates missing parents and does not race with a
    # concurrent creation the way an exists()/mkdir() pair does.
    os.makedirs(target_dir, exist_ok=True)
    for model in model_list:
        file = join(path, data_type, project+'_'+data_type+'_'+ model +'.tsv')
        dest = join(target_dir, provider_short+'_'+data_type+'_'+ model +'.tsv')
        try:
            shutil.copy(file, dest)
        # If source and destination are same
        except shutil.SameFileError:
            print("Source and destination represents the same file.")
        # If there is any permission issue
        except PermissionError:
            print("Permission denied.")
        # Any other file-system failure (most commonly a missing source file).
        # Only OSError is handled so that KeyboardInterrupt and programming
        # errors are no longer silently swallowed.
        except OSError as err:
            print("Error occurred while copying file: "+ dest + " (" + str(err) + ")")

def get_molecular_data(path, out_path, project, provider_short, model_list):
    """Extract the molecular metadata and data files for a subset of models.

    Steps, in order:
      1. Read ``<project>_molecular_metadata-sample.tsv`` from ``path``.
      2. Copy the platform and platform_web sheets unchanged (renamed with the
         ``provider_short`` prefix).
      3. Write the sample sheet filtered on ``model_id`` (first four data rows
         of the source sheet are kept on top).
      4. Copy per-model ``cna`` / ``expression`` files when the source has
         those folders.
      5. Mutation data: if a combined ``<project>_mut.tsv`` exists it is
         filtered on ``sample_id``; otherwise per-model files are copied.
      6. Treatment sheet: filtered on ``model_id`` when the source file exists.

    From step 4 onwards only the models that actually appear in the molecular
    sample sheet are used. The ``mut`` and ``treatment`` output folders are
    always created. The filtered mut and treatment sheets do not get the first
    four source rows prepended.

    Args:
        path: Root folder of the source dataset.
        out_path: Existing root folder of the provider output.
        project: File-name prefix of the source dataset.
        provider_short: File-name prefix used for the outputs.
        model_list: Model ids to keep.

    Returns:
        None.
    """
    all_samples = pd.read_csv(join(path, project +"_molecular_metadata-sample.tsv"), sep='\t')

    # The two platform sheets are copied as they are.
    for sheet_suffix in ("_molecular_metadata-platform.tsv", "_molecular_metadata-platform_web.tsv"):
        shutil.copy(join(path, project + sheet_suffix), join(out_path, provider_short + sheet_suffix))

    selected_samples = all_samples[all_samples['model_id'].isin(model_list)]
    # Narrow down to models/samples that really have molecular data.
    models_with_data = list(selected_samples['model_id'])
    sample_list = list(selected_samples['sample_id'])
    sample_sheet_out = pd.concat([all_samples.iloc[0:4, :], selected_samples])
    sample_sheet_out = sample_sheet_out.reset_index(drop=True)
    sample_sheet_out.to_csv(join(out_path, provider_short+'_molecular_metadata-sample.tsv'), sep='\t', index=False)

    for data_type in ('cna', 'expression'):
        if exists(join(path, data_type)):
            get_molecular_files(path, out_path, project, provider_short, models_with_data, data_type)

    mut_out_dir = join(out_path, 'mut')
    if not exists(mut_out_dir):
        os.mkdir(mut_out_dir)

    combined_mut_file = join(path, 'mut', project+'_mut.tsv')
    if exists(combined_mut_file):
        mut = pd.read_csv(combined_mut_file, sep='\t')
        mut_subset = mut[mut['sample_id'].isin(sample_list)]
        mut_subset.to_csv(join(out_path, 'mut', provider_short+'_mut.tsv'), sep='\t', index=False)
    else:
        get_molecular_files(path, out_path, project, provider_short, models_with_data, 'mut')

    treatment_out_dir = join(out_path, 'treatment')
    if not exists(treatment_out_dir):
        os.mkdir(treatment_out_dir)

    treatment_file = join(path, 'treatment', project+'_patienttreatment-Sheet1.tsv')
    if exists(treatment_file):
        treatment = pd.read_csv(treatment_file, sep='\t')
        treatment_subset = treatment[treatment['model_id'].isin(models_with_data)]
        treatment_subset.to_csv(join(out_path, 'treatment', provider_short+'_patienttreatment-Sheet1.tsv'), sep='\t', index=False)


# PIVOT


In [3]:
# Sharing sheet of the combined PIVOT dataset; skiprows drops the four lines
# that directly follow the header line.
providers = pd.read_csv(
    "/Users/tushar/pdx/pdxfinder-data/data/UPDOG/PIVOT/PIVOT_metadata-sharing.tsv",
    sep='\t',
    skiprows=[1,2,3,4],
)
provider_name = list(providers['name'].unique())

# (abbreviation, full institution name), listed in the order the providers
# first appear in the sharing sheet. The mapping below is purely positional.
# NOTE(review): assumes the sheet lists at least these five providers in
# exactly this order - confirm whenever the sheet changes.
_pivot_provider_info = [
    ("CCIA", "Children’s Cancer Institute"),
    ("GCCRI", "Greehey Children’s Cancer Research Institute"),
    ("CHOP", "Children’s Hospital of Philadelphia"),
    ("LurieChildrens", "Ann & Robert H. Lurie Children's Hospital of Chicago"),
    ("MDAnderson-CCH", "The University of Texas MD Anderson Children's Cancer Hospital"),
]

# provider name -> short code used for folder and file names
provider_contact_abrv = {
    provider_name[position]: info[0]
    for position, info in enumerate(_pivot_provider_info)
}

# provider name -> full institution name
provider_contact_dictionary = {
    provider_name[position]: info[1]
    for position, info in enumerate(_pivot_provider_info)
}

In [13]:
# Split the combined PIVOT dataset into one folder per provider.
for provider in provider_name:
    short_name = provider_contact_abrv[provider]
    provider_dir = join(out_path, short_name)
    if not exists(provider_dir):
        os.mkdir(provider_dir)
    # Models shared by this provider according to the sharing sheet.
    is_this_provider = providers['name'] == provider
    model_ids = list(providers[is_this_provider]['model_id'])

    # Sheets that are filtered on model_id only.
    pdx_path = join(out_path, "PIVOT/PIVOT_metadata-pdx_model.tsv")
    model_val_path = join(out_path, "PIVOT/PIVOT_metadata-model_validation.tsv")
    sharing_sheet_path = join(out_path, "PIVOT/PIVOT_metadata-sharing.tsv")
    model_sheets = (
        (pdx_path, '_metadata-pdx_model.tsv'),                # PDX models sheet
        (model_val_path, '_metadata-model_validation.tsv'),   # Model validation sheet
        (sharing_sheet_path, '_metadata-sharing.tsv'),        # Sharing sheet
    )
    for sheet_path, sheet_suffix in model_sheets:
        sheet_subset = get_models(sheet_path, model_ids)
        sheet_subset.to_csv(join(provider_dir, short_name+sheet_suffix), sep='\t', index=False)

    # Patient and patient sample sheet
    patient_sheet_path = join(out_path, "PIVOT/PIVOT_metadata-patient.tsv")
    patient_sample_sheet_path = join(out_path, "PIVOT/PIVOT_metadata-patient_sample.tsv")
    get_patients(patient_sheet_path, patient_sample_sheet_path, provider_dir, short_name, model_ids)

    # Molecular data
    get_molecular_data(join(out_path, "PIVOT"), provider_dir, 'PIVOT', short_name,  model_ids)

Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/CCIA/cna/CCIA_cna_ALL-17.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/CCIA/expression/CCIA_expression_ALL-17.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/GCCRI/cna/GCCRI_cna_KT-5.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/GCCRI/cna/GCCRI_cna_OS-25.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/GCCRI/cna/GCCRI_cna_OS-36.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/GCCRI/expression/GCCRI_expression_KT-5.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/GCCRI/expression/GCCRI_expression_OS-25.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/GCCRI/expression/GCCRI_expression_OS-36.tsv
Error occurred while copying file: /Users/tushar/pdx/pdxfinder-data/data/UPD

# HCMI

In [14]:
# Sharing sheet of the combined HCMI dataset; skiprows drops the four lines
# that directly follow the header line.
providers = pd.read_csv("/Users/tushar/pdx/pdxfinder-data/data/UPDOG/HCMI/HCMI_metadata-sharing.tsv", sep='\t', skiprows=[1,2,3,4])
# The provider code is the second dash-separated token of each model_id.
# sorted() instead of list(set(...)): set iteration order of strings varies
# between kernel sessions, which made the displayed list and the processing
# order of the split loop non-reproducible.
provider_abrv = sorted({id_parts[1] for id_parts in providers.model_id.str.split('-')})
# CSHL: Cold Spring Harbor Laboratory
# BROD: Broad Institute of MIT and Harvard
# SANG: Wellcome Sanger Institute
provider_abrv

['BROD', 'CSHL', 'SANG']

In [19]:
# Split the combined HCMI dataset into one folder per provider code.
for source in provider_abrv:
    # A model belongs to a provider when its id contains "-<code>-".
    provider_tag = '-'+source+'-'
    has_provider_tag = providers.model_id.str.contains(provider_tag)
    model_ids = list(providers.model_id[has_provider_tag])

    provider_dir = join(out_path, source)
    if not exists(provider_dir):
        os.mkdir(provider_dir)

    # Sheets that are filtered on model_id only.
    pdx_path = join(out_path, "HCMI/HCMI_metadata-cell_model.tsv")
    model_val_path = join(out_path, "HCMI/HCMI_metadata-model_validation.tsv")
    sharing_sheet_path = join(out_path, "HCMI/HCMI_metadata-sharing.tsv")
    model_sheets = (
        (pdx_path, '_metadata-cell_model.tsv'),               # cell models sheet
        (model_val_path, '_metadata-model_validation.tsv'),   # Model validation sheet
        (sharing_sheet_path, '_metadata-sharing.tsv'),        # Sharing sheet
    )
    for sheet_path, sheet_suffix in model_sheets:
        sheet_subset = get_models(sheet_path, model_ids)
        sheet_subset.to_csv(join(provider_dir, source+sheet_suffix), sep='\t', index=False)

    # Patient and patient sample sheet
    patient_sheet_path = join(out_path, "HCMI/HCMI_metadata-patient.tsv")
    patient_sample_sheet_path = join(out_path, "HCMI/HCMI_metadata-patient_sample.tsv")
    get_patients(patient_sheet_path, patient_sample_sheet_path, provider_dir, source, model_ids)

    # Molecular data
    get_molecular_data(join(out_path, "HCMI"), provider_dir, 'HCMI', source,  model_ids)