In [1]:
from os import listdir, getcwd, rename, makedirs
from os.path import isfile, join, isdir, exists
import pandas as pd
import logging

log = logging.getLogger(__name__)
logging.basicConfig(filename='merge_all_data_to_one.log', level=logging.INFO, format='%(levelname)s:%(asctime)s: %(message)s', datefmt='%d/%m/%Y %I:%M %p')
start_dir = getcwd()
home = "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG"

In [2]:
def get_dirs(path):
    return [f for f in listdir(path) if isdir(join(path, f))]

def get_files(path):
    return [f for f in listdir(path) if isfile(join(path, f))]

def read_mol_data(types, path, mol_sample):
    for mct in types:
        dir = ""
        if mct == 'expression':
            dir = 'expression'
        elif mct == 'copy number alteration':
            dir = 'cna'
        elif mct == 'mutation':
            dir = 'mut'
        elif mct == 'biomarker':
            dir = 'biomarker'
            
        if dir != "":
            files = [f for f in get_files(join(path, dir)) if f.endswith('.tsv')]
            if len(files) == 0:
                for d in get_dirs(join(path, dir)):
                    files = files + [join(d, f) for f in get_files(join(path, dir, d)) if f.endswith('.tsv')]
            if len(files) == 1:
                data = pd.read_csv(join(path, dir, files[0]), sep='\t')
                sample_ids = list(data['sample_id'].unique())
                mol_sample[dir] = sample_ids
            elif len(files) > 1:
                mol_sample[dir] = files

    return mol_sample    

def get_mol_files(row, mol_sample):
    dir = ""
    if row['molecular_characterisation_type'] == 'expression':
        dir = 'expression'
    elif row['molecular_characterisation_type'] == 'copy number alteration':
        dir = 'cna'
    elif row['molecular_characterisation_type'] == 'mutation':
        dir = 'mut'
    elif row['molecular_characterisation_type'] == 'biomarker':
        dir = 'biomarker'
    if dir != "":
        if row['sample_id'] in mol_sample[dir]:
            return [True, dir]
        else:
            for f in mol_sample[dir]:
                if str(f).__contains__(str(row['sample_id'])):
                    return [True, f]
                elif str(f).__contains__(str(row['model_id'])):
                    return [True, f]
    return [False, '']
    

def merge_data(path, data):
    tsv_files = sorted([f for f in get_files(path) if f.endswith('.tsv')])
    if len(tsv_files)>0:
        for f in tsv_files:
            if f.__contains__('patient.tsv') or f.__contains__('patient_sample.tsv') or f.__contains__('pdx_model.tsv') or f.__contains__('model_validation.tsv') or f.__contains__('cell_model.tsv') or f.__contains__('sharing.tsv'):
                #log.info("\t\tReading File: "+ f)
                #print(f)
                metadata = pd.read_csv(join(path,f), sep='\t', na_values="", low_memory=False)
                if 'Field' in metadata.columns:
                    metadata = metadata.loc[metadata.Field.astype('str').str.startswith('#') != True,].reset_index(drop=True)
                    metadata = metadata.drop('Field', axis=1)
                #metadata = metadata.iloc[4:,]
                if f.__contains__('patient.tsv'):
                    data[0] = data[0]._append(metadata)
                    log.info("\tFile: "+ f)
                elif f.__contains__('patient_sample.tsv'):
                    data[1] = data[1]._append(metadata)
                    log.info("\tFile: "+ f)
                elif f.__contains__('pdx_model.tsv'):
                    metadata['provider'] = path.replace(home, '')
                    data[2] = data[2]._append(metadata)
                    log.info("\tFile: "+ f)
                elif f.__contains__('model_validation.tsv'):
                    data[3] = data[3]._append(metadata)
                    log.info("\tFile: "+ f)
                elif f.__contains__('cell_model.tsv'):
                    data[4] = data[4]._append(metadata)
                    log.info("\tFile: "+ f)
                elif f.__contains__('sharing.tsv'):
                    data[5] = data[5]._append(metadata)
                    log.info("\tFile: "+ f)
                metadata = pd.DataFrame()
            if f.__contains__('_molecular_metadata-sample.tsv'):
                metadata = pd.read_csv(join(path,f), sep='\t', na_values="", low_memory=False)
                provider = path.replace(home+'/', '')
                platform = pd.read_csv(join(path,provider+'_molecular_metadata-platform.tsv'), sep='\t', na_values="", low_memory=False)
                if 'Field' in metadata.columns:
                    metadata = metadata.loc[metadata.Field.astype('str').str.startswith('#') != True,].reset_index(drop=True)
                    metadata = metadata.drop('Field', axis=1)
                if 'Field' in platform.columns:
                    platform = platform.loc[platform.Field.astype('str').str.startswith('#') != True,].reset_index(drop=True)
                    platform = platform.drop('Field', axis=1)
                metadata = metadata.merge(platform, on='platform_id', how='outer')
                mol_sample = {}
                mol_sample = read_mol_data(metadata['molecular_characterisation_type'].unique(), path, mol_sample)
                metadata['files'] = metadata.apply(get_mol_files, mol_sample=mol_sample, axis=1)
                metadata['provider'] = provider
                data[6] = data[6]._append(metadata)
                metadata = pd.DataFrame()
    else:
        log.info("No .xlsx file found.")
    
    return  data

patient, ps, pdx, mv, cm, share, files = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame() , pd.DataFrame()
data = [patient, ps, pdx, mv, cm, share, files]
for provider in sorted(get_dirs(home)): ## get_dirs will get the provider dirs in updog
    log.info("Working on provider: "+provider)
    data = merge_data(join(home, provider), data)

  data = pd.read_csv(join(path, dir, files[0]), sep='\t')
  data = pd.read_csv(join(path, dir, files[0]), sep='\t')
  data = pd.read_csv(join(path, dir, files[0]), sep='\t')


In [5]:
with pd.ExcelWriter('/Users/tushar/CancerModels/validation_all_data.xlsx') as writer:
    data[0].to_excel(writer, sheet_name='patient', index=False)
    data[1].to_excel(writer, sheet_name='patient_sample', index=False)
    data[2].to_excel(writer, sheet_name='pdx_model', index=False)
    data[3].to_excel(writer, sheet_name='model_validation', index=False)
    data[4].to_excel(writer, sheet_name='cell_model', index=False)
    data[5].to_excel(writer, sheet_name='sharing', index=False)

In [12]:
merged = pd.DataFrame
for i in range(5):
        if i == 0:
            merged = data[0].merge(data[1], on='patient_id', how='outer').reset_index(drop=True)     
        else:
            merged = merged.merge(data[i+1], on='model_id', how='outer').reset_index(drop=True)              

In [16]:
merged.to_csv('/Users/tushar/CancerModels/merged_all_data.tsv', sep='\t', index=False)

In [3]:
data[6][[True if t[0] == False else False for t in data[6]['files']]][['provider', 'model_id', 'sample_id', 'molecular_characterisation_type', 'platform_id']]

Unnamed: 0,provider,model_id,sample_id,molecular_characterisation_type,platform_id
52,Curie-BC,HBCx-2,BC51,mutation,mutation_NGS_ESOPE
53,Curie-BC,HBCx-8,BC138,mutation,mutation_NGS_ESOPE
54,Curie-BC,HBCx-30,BC291,mutation,mutation_NGS_ESOPE
55,Curie-BC,HBCx-31,BC297,mutation,mutation_NGS_ESOPE
56,Curie-BC,HBCx-39,BC408,mutation,mutation_NGS_ESOPE
...,...,...,...,...,...
3423,PDMR,K79811-243-R,K79811-243-R-M96,mutation,mutation_OncoKB_Gene_Panel
3424,PDMR,K79811-243-R,K79811-243-R-M94M108M143,mutation,mutation_OncoKB_Gene_Panel
47,PMLB,PMLB27459,27459,copy number alteration,copy_number_alteration_Targeted_Next_generatio...
24,TRACE,MEL0005CM,MEL0005CMH0000000000VT9900,mutation,mutation_RNA_sequencing
