### Package imports

In [3]:
from os import listdir, getcwd, rename, makedirs
from os.path import isfile, join, isdir, exists
import pandas as pd
import logging
from PIL import Image
from tqdm import tqdm

### Common functions

In [4]:
def get_dirs(path):
    """Return the names of the directories located directly inside ``path``.

    Only entry names are returned (not full paths), in the order in which
    ``listdir`` reports them.
    """
    subdirectories = []
    for entry in listdir(path):
        if isdir(join(path, entry)):
            subdirectories.append(entry)
    return subdirectories

def get_files(path):
    """Return the names of the regular files located directly inside ``path``.

    Only entry names are returned (not full paths), in the order in which
    ``listdir`` reports them; sub-directories are left out.
    """
    file_names = []
    for entry in listdir(path):
        if isfile(join(path, entry)):
            file_names.append(entry)
    return file_names

def read_metadata_without_fields(path):
    """Load a tab-separated metadata sheet without its ``Field`` annotations.

    When the sheet has a ``Field`` column, every row whose ``Field`` value
    starts with ``#`` is removed, the remaining rows are renumbered from 0 and
    the ``Field`` column itself is dropped. A sheet without a ``Field`` column
    is returned exactly as parsed.
    """
    table = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    if 'Field' not in table.columns:
        return table
    # Rows with a missing ``Field`` value compare as "not True" and are kept;
    # only an explicit '#' prefix removes a row.
    starts_with_hash = table['Field'].str.startswith('#')
    data_rows = table.loc[starts_with_hash.ne(True)].reset_index(drop=True)
    return data_rows.drop('Field', axis=1)

def read_metadata_with_fields(path):
    """Load a tab-separated metadata sheet exactly as stored.

    Nothing is filtered: a ``Field`` column and any ``#``-prefixed rows it
    marks stay in the returned frame.
    """
    return pd.read_csv(path, sep='\t', na_values="", low_memory=False)


def sort_case_insensitive(sort_list):
    """Return a new list holding the items of ``sort_list`` ordered ignoring case.

    Items are compared by the case-folded form of their string representation.
    Strings therefore sort exactly as with ``str.casefold``, while non-string
    items (for example the NaN that ``Series.unique()`` yields for empty
    cells) are ordered by their text instead of raising ``TypeError``.
    The returned items themselves are unchanged.
    """
    return sorted(sort_list, key=lambda item: str(item).casefold())


### Common env

In [5]:
# Working directory at the time this cell runs; it is not referenced again in the cells shown here.
start_dir = getcwd()
# Root folder containing one sub-directory per data provider (the loops below iterate get_dirs(home)).
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
home = "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/"

## Generate XLSX

In [6]:
def generate_xlsx(path, provider):
    """Bundle a provider's metadata TSV sheets into a single Excel workbook.

    Every file directly inside ``path`` whose name ends with ``.tsv`` and
    contains ``<provider>_metadata-`` becomes one worksheet of
    ``<provider>_metadata.xlsx``. The worksheet is named after the part of the
    file name that follows ``<provider>_metadata-`` (without ``.tsv``) and
    holds the sheet as read by ``read_metadata_with_fields``, i.e. with its
    ``Field`` rows intact.

    The workbook is written to the folder obtained by swapping the UPDOG root
    prefix of ``path`` for the data-submission root; that folder is created if
    needed, even when no matching sheet is found.

    Parameters
    ----------
    path : str
        Provider folder that holds the ``*_metadata-*.tsv`` files.
    provider : str
        Provider abbreviation used in the file names.
    """
    file_list = get_files(path)
    # Mirror the provider folder from the UPDOG tree into the data-submission tree.
    new_path = path.replace('/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/', '/Users/tushar/CancerModels/pdxfinder-data/data/data-submission/')
    makedirs(new_path, exist_ok=True)

    sheet_prefix = provider + "_metadata-"
    files_to_extract = [f for f in file_list if f.endswith('.tsv') and sheet_prefix in f]
    if not files_to_extract:
        # The inputs are TSV sheets; the workbook is the output, not something to be "found".
        print("No metadata .tsv file found for: " + provider)
        return

    print("writing excel for: " + provider)
    with pd.ExcelWriter(join(new_path, provider + "_metadata.xlsx")) as writer:
        for f in files_to_extract:
            metadata = read_metadata_with_fields(join(path, f))
            sheetname = f.replace(sheet_prefix, "").replace(".tsv", "")
            metadata.to_excel(writer, sheet_name=sheetname, index=False)
            print("\tSheet added: " + f)

In [7]:
# Build one <provider>_metadata.xlsx per provider folder found under `home`, in alphabetical order.
for provider in sorted(get_dirs(home)): ## get_dirs will get the provider dirs in updog
    #print("Working on provider: "+provider)
    generate_xlsx(join(home, provider), provider) ## Bundle the provider's metadata TSV sheets into one workbook

writing excel for: BROD
	Sheet added: BROD_metadata-patient_sample.tsv
	Sheet added: BROD_metadata-model_validation.tsv
	Sheet added: BROD_metadata-cell_model.tsv
	Sheet added: BROD_metadata-patient.tsv
	Sheet added: BROD_metadata-sharing.tsv
writing excel for: CCIA
	Sheet added: CCIA_metadata-model_validation.tsv
	Sheet added: CCIA_metadata-pdx_model.tsv
	Sheet added: CCIA_metadata-patient_sample.tsv
	Sheet added: CCIA_metadata-sharing.tsv
	Sheet added: CCIA_metadata-patient.tsv
writing excel for: CHOP
	Sheet added: CHOP_metadata-patient_sample.tsv
	Sheet added: CHOP_metadata-pdx_model.tsv
	Sheet added: CHOP_metadata-model_validation.tsv
	Sheet added: CHOP_metadata-sharing.tsv
	Sheet added: CHOP_metadata-patient.tsv
writing excel for: CMP
	Sheet added: CMP_metadata-patient_sample.tsv
	Sheet added: CMP_metadata-sharing.tsv
	Sheet added: CMP_metadata-patient.tsv
	Sheet added: CMP_metadata-model_validation.tsv
	Sheet added: CMP_metadata-cell_model.tsv
writing excel for: CRL
	Sheet added:

## Collection site typo


In [24]:
def get_collection_site(path, provider, cs_df):
    """Append a provider's ``collection_site`` values to an accumulating frame.

    Every file directly inside ``path`` whose name ends with
    ``_metadata-patient_sample.tsv`` is read with
    ``read_metadata_without_fields``; its ``collection_site`` column, tagged
    with a ``provider`` column, is appended below the rows of ``cs_df``.

    Parameters
    ----------
    path : str
        Provider folder to scan.
    provider : str
        Value stored in the ``provider`` column of the appended rows.
    cs_df : pandas.DataFrame
        Rows collected so far (may be empty).

    Returns
    -------
    pandas.DataFrame
        ``cs_df`` followed by the new rows, re-indexed from 0. When no
        matching file exists a message is printed and ``cs_df`` is returned
        unchanged.

    Raises
    ------
    KeyError
        If a patient-sample sheet has no ``collection_site`` column.
    """
    files_to_extract = [f for f in get_files(path) if f.endswith('_metadata-patient_sample.tsv')]
    if not files_to_extract:
        # Name the provider so the message is useful when called in a loop.
        print(provider + ": No file found.")
        return cs_df

    frames = [cs_df]
    for f in files_to_extract:
        metadata = read_metadata_without_fields(join(path, f))
        frames.append(metadata[['collection_site']].assign(provider=provider))
    # Concatenate once rather than re-copying the accumulated rows per file.
    return pd.concat(frames).reset_index(drop=True)

In [58]:
# Collect the collection_site values of every provider's patient_sample sheet.
PS_collection_site = pd.DataFrame()
for provider in sorted(get_dirs(home)):
    PS_collection_site = get_collection_site(
        join(home, provider), provider, PS_collection_site
    )

# One row per (provider, collection_site) pair, ordered by site name ignoring case.
Unique_CS = (
    PS_collection_site
    .sort_values(by=['collection_site'], key=lambda col: col.str.lower())
    .drop_duplicates(subset=['provider', 'collection_site'], keep='first')
    .reset_index(drop=True)
)

In [59]:
# Display every distinct collection_site value across all providers, sorted ignoring case,
# so that near-duplicate spellings end up next to each other.
sort_case_insensitive(list(PS_collection_site.collection_site.unique()))

['4th ventricle',
 'Abdomen',
 'Abdomen Ascites',
 'Abdominal',
 'Abdominal Cavity',
 'Abdominal Mass',
 'Abdominal Wall',
 'Abdominopelvic Cavity',
 'Adrenal',
 'Adrenal Cortex',
 'Adrenal Gland',
 'Adrenal Mass',
 'Adrenal Resection',
 'Adrenal Tissue',
 'Alveolus',
 'Ampulla',
 'Ampulla of Vater',
 'Anal',
 'Anus',
 'Aortocaval',
 'Arm',
 'Ascending Colon',
 'Ascites',
 'Ascites fluid',
 'Axilla',
 'Axillary Lymph Node',
 'Back',
 'Back Mass',
 'Back soft tissue',
 'Base Of Tongue',
 'Bilateral',
 'Bile Duct',
 'Biliary Tract',
 'Bladder',
 'Bladder Dome',
 'Blood',
 'Bone',
 'Bone (Left Proximal Humerus)',
 'Bone (right humerus)',
 'Bone Marrow',
 'Bowel',
 'Brachial Muscle',
 'Brain',
 'Brain (Right Cerebellum, Right Parietal)',
 'Brain Stem',
 'Breast',
 'Bronchus',
 'Buccal Mucosa',
 'Buttock',
 'Caecum',
 'Calf',
 'Cecum',
 'Cerebellar Tentorium',
 'Cerebellum',
 'Cerebral hemisphere',
 'Cerebrospinal fluid ',
 'Cerebrum',
 'Cervical endometrium',
 'Cervical Lymph Node',
 'Cerv

In [56]:
# Display the distinct collection_site values in the row order of Unique_CS.
# NOTE(review): this cell's execution count (56) is lower than that of the cell assigning
# Unique_CS (58), so the saved output may come from an earlier state — re-run on a fresh kernel.
list(Unique_CS.collection_site.unique())

['4th ventricle',
 'Abdomen',
 'Abdomen Ascites',
 'Abdominal',
 'Abdominal Cavity',
 'Abdominal Mass',
 'Abdominal Wall',
 'Abdominopelvic Cavity',
 'Adrenal',
 'Adrenal Cortex',
 'Adrenal Gland',
 'Adrenal Mass',
 'Adrenal Resection',
 'Adrenal Tissue',
 'Alveolus',
 'Ampulla',
 'Ampulla of Vater',
 'Anal',
 'Anus',
 'Aortocaval',
 'Arm',
 'Ascending Colon',
 'Ascites',
 'Ascites ',
 'Ascites fluid',
 'Axilla',
 'Axillary Lymph Node',
 'Back',
 'Back Mass',
 'Back soft tissue',
 'Base Of Tongue',
 'Bilateral',
 'Bile Duct',
 'Biliary Tract',
 'Bladder',
 'Bladder Dome',
 'Blood',
 'Bone',
 'Bone (Left Proximal Humerus)',
 'Bone (right humerus)',
 'Bone Marrow',
 'Bowel',
 'Brachial Muscle',
 'Brain',
 'Brain (Right Cerebellum, Right Parietal)',
 'Brain Stem',
 'Breast',
 'Bronchus',
 'Buccal Mucosa',
 'Buttock',
 'Caecum',
 'Calf',
 'Cecum',
 'Cerebellar Tentorium',
 'Cerebellum',
 'Cerebral hemisphere',
 'Cerebrospinal fluid ',
 'Cerebrum',
 'Cervical endometrium',
 'Cervical Lymph 

## Data type STRING to Number

In [None]:
def change_data_type(path, provider, mol_type):
    """Replace missing values with 0 in the numeric columns of molecular data files.

    Every ``.tsv`` file directly inside ``path`` is read with
    ``read_metadata_without_fields``, the columns registered for ``mol_type``
    that the file actually contains get their missing values filled with 0,
    and the file is written back in place. Because the file is read through
    ``read_metadata_without_fields``, a ``Field`` column and its ``#`` rows
    are not present in the rewritten file. Files containing none of the
    registered columns are left untouched.

    Parameters
    ----------
    path : str
        Folder holding the provider's files for this data type. A missing
        folder is reported and skipped.
    provider : str
        Provider abbreviation, used in the printed messages only.
    mol_type : str
        One of ``"cna"``, ``"mut"`` or ``"expression"``.

    Raises
    ------
    KeyError
        If ``mol_type`` is not one of the registered data types.
    """
    cols = {"cna": ["log10r_cna", "log2r_cna"],
            "mut": ["read_depth"], ## "allele_frequency" and "seq_start_position: cant be set as 0
            "expression": ["z_score", "rnaseq_tpm", "rnaseq_fpkm", "rnaseq_count",
                           "affy_hgea_expression_value", "illumina_hgea_expression_value"]} ## "rnaseq_coverage" cant be set as 0
    fillcols = cols[mol_type]

    # Not every provider has every data type; listing a missing folder would raise.
    if not isdir(path):
        print(provider + ": No file found.")
        return

    files_to_extract = [f for f in get_files(path) if f.endswith('.tsv')]
    if not files_to_extract:
        print(provider + ": No file found.")
        return

    print("Filling missing " + mol_type + " values for: " + provider)
    for f in files_to_extract:
        file_path = join(path, f)
        metadata = read_metadata_without_fields(file_path)
        # Restrict to the columns this file has; selecting an absent one raises KeyError.
        present = [c for c in fillcols if c in metadata.columns]
        if not present:
            continue
        metadata[present] = metadata[present].fillna(0)
        metadata.to_csv(file_path, sep='\t', index=False)

In [None]:
# Apply the fill to every molecular data type of every provider; the files are looked up
# under <home>/<provider>/<mol_type>.
mol_types = ["cna", "mut", "expression"]
for mol_type in mol_types:
    for provider in sorted(get_dirs(home)): ## get_dirs will get the provider dirs in updog
        change_data_type(join(home, provider, mol_type), provider, mol_type) ## Fill missing numeric values in the provider's files of this data type

## JAX Histology Images
### Generate file list

In [55]:
# Inputs and outputs of the JAX histology image submission.
# NOTE(review): absolute, machine-specific paths — adjust before running on another machine.
# Workbook describing the images; read by process_JAX_histology_excel.
JAX_Image_file_path = "/Users/tushar/CancerModels/submission/JAX - BIA/JAXPDXHistologyImageDetails.xlsx"
# File-list workbook that generate_TIFF_and_fileList writes.
JAX_BIA_fileList_path = "/Users/tushar/CancerModels/submission/JAX - BIA/JAX_PDX_BIA_fileList.xlsx"
# Folder holding the source .jpg images.
input_path = "/Users/tushar/CancerModels/submission/JAX - BIA/uploads"
# Folder receiving one sub-folder of converted .tiff images per model.
output_path = "/Users/tushar/CancerModels/submission/JAX - BIA/submission"

In [64]:
def process_JAX_histology_excel(excel_in):
    """Read the JAX histology image workbook and keep only its PDX rows.

    The workbook is sorted by ``model_id``. Rows whose ``description``
    contains ``"PDX"`` are labelled ``sample_type == "pdx"`` and rows whose
    ``description`` contains ``"Patient"`` are labelled ``"patient"`` (the
    latter wins when both match). ``passage`` is set to the first two
    characters of ``description``, or ``"-"`` for patient rows.

    Parameters
    ----------
    excel_in : str
        Path of the workbook; it must provide ``model_id`` and ``description``
        columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows labelled ``"pdx"``, keeping the row
        labels they had after sorting.
    """
    JAX_BIA_file = pd.read_excel(excel_in).sort_values("model_id").reset_index(drop=True)
    # na=False: an empty description matches neither label. Without it the
    # mask contains NaN, which .loc refuses as a boolean indexer.
    is_pdx = JAX_BIA_file.description.str.contains("PDX", na=False)
    is_patient = JAX_BIA_file.description.str.contains("Patient", na=False)
    JAX_BIA_file.loc[is_pdx, "sample_type"] = "pdx"
    JAX_BIA_file.loc[is_patient, "sample_type"] = "patient"
    JAX_BIA_file['passage'] = JAX_BIA_file['description'].str[:2]
    JAX_BIA_file.loc[JAX_BIA_file.sample_type == 'patient', "passage"] = "-"
    # Return a copy so callers can add columns without writing into a slice
    # of the full frame.
    return JAX_BIA_file[JAX_BIA_file.sample_type == "pdx"].copy()

def convert_JPEG_to_TIFF(in_file, out_file):
    """Re-encode the image stored at ``in_file`` as a TIFF written to ``out_file``.

    Parameters
    ----------
    in_file : str
        Path of the source image.
    out_file : str
        Path of the TIFF file to create.
    """
    # The context manager releases the source file handle once the TIFF is
    # written; this function is called once per image in a long loop.
    with Image.open(in_file) as im:
        im.save(out_file, 'TIFF')


def generate_TIFF_and_fileList(dataframe, in_path, out_path, df_out_file):
    """Convert each listed image to TIFF, grouped per model, and write the file list.

    For every distinct ``model_id`` a folder ``<out_path>/<model_id>`` is
    created and each of that model's images is converted from
    ``<in_path>/<file>`` to a ``.tiff`` file in that folder. A ``Files``
    column (``submission/<model_id>/<tiff name>``) is added to ``dataframe``
    in place, and the frame is then saved to ``df_out_file`` without its index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``model_id`` and ``file`` columns; gains a ``Files`` column.
    in_path : str
        Folder holding the source images.
    out_path : str
        Folder receiving the per-model sub-folders; created if missing.
    df_out_file : str
        Path of the Excel file list to write.
    """
    makedirs(out_path, exist_ok=True)
    # regex=False: ".jpg" is meant literally. Read as a regular expression the
    # dot matches any character, and the default of `regex` depends on the
    # installed pandas version.
    tiff_names = dataframe['file'].str.replace(".jpg", ".tiff", regex=False)
    dataframe['Files'] = 'submission/' + dataframe['model_id'] + "/" + tiff_names

    for model in tqdm(dataframe.model_id.unique(), desc="Processing model images: "):
        new_path = join(out_path, model)
        makedirs(new_path, exist_ok=True)
        for file in dataframe.loc[dataframe.model_id == model, 'file']:
            out_file = file.replace(".jpg", ".tiff")
            convert_JPEG_to_TIFF(join(in_path, file), join(new_path, out_file))
    dataframe.to_excel(df_out_file, index=False)

def organise_data_for_BIA_submission(excel_in, excel_out, file_in, file_out):
    """Run the whole JAX histology preparation in one call.

    The image workbook ``excel_in`` is reduced to its PDX rows, the listed
    images are converted from folder ``file_in`` into folder ``file_out``,
    and the resulting file list is saved as ``excel_out``.
    """
    pdx_images = process_JAX_histology_excel(excel_in)
    generate_TIFF_and_fileList(
        dataframe=pdx_images,
        in_path=file_in,
        out_path=file_out,
        df_out_file=excel_out,
    )


In [65]:
# Convert the JAX PDX histology images to TIFF and write the accompanying file list.
organise_data_for_BIA_submission(JAX_Image_file_path, JAX_BIA_fileList_path, input_path, output_path)

Processing model images: 100%|██████████| 433/433 [00:30<00:00, 14.18it/s]


In [63]:
# Scratch preview of the .jpg -> .tiff renaming applied to the `file` column.
# NOTE(review): JAX_BIA_file is not assigned at notebook level in any cell shown here (it is
# only a local variable of process_JAX_histology_excel), so this cell presumably relies on
# leftover kernel state — confirm, or remove the cell.
JAX_BIA_file['file'].str.replace(".jpg", ".tiff")

0       J000077451180207133606.tiff
1       J000077451180207134927.tiff
2       J000077451170412111437.tiff
3       J000077451170412111512.tiff
4       J000077451170412111321.tiff
                   ...             
2974       TM01634180706134907.tiff
2975       TM01634180706135125.tiff
2976       TM01634180706134831.tiff
2977       TM01634180706135058.tiff
2978       TM01634180706135036.tiff
Name: file, Length: 2676, dtype: object

## LIH Gene expression data

In [30]:
# Folder holding the LIH (NORLUX PDOX) gene expression workbooks; the converted TSVs are
# written into this same folder by the loop below.
path = "/Users/tushar/CancerModels/submission/LIH/NORLUX_PDOX_gene_Expression/"
# Names of all files currently in that folder.
files = get_files(path)
# LIH molecular sample sheet; not referenced again in the cells shown here.
mol_sample = pd.read_csv("/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/LIH/LIH_molecular_metadata-sample.tsv", sep='\t')

In [34]:
# Convert each LIH expression workbook into a TSV laid out like the expression template.

# The template does not change between workbooks, so read it once and start
# each conversion from a fresh copy.
expression_template = pd.read_csv("/Users/tushar/CancerModels/pdxfinder-data/template/active_templates/expression/expression_template-sheet.tsv", sep="\t")

# Only .xlsx inputs are converted. The TSVs produced below are written into the
# same folder, so a re-listed `files` would otherwise hand them to read_excel.
expression_workbooks = [f for f in files if f.endswith(".xlsx")]

for f in tqdm(expression_workbooks, desc="Processing gene expression data: "):
    out_data = expression_template.copy()
    model = pd.read_excel(join(path, f))
    # Take the template's columns, in template order, from the workbook.
    out_data[out_data.columns] = model[out_data.columns]
    out_data["platform_id"] = "expression_NovaSeq_6000"
    # The `platform` column is left out of the written file.
    out_data.drop(['platform'], axis=1).to_csv(join(path, "LIH_expression_" + f.replace(".xlsx", ".tsv")), sep="\t", index=False)

Processing gene expression data: 100%|██████████| 28/28 [04:00<00:00,  8.60s/it]
