### Package imports

In [4]:
from os import listdir, getcwd, rename, makedirs
from os.path import isfile, join, isdir, exists
import pandas as pd
import logging
from PIL import Image
from tqdm import tqdm

### Common functions

In [2]:
def get_dirs(path):
    return [f for f in listdir(path) if isdir(join(path, f))]

def get_files(path):
    return [f for f in listdir(path) if isfile(join(path, f))]

def read_metadata_without_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    if 'Field' in metadata.columns:
        metadata = metadata.loc[metadata.Field.str.startswith('#') != True,].reset_index(drop=True)
        metadata = metadata.drop('Field', axis=1)
    return metadata

def read_metadata_with_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    return metadata


def sort_case_insensitive(sort_list):
    return sorted(sort_list, key=str.casefold)


### Common env

In [3]:
start_dir = getcwd()
home = "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/"

## Generate XLSX

In [6]:
def generate_xlsx(path, provider):
    file_list = get_files(path)
    new_path = path.replace('/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/', '/Users/tushar/CancerModels/pdxfinder-data/data/data-submission/')
    if not exists(new_path):
        makedirs(new_path)
    files_to_extract = [f for f in file_list if f.endswith('.tsv') and f.__contains__(provider+'_metadata-')]
    if len(files_to_extract)>0:
        print("writing excel for: "+ provider)
        with pd.ExcelWriter(join(new_path, provider+"_metadata.xlsx")) as writer:
            for f in files_to_extract:
                print("\tSheet added: " + f)
                metadata = read_metadata_with_fields(join(path, f))
                sheetname = f.replace(provider+"_metadata-", "").replace(".tsv", "")
                metadata.to_excel(writer, sheet_name=sheetname, index=False)
                
    else:
        print("No .xlsx file found.")

In [9]:
for provider in sorted(get_dirs(home)): ## get_dirs will get the provider dirs in updog
    #print("Working on provider: "+provider)
    generate_xlsx(join(home, provider), provider) ## File rename: pdx_models to pdx_model using the provider path

writing excel for: BROD
	Sheet added: BROD_metadata-patient_sample.tsv
	Sheet added: BROD_metadata-model_validation.tsv
	Sheet added: BROD_metadata-cell_model.tsv
	Sheet added: BROD_metadata-patient.tsv
	Sheet added: BROD_metadata-sharing.tsv
writing excel for: CCIA
	Sheet added: CCIA_metadata-model_validation.tsv
	Sheet added: CCIA_metadata-pdx_model.tsv
	Sheet added: CCIA_metadata-patient_sample.tsv
	Sheet added: CCIA_metadata-sharing.tsv
	Sheet added: CCIA_metadata-patient.tsv
writing excel for: CHOP
	Sheet added: CHOP_metadata-patient_sample.tsv
	Sheet added: CHOP_metadata-pdx_model.tsv
	Sheet added: CHOP_metadata-model_validation.tsv
	Sheet added: CHOP_metadata-sharing.tsv
	Sheet added: CHOP_metadata-patient.tsv
writing excel for: CMP
	Sheet added: CMP_metadata-patient_sample.tsv
	Sheet added: CMP_metadata-sharing.tsv
	Sheet added: CMP_metadata-patient.tsv
	Sheet added: CMP_metadata-model_validation.tsv
	Sheet added: CMP_metadata-cell_model.tsv
writing excel for: CRL
	Sheet added:

## Collection site typo


In [24]:
def get_collection_site(path, provider, cs_df):
    file_list = get_files(path)
    files_to_extract = [f for f in file_list if f.endswith('_metadata-patient_sample.tsv')]
    if len(files_to_extract)>0:
        #print("Reading Patient sample for: "+ provider)
        for f in files_to_extract:
            metadata = read_metadata_without_fields(join(path, f))
            metadata['provider'] = provider
            cs_df = pd.concat([cs_df, metadata[['collection_site', 'provider']]]).reset_index(drop=True)
    else:
        print("No file found.")
    return cs_df

In [58]:
PS_collection_site = pd.DataFrame()
for provider in sorted(get_dirs(home)): ## get_dirs will get the provider dirs in updog
    PS_collection_site = get_collection_site(join(home, provider), provider, PS_collection_site) ## File rename: pdx_models to pdx_model using the provider path
Unique_CS = PS_collection_site.sort_values(by=['collection_site'], key=lambda col: col.str.lower()).drop_duplicates(subset=['provider', 'collection_site'], keep='first').reset_index(drop=True)

In [59]:
sort_case_insensitive(list(PS_collection_site.collection_site.unique()))

['4th ventricle',
 'Abdomen',
 'Abdomen Ascites',
 'Abdominal',
 'Abdominal Cavity',
 'Abdominal Mass',
 'Abdominal Wall',
 'Abdominopelvic Cavity',
 'Adrenal',
 'Adrenal Cortex',
 'Adrenal Gland',
 'Adrenal Mass',
 'Adrenal Resection',
 'Adrenal Tissue',
 'Alveolus',
 'Ampulla',
 'Ampulla of Vater',
 'Anal',
 'Anus',
 'Aortocaval',
 'Arm',
 'Ascending Colon',
 'Ascites',
 'Ascites fluid',
 'Axilla',
 'Axillary Lymph Node',
 'Back',
 'Back Mass',
 'Back soft tissue',
 'Base Of Tongue',
 'Bilateral',
 'Bile Duct',
 'Biliary Tract',
 'Bladder',
 'Bladder Dome',
 'Blood',
 'Bone',
 'Bone (Left Proximal Humerus)',
 'Bone (right humerus)',
 'Bone Marrow',
 'Bowel',
 'Brachial Muscle',
 'Brain',
 'Brain (Right Cerebellum, Right Parietal)',
 'Brain Stem',
 'Breast',
 'Bronchus',
 'Buccal Mucosa',
 'Buttock',
 'Caecum',
 'Calf',
 'Cecum',
 'Cerebellar Tentorium',
 'Cerebellum',
 'Cerebral hemisphere',
 'Cerebrospinal fluid ',
 'Cerebrum',
 'Cervical endometrium',
 'Cervical Lymph Node',
 'Cerv

In [56]:
list(Unique_CS.collection_site.unique())

['4th ventricle',
 'Abdomen',
 'Abdomen Ascites',
 'Abdominal',
 'Abdominal Cavity',
 'Abdominal Mass',
 'Abdominal Wall',
 'Abdominopelvic Cavity',
 'Adrenal',
 'Adrenal Cortex',
 'Adrenal Gland',
 'Adrenal Mass',
 'Adrenal Resection',
 'Adrenal Tissue',
 'Alveolus',
 'Ampulla',
 'Ampulla of Vater',
 'Anal',
 'Anus',
 'Aortocaval',
 'Arm',
 'Ascending Colon',
 'Ascites',
 'Ascites ',
 'Ascites fluid',
 'Axilla',
 'Axillary Lymph Node',
 'Back',
 'Back Mass',
 'Back soft tissue',
 'Base Of Tongue',
 'Bilateral',
 'Bile Duct',
 'Biliary Tract',
 'Bladder',
 'Bladder Dome',
 'Blood',
 'Bone',
 'Bone (Left Proximal Humerus)',
 'Bone (right humerus)',
 'Bone Marrow',
 'Bowel',
 'Brachial Muscle',
 'Brain',
 'Brain (Right Cerebellum, Right Parietal)',
 'Brain Stem',
 'Breast',
 'Bronchus',
 'Buccal Mucosa',
 'Buttock',
 'Caecum',
 'Calf',
 'Cecum',
 'Cerebellar Tentorium',
 'Cerebellum',
 'Cerebral hemisphere',
 'Cerebrospinal fluid ',
 'Cerebrum',
 'Cervical endometrium',
 'Cervical Lymph 

## Data type STRING to Number

In [32]:
def change_data_type(path, provider, mol_type):
    cols = {"cna": ["log10r_cna", "log2r_cna"],
            "mut": ["read_depth"], ## "allele_frequency" and "seq_start_position: cant be set as 0
            "expression": ["z_score", "rnaseq_tpm", "rnaseq_fpkm", "rnaseq_count",
                           "affy_hgea_expression_value", "illumina_hgea_expression_value"]} ## "rnaseq_coverage" cant be set as 0

    file_list = get_files(path)
    files_to_extract = [f for f in file_list if f.endswith('.tsv')]
    fillcols = cols[mol_type]
    if len(files_to_extract)>0:
        print("Reading Patient sample for: "+ provider)
        for f in files_to_extract:
            ## Implement some checks for expression columns
            file_path = join(path, f)
            metadata = read_metadata_without_fields(file_path)
            metadata[fillcols] = metadata[fillcols].fillna(0)
            metadata.to_csv(file_path, sep='\t', index=False)
    else:
        print(provider + ": No file found.")

In [33]:
mol_types = ["cna", "mut", "expression"]
for mol_type in mol_types:
    for provider in sorted(get_dirs(home)): ## get_dirs will get the provider dirs in updog
        change_data_type(join(home, provider, mol_type), provider, mol_type) ## File rename: pdx_models to pdx_model using the

BROD: No file found.
Reading Patient sample for: CCIA
Reading Patient sample for: CHOP
Reading Patient sample for: CMP
Reading Patient sample for: CRL
CSHL: No file found.
Reading Patient sample for: CUIMC
Curie-BC: No file found.
Reading Patient sample for: Curie-LC
Curie-OC: No file found.
DFCI-CPDM: No file found.
Reading Patient sample for: GCCRI
HCI-BCM: No file found.
HKU: No file found.
Reading Patient sample for: IRCC-CRC
Reading Patient sample for: IRCC-GC
Reading Patient sample for: JAX
Reading Patient sample for: LIH
Reading Patient sample for: LurieChildrens
MDAnderson: No file found.
Reading Patient sample for: MDAnderson-CCH
Reading Patient sample for: NKI
PDMR: No file found.
Reading Patient sample for: PMLB
PMLB-Organoid: No file found.
SANG: No file found.
SJCRH: No file found.
TRACE: No file found.
UCD: No file found.
UMCG: No file found.
Reading Patient sample for: UOC-BC
UOM-BC: No file found.
VHIO-BC: No file found.
VHIO-CRC: No file found.
VHIO-PC: No file found.


1. BROD: 42
2. CCIA: 1
3. CHOP: 1
4. CMP: 2
5. CRL: 1
6. CSHL: 38
7. CUIMC: 2
8. Curie-BC: 5
9. Curie-LC: 2
10. GCCRI: 1
11. IRCC-CRC: 1
12. IRCC-GC: 1
13. JAX: 354
14. LIH: 2
15. LurieChildrens: 1
16. MDAnderson-CCH: 1
17. NKI: 7
18. PDMR: 2
19. PMLB: 3
20. TRACE: 2
21. UOC-BC: 2
22. UOM-BC: 1
23. VHIO-CRC: 2
49.375


## JAX Histology Images
### Generate file list

In [55]:
JAX_Image_file_path = "/Users/tushar/CancerModels/submission/JAX - BIA/JAXPDXHistologyImageDetails.xlsx"
JAX_BIA_fileList_path = "/Users/tushar/CancerModels/submission/JAX - BIA/JAX_PDX_BIA_fileList.xlsx"
input_path = "/Users/tushar/CancerModels/submission/JAX - BIA/uploads"
output_path = "/Users/tushar/CancerModels/submission/JAX - BIA/submission"

In [45]:
def process_JAX_histology_excel(excel_in):
    JAX_BIA_file = pd.read_excel(excel_in).sort_values("model_id").reset_index(drop=True)
    JAX_BIA_file.loc[JAX_BIA_file.description.str.contains("PDX"), "sample_type"] = "pdx"
    JAX_BIA_file.loc[JAX_BIA_file.description.str.contains("Patient"), "sample_type"] = "patient"
    JAX_BIA_file['passage'] = JAX_BIA_file['description'].str[:2]
    JAX_BIA_file.loc[JAX_BIA_file.sample_type == 'patient', "passage"] = "-"
    JAX_BIA_file = JAX_BIA_file[JAX_BIA_file.sample_type == "pdx"]
    return JAX_BIA_file

def convert_JPEG_to_TIFF(in_file, out_file):
    im = Image.open(in_file)
    im.save(out_file, 'TIFF')


def generate_TIFF_and_fileList(dataframe, in_path, out_path, df_out_file):
    if not exists(out_path):
        makedirs(out_path)
    dataframe['Files'] = 'submission/'+ dataframe['model_id'] +"/"+ dataframe['file'].str.replace(".jpg", ".tiff")
    unique_models = dataframe.model_id.unique()
    for i in tqdm(range(0, len(unique_models)), desc ="Processing model images: "):
        model = unique_models[i]
        new_path = join(out_path, model)
        if not exists(new_path):
            makedirs(new_path)
        subset = dataframe[dataframe.model_id == model]
        files = list(subset.file)
        for file in files:
            out_file = file.replace(".jpg", ".tiff")
            convert_JPEG_to_TIFF(join(in_path, file), join(new_path, out_file))
    dataframe.to_excel(df_out_file, index=False)

def organise_data_for_BIA_submission(excel_in, excel_out, file_in, file_out):
    dataframe = process_JAX_histology_excel(excel_in)
    generate_TIFF_and_fileList(dataframe, file_in, file_out, excel_out)


In [52]:
organise_data_for_BIA_submission(JAX_Image_file_path, JAX_BIA_fileList_path, input_path, output_path)

In [53]:
JAX_BIA_file['file'].str.replace(".jpg", ".tiff")

Processing model images: 100%|██████████| 433/433 [00:56<00:00,  7.73it/s]


## LIH Gene expression data

In [30]:
path = "/Users/tushar/CancerModels/submission/LIH/NORLUX_PDOX_gene_Expression/"
files = get_files(path)
mol_sample = pd.read_csv("/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/LIH/LIH_molecular_metadata-sample.tsv", sep='\t')

In [34]:
for i in tqdm(range(0, len(files)), desc ="Processing gene expression data: "):
    f = files[i]
    out_data = pd.read_csv("/Users/tushar/CancerModels/pdxfinder-data/template/active_templates/expression/expression_template-sheet.tsv", sep="\t")
    model = pd.read_excel(join(path, f))
    out_data[out_data.columns] = model[out_data.columns]
    out_data["platform_id"] = "expression_NovaSeq_6000"
    out_data.drop(['platform'], axis=1).to_csv( join(path, "LIH_expression_" + f.replace(".xlsx", ".tsv")), sep="\t", index=False)
    #print(model.iloc[0, ["model_id", "sample_id", "sample_origin", "passage", "host_strain_nomenclature"]])

Processing gene expression data: 100%|██████████| 28/28 [04:00<00:00,  8.60s/it]


# PIVOT mol data cleanup

In [7]:
PIVOT = ["CCIA","CHOP", "GCCRI", "LurieChildrens", "MDAnderson-CCH"]

In [8]:
def float_2_int(col):
    return [int(x) if x!= '' else x for x in col.fillna('')]
def convert_amino_acid_position(value):
    if isinstance(value, str):
        # Split the string by non-numeric characters and take the first part
        position_str = value[1:-1]  
        if position_str.__contains__('-'):
            return value[0] + str(position_str) + value[-1]
        return value[0] + str(int(float(position_str))) + value[-1]
    # Return NaN for non-matching or NaN values
    return ''

def drop_dec(path, dt):
    for file in get_files(path):
        fp = join(path, file)
        data = pd.read_csv(fp, sep='\t')
        if dt == 'cna':
          data.strand = data.strand.fillna(1.0).astype(int)
          data.seq_start_position = float_2_int(data.seq_start_position)
          data.seq_end_position = float_2_int(data.seq_end_position)
          data.ncbi_gene_id = float_2_int(data.ncbi_gene_id)
        elif dt == 'expression':
            data.ensembl_gene_id = float_2_int(data.ensembl_gene_id)
        elif dt == 'mut':
            data['amino_acid_change'] = data['amino_acid_change'].apply(convert_amino_acid_position)
        data = data.iloc[:, 1:]
        data.to_csv(fp, sep='\t', index=False)
def drop_decimal(provider):
    dirs = get_dirs(provider)
    dt = ['cna', 'expression', 'mut']
    for d in dt:
        if d in dirs:
            new_path = join(provider, d)
            drop_dec(new_path, d)    

In [9]:
for i in tqdm(range(0, len(PIVOT)), desc ="Processing PIVOT data: "):
    provider = PIVOT[i]
    drop_decimal(join(home, provider))

Processing PIVOT data: 100%|██████████| 5/5 [03:29<00:00, 41.87s/it]


In [10]:
PDMR_path = join(home, "PDMR/expression/")

In [12]:
for f in get_files(PDMR_path):
    data = read_metadata_with_fields(join(PDMR_path, f))
    data.drop("platform", axis=1, inplace=True)
    data.to_csv(join(PDMR_path, f), index=False, sep='\t')

In [11]:
missing_ids = "/Users/tushar/Downloads/model_molecular_metadata_202310301437.csv"
missing_ids = pd.read_csv(missing_ids).sort_values("data_source").reset_index(drop=True)

In [36]:
missing_ids

Unnamed: 0,model_id,data_source,source,sample_id,xenograft_passage,raw_data_url,data_type,platform_name,data_exists,data_restricted,molecular_characterization_id,external_db_links
0,CRL-3044,CRL,xenograft,A4632_(GenomeWideSNP_6),2.0,A4632_(GenomeWideSNP_6).CEL,copy number alteration,Affymetrix Genome-Wide Human SNP Array 6.0,False,True,1726576853006,[]
1,CRL-3039,CRL,xenograft,A6297_(GenomeWideSNP_6),0.0,A6297_(GenomeWideSNP_6).CEL,copy number alteration,Affymetrix Genome-Wide Human SNP Array 6.0,False,True,3281355014163,[]
2,CRL-3040,CRL,xenograft,A8251_(GenomeWideSNP_6),0.0,A8251_(GenomeWideSNP_6).CEL,copy number alteration,Affymetrix Genome-Wide Human SNP Array 6.0,False,True,2972117368850,[]
3,CRL-3036,CRL,xenograft,A9492_(GenomeWideSNP_6),1.0,A9492_(GenomeWideSNP_6).CEL,copy number alteration,Affymetrix Genome-Wide Human SNP Array 6.0,False,True,3083786518542,[]
4,CRL-3039,CRL,xenograft,A8445_(GenomeWideSNP_6),2.0,A8445_(GenomeWideSNP_6).CEL,copy number alteration,Affymetrix Genome-Wide Human SNP Array 6.0,False,True,3118146256916,[]
...,...,...,...,...,...,...,...,...,...,...,...,...
240,834989-109-R,PDMR,xenograft,834989-109-R-G5YVH2,1.0,,mutation,Illumina HiSeq 2000/2500,False,False,3350074490895,
241,395191-088-T,PDMR,xenograft,395191-088-T-H5QU77W68,2.0,,mutation,Illumina HiSeq 2000/2500,False,False,3375844294660,
242,K41856-061-R2,PDMR,xenograft,K41856-061-R2-C61FU2,25.0,,mutation,Illumina HiSeq 2000/2500,False,False,1812476198916,
243,428932-153-R,PDMR,patient,428932-153-R,,ERR4627966,mutation,Illumina HiSeq 2000/2500,False,False,816043786241,"[{""column"": ""raw_data_url"", ""resource"": ""ENA"",..."


In [57]:
for pro in missing_ids.data_source.unique():
    pro_path = join(home, pro)
    print(pro)
    missing_meta = missing_ids[missing_ids['data_source'] == pro]
    sample = read_metadata_with_fields(join(pro_path, pro+"_molecular_metadata-sample.tsv"))
    platform = read_metadata_with_fields(join(pro_path, pro+"_molecular_metadata-platform.tsv"))
    joined = sample.merge(platform[["platform_id", "molecular_characterisation_type", "instrument_model"]], on='platform_id', how='left')
    missing_meta['molecular_characterisation_type'], missing_meta['instrument_model'] = missing_meta['data_type'], missing_meta['platform_name']
    joined = joined.merge(missing_meta[['sample_id', 'molecular_characterisation_type', 'instrument_model', 'data_exists']], on =['sample_id', 'molecular_characterisation_type', 'instrument_model'], how='left', indicator=True)
    joined = joined[joined['_merge'] == 'left_only']    #joined = joined[~joined['sample_id'].isin(missing_meta['sample_id'])]
    joined = joined[sample.columns].reset_index(drop=True)
    joined.to_csv(join(pro_path, pro+"_molecular_metadata-sample.tsv"), sep='\t', index=False)

CRL
Curie-BC
IRCC-CRC
JAX
LIH
PDMR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_meta['molecular_characterisation_type'], missing_meta['instrument_model'] = missing_meta['data_type'], missing_meta['platform_name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_meta['molecular_characterisation_type'], missing_meta['instrument_model'] = missing_meta['data_type'], missing_meta['platform_name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

In [55]:
for pro in missing_ids.data_source.unique():
    pro_path = join(home, pro)
    print(pro)
    missing_meta = missing_ids[missing_ids['data_source'] == pro]
    sample = read_metadata_with_fields(join(pro_path, pro+"_molecular_metadata-sample.tsv"))
    platform = read_metadata_with_fields(join(pro_path, pro+"_molecular_metadata-platform.tsv"))
    joined = sample.merge(platform[["platform_id", "molecular_characterisation_type", "instrument_model"]], on='platform_id', how='left')
    missing_meta['molecular_characterisation_type'], missing_meta['instrument_model'] = missing_meta['data_type'], missing_meta['platform_name']
    joined = joined.merge(missing_meta[['sample_id', 'molecular_characterisation_type', 'instrument_model', 'data_exists']], on =['sample_id', 'molecular_characterisation_type', 'instrument_model'], how='left', indicator=True)
    joined = joined[joined['_merge'] == 'left_only']    #joined = joined[~joined['sample_id'].isin(missing_meta['sample_id'])]
    joined = joined[sample.columns].reset_index(drop=True)
    joined.to_csv(join(pro_path, pro+"_molecular_metadata-sample.tsv"), sep='\t', index=False)

CRL
['mutation_Illumina_WES' 'copy_number_alteration_Affymetrix_SNP6.0'
 'mutation_RNA_sequencing' 'expression_Affymetrix_HGU133plus2'
 'expression_ILLUMINA' 'cytogenetics_Immunohistochemistry']
(1910, 10)
(1910, 12)
(1895, 10)
Curie-BC
['cytogenetics_immunohistochemistry' 'mutation_NGS_ESOPE'
 'mutation_NGS_DRAGON']
(85, 10)
(85, 12)
(56, 10)
IRCC-CRC
['expression_Illumina_HT-12_v4_microarray' 'mutation_TargetedNGS_MUT'
 'mutation_whole_exome_sequencing'
 'copy_number_alteration_whole_exome_sequencing']
(1407, 10)
(1407, 12)
(1404, 10)
JAX
['mutation_CTP' 'mutation_Whole_Exome' 'mutation_Truseq_JAX'
 'mutation_Other:_ddPCR' 'mutation_Actionseq20' 'mutation_Other:_FISH'
 'mutation_TEX_DFCI' 'mutation_CTPTN' 'mutation_Other:_DFCI_RHP20'
 'copy_number_alteration_SNP' 'expression_RNA_Seq' 'expression_stRNA_Seq'
 'expression_hg10st' 'expression_hu133']
(1722, 10)
(1722, 12)
(1605, 10)
LIH
['copy_number_alteration_Agilent_aCGH_2x400k'
 'copy_number_alteration_Agilent_aCGH_4x180k' 'mutation_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_meta['molecular_characterisation_type'], missing_meta['instrument_model'] = missing_meta['data_type'], missing_meta['platform_name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_meta['molecular_characterisation_type'], missing_meta['instrument_model'] = missing_meta['data_type'], missing_meta['platform_name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

In [9]:
normal = pd.read_csv("/Users/tushar/Downloads/TCGA-LUAD.htseq_fpkm.tsv", sep='\t').iloc[:,0:15]

In [14]:
normal[normal['Ensembl_ID'].str.contains("133703")].reset_index(drop=True).to_csv("/Users/tushar/Downloads/Normal-FPKM-COLON.tsv",sep='\t',index=False)

In [None]:
ENSG00000146648.14
ENSG00000171094.14


In [12]:
normal[normal['Ensembl_ID'].isin(["ENSG00000146648.14", "ENSG00000171094.14"])].reset_index(drop=True).to_csv("/Users/tushar/Downloads/Normal-FPKM-LUNG.tsv",sep='\t',index=False)

# Immune markers 
### Addition of immune markers -specifically model genomics in the data as not provided
MSI, TMB, MMR, Ploidy, Mutations per mb
- MSI: immune_msi
- MMR: immune_mmr 
- TMB: immune_tmb
- MPM: immune_mpm
- Ploidy: immune_ploidy

[['PDMR', ['immune_msi', 'immune_HLA_type']],
 ['JAX', ['immune_tmb', 'immune_msi']],
 ['CMP', ['immune_mpm', 'immune_ploidy', 'immune_msi']],
 ['CUIMC', ['immune_tmb']]]
 

[['PDMR', ['immune_msi', 'immune_HLA_type']],
 ['CRL', ['immune_HLA_type']],
 ['JAX', ['immune_tmb', 'immune_msi']],
 ['SJCRH', ['immune_hla_type']],
 ['CMP', ['immune_mpm', 'immune_ploidy', 'immune_msi']],
 ['DFCI-CPDM', ['immune_HLA_type']],
 ['CUIMC', ['immune_tmb']]]


In [93]:
model_genomics_markers = pd.read_json('https://dev.cancermodels.org/api/immunemarker_data_extended?marker_type=eq.Model%20Genomics')

In [95]:
marker_type = list(model_genomics_markers.marker_name.unique())
na_str = "Not provided"
marker_type.append('MMR')
marker_type_platform_dict = {'MSI': 'immune_msi', 'MMR': 'immune_mmr', 
                             'TMB': 'immune_tmb', 'Mutations per mb': 'immune_mpm', 
                             'Ploidy': 'immune_ploidy'}
platform_marker_type_dict = dict(zip(marker_type_platform_dict.values(), marker_type_platform_dict.keys()))
providers = list(model_genomics_markers.data_source.unique())
platform_ids = []

In [134]:
def get_available_marker_for_sample_id(sample_id, df, platform_list):
    temp = df[df['sample_id'] == sample_id]
    pids = temp['platform_id'].unique()
    add_markers = [x for x in platform_list if x not in pids]
    return [[sample_id, x] for x in add_markers]
    
    
def get_ms_row(sample_id, df, platform):
    df2 = df
    df = df.drop_duplicates(subset=['model_id', 'sample_id']).reset_index(drop=True)
    df = df[df['sample_id'] == sample_id].reset_index(drop=True)
    try:
        mid = df['model_id'][0]
    except:
        print(df)
        print(sample_id)
        print(df2[sample_id].unique())
    so = df['sample_origin'][0]
    pa = df['passage'][0]
    hsn = df['host_strain_name'][0]
    hsno = df['host_strain_nomenclature'][0]
    etcs = df['engrafted_tumor_collection_site'][0]
    rdu = df['raw_data_url'][0]
    #print(mid)
    
    # ['Field', 'model_id', 'sample_id', 'sample_origin', 'passage', 'host_strain_name', 'host_strain_nomenclature', 'engrafted_tumor_collection_site', 'raw_data_url', 'platform_id']
    temp = ['', mid, sample_id, so, pa, hsn, hsno, etcs, rdu, platform]
    return temp
    
def get_im_row(sample_id, na_str, marker_dict, platform):
    #['Field', 'sample_id', 'marker_type', 'marker_name', 'marker_value', 'essential_or_additional_details', 'platform_id']
    marker_name = marker_dict[platform]
    temp = ['', sample_id, 'Model Genomics', marker_name, na_str, '', platform]
    return temp
    
def append_to_df(row, df):
    row = pd.DataFrame([row], columns=df.columns)
    return pd.concat([df, row]).reset_index(drop=True)
            
for i in range(len(providers)):
    provider = providers[i]
    print(provider)
    provider_path = join(home, provider)
    mol_sample_sheet = read_metadata_with_fields(join(provider_path, provider+"_molecular_metadata-sample.tsv"))
    mol_platform = read_metadata_with_fields(join(provider_path, provider+"_molecular_metadata-platform.tsv"))
    mol_platform_web = read_metadata_with_fields(join(provider_path, provider+"_molecular_metadata-platform_web.tsv"))
    immunemarker = read_metadata_with_fields(join(provider_path, 'immunemarker', provider+"_immunemarker-Sheet1.tsv"))

    pids_in_sheet = list(immunemarker.platform_id.unique())
    add_markers = [x for x in marker_type_platform_dict.values() if x not in pids_in_sheet]
    print(add_markers)
    if len(add_markers)>0:
        sample_ids = immunemarker['sample_id'].unique()
        for j in tqdm(range(len(sample_ids)), f'Adding immune marker data for {provider}: '):
            sample_id = sample_ids[j]
            to_be_added = get_available_marker_for_sample_id(sample_id, immunemarker, marker_type_platform_dict.values())
            for row in to_be_added:
                immunemarker = append_to_df(get_im_row(row[0], na_str, platform_marker_type_dict, row[1]), immunemarker)            
                mol_sample_sheet = append_to_df(get_ms_row(row[0], mol_sample_sheet, row[1]), mol_sample_sheet)
        for mark in add_markers:
            mol_platform = append_to_df(['', mark, 'immunemarker', na_str, na_str, '', '', ''], mol_platform)
            mol_platform_web = append_to_df(['', mark, '', '', ''], mol_platform_web)
    
    mol_sample_sheet.to_csv(join(provider_path, provider+"_molecular_metadata-sample.tsv"), sep='\t', index=False)
    mol_platform.to_csv(join(provider_path, provider+"_molecular_metadata-platform.tsv"), sep='\t', index=False)
    mol_platform_web.to_csv(join(provider_path, provider+"_molecular_metadata-platform_web.tsv"), sep='\t', index=False)
    immunemarker.to_csv(join(provider_path, 'immunemarker', provider+"_immunemarker-Sheet1.tsv"), sep='\t', index=False)
    
        
    platform_id = mol_sample_sheet['platform_id'].iloc[4:].unique()
    platform_id =  [x for x in platform_id if x.__contains__('immune')]
    platform_ids.append([provider, platform_id])

PDMR
['immune_mmr', 'immune_tmb', 'immune_mpm', 'immune_ploidy']


Adding immune marker data for PDMR: 100%|██████████| 510/510 [00:22<00:00, 23.17it/s]


JAX
['immune_mmr', 'immune_mpm', 'immune_ploidy']


Adding immune marker data for JAX: 100%|██████████| 296/296 [00:06<00:00, 43.74it/s]


CMP
['immune_mmr', 'immune_tmb']


Adding immune marker data for CMP: 100%|██████████| 1174/1174 [00:32<00:00, 36.55it/s]


CUIMC
['immune_msi', 'immune_mmr', 'immune_mpm', 'immune_ploidy']


Adding immune marker data for CUIMC: 100%|██████████| 32/32 [00:00<00:00, 35.11it/s]


In [130]:
mol_platform_web.columns

Index(['Field', 'platform_id', 'molecular_methods_description',
       'analysis_description', 'data_acquisition_description', 'table'],
      dtype='object')

In [132]:
providers

['PDMR', 'JAX', 'CMP', 'CUIMC']

In [None]:
from os import listdir
from os.path import isfile, join
import pandas as pd

CMP_path = '/hps/nobackup/tudor/pdcm/pdxfinder-data/data/UPDOG/CMP/'
CMP_mut_path = '/hps/nobackup/tudor/pdcm/pdxfinder-data/data/UPDOG/CMP/mut'

def get_files(path):
    return [join(path, f) for f in listdir(path) if isfile(join(path, f))]

def read_metadata_with_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    return metadata

def read_metadata_without_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    if 'Field' in metadata.columns:
        metadata = metadata.loc[metadata.Field.str.startswith('#') != True,].reset_index(drop=True)
        metadata = metadata.drop('Field', axis=1)
    return metadata

#mms_header = read_metadata_with_fields(join(CMP_path, 'CMP_molecular_metadata-sample.tsv')).iloc[0:4]
#ps = read_metadata_without_fields(join(CMP_path, 'CMP_molecular_metadata-sample.tsv'))[['sample_id', 'model_id']]
#ps = ps.drop_duplicates().reset_index(drop=True)
#ps = dict(zip(ps['sample_id'], ps['model_id']))
mms = read_metadata_without_fields(join(CMP_path, 'CMP_molecular_metadata-sample.tsv'))[['sample_id', 'model_id']]
mms = mms.drop_duplicates().reset_index(drop=True)
mms_mapper = dict(zip(mms['sample_id'], mms['model_id']))
#replaced_mms = mms[['model_id', 'sample_id']].replace(ps)
#mms['model_id'] = replaced_mms['sample_id']
mut_file = read_metadata_with_fields(join(CMP_mut_path, 'CMP_mut.tsv'))
samples = mms['sample_id'].unique()
for i in tqdm(range(len(samples))):
    s = samples[i]
    temp = mut_file[mut_file['sample_id'] == s].reset_index(drop=True)
    if len(temp) > 0:
        file_name = join(CMP_mut_path, f"CMP_mut_{s}.tsv")
        temp.to_csv(file_name, sep='\t', index=False)

In [20]:
from utils import read_metadata_with_fields, read_metadata_without_fields, get_files
from os.path import join, exists
from os import remove
from pandas import read_csv as read, concat, to_numeric
GCCRI_path = '/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/GCCRI'

In [30]:
pivot_id_path = '~/CancerModels/submission/PIVOT_IDs.csv'
pivot_ids = read(pivot_id_path, encoding='utf-8')
mapper = {'LURIE': 'LurieChildrens', 'MDA': 'MDAnderson-CCH', 'UTHSCSA': 'GCCRI'}
pivot_ids['provider'] = pivot_ids['PIVOT Center (Model Originator)'].replace(mapper)
pivot_providers = list(pivot_ids['provider'].unique())
GCCRI = pivot_ids[pivot_ids['provider'] == 'GCCRI']

In [8]:
pdx = read_metadata_without_fields(join(GCCRI_path, 'GCCRI_metadata-pdx_model.tsv'))
new_model_id = [mid.replace('-', '').replace('NCH', 'NCH-').replace('1NCH', '1-NCH').replace('2NCH', '2-NCH') for mid in pdx['model_id']]
new_model_id = [mid.replace('S13', '-S13_').replace('--', '-') for mid in new_model_id]
mapper = dict(zip(list(pdx['model_id']), new_model_id))
ps_mapper = dict(zip(list(pdx['model_id'] + '_P'), [n + '_P' for n in new_model_id]))
#pdx.replace(mapper).to_csv(join(GCCRI_path, 'GCCRI_metadata-pdx_model.tsv'), sep='\t', index=False)

In [9]:
def fix_ids(path, mapper):
    df = read_metadata_without_fields(path).replace(mapper)
    header = read_metadata_with_fields(path).iloc[0:4]
    return concat([header, df]).reset_index(drop=True) 

In [10]:
fix_ids(join(GCCRI_path, 'GCCRI_metadata-pdx_model.tsv'), mapper).to_csv(join(GCCRI_path, 'GCCRI_metadata-pdx_model.tsv'), sep='\t', index=False)

In [67]:
fix_ids(join(GCCRI_path, 'GCCRI_metadata-sharing.tsv'), mapper).to_csv(join(GCCRI_path, 'GCCRI_metadata-sharing.tsv'), sep='\t', index=False)

In [71]:
fix_ids(join(GCCRI_path, 'GCCRI_metadata-model_validation.tsv'), mapper).to_csv(join(GCCRI_path, 'GCCRI_metadata-model_validation.tsv'), sep='\t', index=False)

Unnamed: 0,Field,model_id,validation_technique,description,passages_tested,validation_host_strain_nomenclature
0,#Description,Unique identifier for all the PDXs derived fro...,Any technique used to validate PDX against the...,Short description of what was compared and wha...,Provide a list of all passages where validatio...,"Validation host mouse strain, following mouse ..."
1,#Example,CRC0228PR,fingerprinting,high concordance between xenograft and patien...,12,NOD.Cg-Prkdcscid Il2rgtm1Wjl/SzJ
2,#Format Requirements,free text,free text,free text,list of numbers separted by commas,full host strain name or Not provided
3,#Essential?,essential,essential,essential,essential,essential
4,,ASPSKY,Fingerprinting,Not provided,Not provided,C.B-Igh-1b/IcrTac-Prkdc<sup>scid</sup>
5,,BT27,Fingerprinting,Not provided,Not provided,C.B-Igh-1b/IcrTac-Prkdc<sup>scid</sup>
6,,BT29,Fingerprinting,Not provided,Not provided,C.B-Igh-1b/IcrTac-Prkdc<sup>scid</sup>
7,,ES1,Fingerprinting,Not provided,Not provided,C.B-Igh-1b/IcrTac-Prkdc<sup>scid</sup>
8,,ES4,Fingerprinting,Not provided,Not provided,C.B-Igh-1b/IcrTac-Prkdc<sup>scid</sup>
9,,ES6,Fingerprinting,Not provided,Not provided,C.B-Igh-1b/IcrTac-Prkdc<sup>scid</sup>


In [82]:
ps_mapper = dict(zip(list(pdx['model_id'] + '_P'), [n + '_P' for n in new_model_id]))
out = fix_ids(join(GCCRI_path, 'GCCRI_metadata-patient_sample.tsv'), mapper)
out.replace(ps_mapper).to_csv(join(GCCRI_path, 'GCCRI_metadata-patient_sample.tsv'), sep='\t', index=False)

In [83]:
out = fix_ids(join(GCCRI_path, 'GCCRI_molecular_metadata-sample.tsv'), mapper)
out.replace(ps_mapper).to_csv(join(GCCRI_path, 'GCCRI_molecular_metadata-sample.tsv'), sep='\t', index=False)

In [87]:
mut = read(join(GCCRI_path, 'mut/GCCRI_mut.tsv'), sep='\t').replace(ps_mapper)
mut.to_csv(join(GCCRI_path, 'mut/GCCRI_mut.tsv'), sep='\t', index=False)

In [85]:
mut

Unnamed: 0,sample_id,symbol,biotype,coding_sequence_change,variant_class,codon_change,amino_acid_change,consequence,functional_prediction,read_depth,...,seq_start_position,ref_allele,alt_allele,ucsc_gene_id,ncbi_gene_id,ncbi_transcript_id,ensembl_gene_id,ensembl_transcript_id,variation_id,platform_id
0,ASPS-KY_P,ACAD8,protein_coding,557A>G,SNV,aAt/aGt,N186S,missense_variant,,,...,134259074,A,G,,,,ENSG00000151498,ENST00000281182,rs200170162,mutation_WES
1,ASPS-KY_P,ACHE,protein_coding,827G>A,SNV,cGt/cAt,R276H,missense_variant,,,...,100893406,C,T,,,,ENSG00000087085,ENST00000241069,rs533264163,mutation_WES
2,ASPS-KY_P,ACOX3,protein_coding,1306C>T,SNV,Cgg/Tgg,R436W,missense_variant,,,...,8389729,G,A,,,,ENSG00000087008,ENST00000356406,rs146052311&COSV62712425,mutation_WES
3,ASPS-KY_P,ACTN1,protein_coding,1822C>T,SNV,Cgg/Tgg,R608W,missense_variant,,,...,68882589,G,A,,,,ENSG00000072110,ENST00000394419,rs1468329163&COSV99518838,mutation_WES
4,ASPS-KY_P,ADARB2,protein_coding,283G>T,SNV,Ggc/Tgc,G95C,missense_variant,,,...,1363822,C,A,,,,ENSG00000185736,ENST00000381312,rs748408152&COSV105323616&COSV67231399,mutation_WES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7002,TC-71_P,LRRC55,protein_coding,407C>T,SNV,gCc/gTc,A136V,missense_variant,,,...,57182429,C,T,,,,ENSG00000183908,ENST00000497933,COSV73006284,mutation_WES
7003,TC-71_P,ZNF728,protein_coding,143C>T,SNV,cCt/cTt,P48L,missense_variant,,,...,22987391,G,A,,,,ENSG00000269067,ENST00000594710,rs1157096293,mutation_WES
7004,TC-71_P,IL27RA,protein_coding,,SNV,,,downstream_gene_variant,,,...,14055064,G,A,,,,ENSG00000104998,ENST00000263379,rs201934041&COSV54602528,mutation_WES
7005,TC-71_P,FLG2,protein_coding,5830G>A,SNV,Ggg/Agg,G1944R,missense_variant,,,...,152351956,C,T,,,,ENSG00000143520,ENST00000388718,rs200946758,mutation_WES


In [89]:
treatment = read(join(GCCRI_path, 'treatment/GCCRI_patienttreatment-Sheet1.tsv'), sep='\t').replace(mapper)
treatment.to_csv(join(GCCRI_path, 'treatment/GCCRI_patienttreatment-Sheet1.tsv'), sep='\t', index=False)

In [102]:
exists(join(GCCRI_path, 'mut'))

True

In [27]:
def convert_to_int_or_blank(value):
    try:
        return int(float(value))
    except ValueError:
        return ''
    
exp = get_files(join(GCCRI_path, 'expression'))
exp = [f for f in exp if f.endswith('.tsv')]
for f in exp:
    temp = read(join(GCCRI_path, 'expression', f), sep='\t')
    new_f = mapper[f.split('_')[2].replace('.tsv', '')]
    new_f = 'GCCRI_expression_' + new_f + '.tsv'
    new_f = join(GCCRI_path, 'expression', new_f)
    temp['ensembl_gene_id'] = [convert_to_int_or_blank(p)  for p in temp['ensembl_gene_id']]
    temp.replace(ps_mapper).to_csv(new_f, sep='\t', index=False)
    remove(join(GCCRI_path, 'expression', f))

In [33]:
cna = get_files(join(GCCRI_path, 'cna'))
cna = [f for f in cna if f.endswith('.tsv')]
for f in cna:
    temp = read(join(GCCRI_path, 'cna', f), sep='\t')
    new_f = mapper[f.split('_')[2].replace('.tsv', '')]
    new_f = 'GCCRI_cna_' + new_f + '.tsv'
    new_f = join(GCCRI_path, 'cna', new_f)
    #temp['ensembl_gene_id'] = [convert_to_int_or_blank(p)  for p in temp['ensembl_gene_id']]
    temp.replace(ps_mapper).to_csv(new_f, sep='\t', index=False)
    remove(join(GCCRI_path, 'cna', f))

In [31]:
f

'GCCRI_cna_Rh-30R.tsv'