In [38]:
from os import listdir, getcwd, rename, makedirs
from os.path import isfile, join, isdir, exists

import numpy as np
import pandas as pd
from tqdm import tqdm
import subprocess as sp

### Common functions

In [39]:
def get_dirs(path):
    return [f for f in listdir(path) if isdir(join(path, f))]

def get_files(path):
    return [join(path, f) for f in listdir(path) if isfile(join(path, f)) and f.endswith(".tsv")]

def read_metadata_without_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    if 'Field' in metadata.columns:
        metadata = metadata.loc[metadata.Field.str.startswith('#') != True,].reset_index(drop=True)
        metadata = metadata.drop('Field', axis=1)
    return metadata

def read_metadata_with_fields(path):
    metadata = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    return metadata


def sort_case_insensitive(sort_list):
    return sorted(sort_list, key=str.casefold)


### Common env

In [40]:
start_dir = getcwd()
home = "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/"
out_path = "/Users/tushar/CancerModels/utils/cbioportal/pdcm-cbioportal/study/"
providers = sorted(get_dirs(home))

## Generate data for cBioPortal

### Clinical files

In [41]:
def add_headers_clinical_patient():
    # Column headers - The attribute Display Names: The display name for each clinical attribute
    column_headers = ["#Patient Identifier", "Sex", "Diagnosis Age", "Overall Survival (Months)", "Overall Survival Status"]
    # Column descriptions - The attribute Descriptions: Long(er) description of each clinical attribute
    column_description = ["#Identifier to uniquely specify a patient.", "Sex", "Age at which a condition or disease was first diagnosed.", "Overall survival in months since initial diagonosis.", "Overall patient survival status."]
    # Column data type - The attribute Datatype: The datatype of each clinical attribute (must be one of: STRING, NUMBER, BOOLEAN)
    column_data_type = ["#STRING", "STRING", "NUMBER", "NUMBER", "STRING"]
    # Column priority - A number which indicates the importance of each attribute. A higher number indicates a higher priority.
    column_priority = ["#1", "1", "1", "1", "1"]
    # Column headers for validation
    column_header_validation = ["PATIENT_ID", "SEX", "AGE", "OS_MONTHS", "OS_STATUS"]
    return [column_headers, column_description, column_data_type, column_priority, column_header_validation]

def add_headers_clinical_sample():
    # Column headers - The attribute Display Names: The display name for each clinical attribute
    column_headers = ["#Patient Identifier", "Sample Identifier", "Tumor Type", "Cancer Type", "Cancer Type Detailed", "Primary site", "Tumor Grade", "Model type", "Model ID"]
    # Column descriptions - The attribute Descriptions: Long(er) description of each clinical attribute
    column_description = ["#Identifier to uniquely specify a patient.", "A unique sample identifier.", "The type of tumour sample (i.e., normal, primary, met, recurrence).", "Cancer Type", "Cancer Type Detailed", "Site of the primary tumor where primary cancer is originating from (may not correspond to the site of the current tissue sample). No abbreviations.", "The implanded tumour grade value.", "Type of patient derived cancer model", "Unique identifier for the PDCMs"]
    # Column data type - The attribute Datatype: The datatype of each clinical attribute (must be one of: STRING, NUMBER, BOOLEAN)
    column_data_type = ["#STRING", "STRING", "STRING", "STRING", "STRING", "STRING", "STRING", "STRING", "STRING"]
    # Column priority - A number which indicates the importance of each attribute. A higher number indicates a higher priority.
    column_priority = ["#1", "1", "1", "1", "1", "1", "1", "1", "1"]
    # Column headers for validation
    column_header_validation = ["PATIENT_ID", "SAMPLE_ID", "TUMOR_TYPE", "CANCER_TYPE", "CANCER_TYPE_DETAILED", "PRIMARY_SITE", "TUMOR_GRADE", "MODEL_TYPE", "MODEL_ID"]
    return [column_headers, column_description, column_data_type, column_priority, column_header_validation]

def generate_clinical_df(headers):
    # Generate the df
    c_bio_data_clinical = pd.DataFrame(columns = headers[0])
    c_bio_data_clinical.loc[0, :] = headers[1]
    c_bio_data_clinical.loc[1, :] = headers[2]
    c_bio_data_clinical.loc[2, :] = headers[3]
    c_bio_data_clinical.loc[3, :] = headers[4]
    return c_bio_data_clinical

def write_clinical_file(out_df, out_path, file_type):
    out_file = join(out_path, "data_clinical_"+file_type+".txt")
    out_df.to_csv(out_file, sep="\t", index=False)

def clinical_patient_data_transformation(data_path, provider):
    out_df = generate_clinical_df(add_headers_clinical_patient())
    data_path = join(data_path, provider+"_metadata-patient.tsv")
    
    mapper = {"patient_id": "#Patient Identifier", "sex": "Sex", "age_at_initial_diagnosis": "Diagnosis Age"}
    temp = read_metadata_without_fields(data_path).rename(columns = mapper)
    
    temp["Overall Survival (Months)"] = ""
    temp["Overall Survival Status"] = ""
    
    out_df = pd.concat([out_df, temp[out_df.columns]], ignore_index=True)
    patient_ids = list(pd.read_csv(join(out_path, "data_clinical_sample.txt"), sep="\t")["#Patient Identifier"])
    out_df = out_df[out_df["#Patient Identifier"].isin(patient_ids)]
    return out_df.replace("(?i)Not collected", "", regex=True).replace("(?i)Not provided", "", regex=True)

def clinical_patient_sample_data_transformation(data_path, provider):
    out_df = generate_clinical_df(add_headers_clinical_sample())
    data_path = join(data_path, provider+"_metadata-patient_sample.tsv")
    
    mapper = {"patient_id":"#Patient Identifier", "sample_id":"Sample Identifier", "tumour_type":"Tumor Type OG", 
              "primary_site": "Primary site OG", "grade":"Tumor Grade"}
    temp = read_metadata_without_fields(data_path).rename(columns = mapper)
    temp = temp.merge(get_meta_from_api(provider), on="model_id", how="inner").reset_index(drop=True)
    temp["Cancer Type"] = temp["cancer_system"]
    temp["Cancer Type Detailed"] = temp["histology"]
    temp["Model type"] = temp["type"]
    temp["Tumor Type"] = temp["tumor_type"]
    temp["Primary site"] = temp["primary_site"]
    temp["Model ID"] = temp["model_id"]
    out_df = pd.concat([out_df, temp[out_df.columns]], ignore_index=True)
    return out_df.replace("(?i)not collected", "", regex=True).replace("(?i)not provided", "", regex=True).drop_duplicates(subset=["#Patient Identifier", "Sample Identifier"])

def get_meta_from_api(provider_name):
    df = pd.read_json("https://www.cancermodels.org/api/model_metadata?data_source=eq."+provider_name)
    return df[["model_id", "cancer_system", "histology", "type", "tumor_type", "primary_site"]]

def generate_clinical_patient(in_path, out_path, provider):
    write_clinical_file(clinical_patient_data_transformation(in_path, provider), out_path, "patient")
    
def generate_clinical_sample(in_path, out_path, provider):
    write_clinical_file(clinical_patient_sample_data_transformation(in_path, provider), out_path, "sample")

### Molecular data

In [42]:
def get_platform(path, type):
    df = read_metadata_without_fields(path).fillna("")
    df = df[df["molecular_characterisation_type"]==type]
    df["platform_name"] = df["library_strategy"]+ " -  " +df["instrument_model"]
    if path.__contains__("JAX"):
        if type=="mutation":
            return "WES"
        if type=="expression":
            return "RNA-Seq"
        if type=="copy number alteration":
            return "SNP"
    return ", ".join(df["platform_name"])

def read_mol_data(path):
    files = get_files(path)
    if len(files) == 0:
        dirs = get_dirs(path)
        for dir in dirs:
            files.append(get_files(join(path,dir)))
        files = [item for sublist in files for item in sublist]
    df = pd.DataFrame()
    for file in files:
        temp = pd.read_csv(file, sep="\t").fillna("")
        df = pd.concat([df, temp], ignore_index=True)
    return df

def compute_end_pos(df):
    df["End_Position"] = pd.to_numeric(df["Start_Position"]) + np.where(df.Reference_Allele.str.len()>=df.Tumor_Seq_Allele2.str.len(), df.Reference_Allele.str.len(), df.Tumor_Seq_Allele2.str.len())
    return df

def handle_frameshift(row):
    if not row["Consequence"].__contains__("frameshift"):
        return row
    variant = row["variant_class"]
    if variant.__contains__("frameshift"):
        if row["coding_sequence_change"].__contains__("ins"):
            variant = "insertion"
        elif row["coding_sequence_change"].__contains__("del"):
            variant = "deletion"
    row["Consequence"] = "frameshift_variation_" + variant
    return row

def generate_variation_classification(df):
    mapper = {"frameshift_variant_deletion":"Frame_Shift_Del", 
              "frameshift_variant_insertion":"Frame_Shift_Ins", 
              "inframe_deletion":"In_Frame_Del", 
              "inframe_insertion": "In_Frame_Ins", 
              "missense_variant": "Missense_Mutation", 
              "stop_gained": "Nonsense_Mutation", 
              "5_prime_UTR_variant": "5'UTR", 
              "upstream_gene_variant": "5'Flank", 
              "downstream_gene_variant": "3'Flank", 
              "3_prime_UTR_variant": "3'UTR", 
              "non_coding_transcript_variant": "RNA", 
              "intron_variant": "Intron", 
              "splice_region_variant": "Splice_Region", 
              "synonymous_variant": "Silent", 
              "stop_lost": "Nonstop_Mutation",
              "start_retained_variant": "Translation_Start_Site",
              "intergenic_variant": "IGR"}
    # Targeted_Region, De_novo_Start_InFrame, De_novo_Start_OutOfFrame, Splice_Site (dup) }
    #df["temp_cons"] = np.where(df["Consequence"].str.contains("frameshift"), df["Consequence"], df["Consequence"])
    df = df.apply(lambda x: handle_frameshift(x), axis=1)
    df["Variant_Classification"] = df["Consequence"].apply(lambda x: mapper.get(x, "Unknown"))
        #np.where(df["Consequence"].to_list() in mapper.keys(), df["Consequence"], "Unknown"))
    df["Variant_Classification"] = df["Variant_Classification"].replace(mapper)
    return df

def generate_meta_mutation(study, out_path, platform):
    meta_df = pd.DataFrame(columns=[0,1])
    meta_df.loc[0,:] = ["cancer_study_identifier:", study]
    meta_df.loc[1,:] = ["genetic_alteration_type:", "MUTATION_EXTENDED"]
    meta_df.loc[2,:] = ["datatype:", "MAF"]
    meta_df.loc[3,:] = ["stable_id:", "mutations"]
    meta_df.loc[4,:] = ["show_profile_in_analysis_tab:", "true"]
    meta_df.loc[5,:] = ["profile_description:", "Mutation data from "+platform]
    meta_df.loc[6,:] = ["profile_name:", "Mutations"]
    meta_df.loc[7,:] = ["data_filename:", "data_mutation_extended.txt"]
    meta_df.to_csv(join(out_path, "meta_mutation_extended.txt"), sep="\t", index=False,header=False)

def generate_mutation_data(path, out_path):
    mapper={"sample_id": "Tumor_Sample_Barcode", "seq_start_position": "Start_Position", "chromosome":"Chromosome",
            "ref_allele":"Reference_Allele", "alt_allele": "Tumor_Seq_Allele2", "symbol":"Hugo_Symbol",
            "consequence":"Consequence", "amino_acid_change":"HGVSp_Short", 
            "ensembl_gene_id": "Gene","ensembl_transcript_id":"Transcript_ID", "codon_change": "Codons"}
    mut_df = read_mol_data(path).rename(columns = mapper)
    out_column = ["Tumor_Sample_Barcode","Hugo_Symbol", "HGVSp_Short", "Consequence", "Variant_Classification", "Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Gene", "Transcript_ID","NCBI_Build", "Codons", "Variant_Type"]
    mut_df = compute_end_pos(mut_df)
    mut_df["NCBI_Build"] = "GRCh38"
    mut_df["HGVSp_Short"] = mut_df["HGVSp_Short"].str.replace('.0', '')
    mut_df = generate_variation_classification(mut_df)
    sample_ids = list(pd.read_csv(join(out_path, "data_clinical_sample.txt"), sep="\t")["Sample Identifier"])
    mut_df = mut_df[mut_df["Tumor_Sample_Barcode"].isin(sample_ids)]
    mut_df["Variant_Type"] = mut_df["variant_class"].replace("SNV", "SNP").replace("insertion", "INS").replace("deletion", "DEL")
    mut_df[out_column].to_csv(join(out_path, "data_mutation_extended.txt"), index=False, sep="\t")
    #sp.call("docker run -v "+out_path+":/wd genomenexus/gn-annotation-pipeline:master --filename /wd/data_mutation_extended_raw.txt  --output-filename /wd/data_mutation_extended.txt")

def generate_meta_cna(study, out_path, platform):
    meta_df = pd.DataFrame(columns=[0,1])
    meta_df.loc[0,:] = ["cancer_study_identifier:", study]
    meta_df.loc[1,:] = ["genetic_alteration_type:", "COPY_NUMBER_ALTERATION"]
    meta_df.loc[2,:] = ["datatype:", "LOG2-VALUE"]
    meta_df.loc[3,:] = ["stable_id:", "log2CNA"]        
    meta_df.loc[4,:] = ["show_profile_in_analysis_tab:", "false"]
    meta_df.loc[5,:] = ["profile_description:", "Log2 copy-number data from "+platform]
    meta_df.loc[6,:] = ["profile_name:", "Log2 copy-number values"]
    meta_df.loc[7,:] = ["data_filename:", "data_log2_cna.txt"]
    meta_df.to_csv(join(out_path, "meta_log2_cna.txt"), sep="\t", index=False,header=False)

def generate_cna_data(path, out_path):
    df = read_mol_data(path)[["sample_id", "symbol", "log2r_cna"]]
    sample_ids = list(pd.read_csv(join(out_path, "data_clinical_sample.txt"), sep="\t")["Sample Identifier"])
    df = df[df["sample_id"].isin(sample_ids)]
    df['Hugo_Symbol'] = df['symbol']
    df = df.pivot_table(index='Hugo_Symbol', columns='sample_id', values="log2r_cna", aggfunc='first')
    df.to_csv(join(out_path, "data_log2_cna.txt"), sep="\t")

def generate_meta_expression(study, out_path, platform, datatype):
    meta_df = pd.DataFrame(columns=[0,1])
    meta_df.loc[0,:] = ["cancer_study_identifier:", study]
    meta_df.loc[1,:] = ["genetic_alteration_type:", "MRNA_EXPRESSION"]
    if datatype=="mrna":
        meta_df.loc[2,:] = ["datatype:", "CONTINUOUS"]
        meta_df.loc[3,:] = ["stable_id:", "rna_seq_mrna"]
    elif datatype=="Zscore":
        meta_df.loc[2,:] = ["datatype:", "Z-SCORE"]
        meta_df.loc[3,:] = ["stable_id:", "mrna_median_Zscores"]        
    meta_df.loc[4,:] = ["show_profile_in_analysis_tab:", "true"]
    meta_df.loc[5,:] = ["profile_description:", "Expression data from "+platform]
    meta_df.loc[6,:] = ["profile_name:", "Expression"]
    meta_df.loc[7,:] = ["data_filename:", "data_expression.txt"]
    meta_df.to_csv(join(out_path, "meta_expression.txt"), sep="\t", index=False,header=False)

def generate_expression_data(path, out_path, platform, study):
    df = read_mol_data(path)[["sample_id", "symbol", "rnaseq_fpkm", "z_score"]]
    sample_ids = list(pd.read_csv(join(out_path, "data_clinical_sample.txt"), sep="\t")["Sample Identifier"])
    df = df[df["sample_id"].isin(sample_ids)]
    df["Hugo_Symbol"] = df["symbol"]
    no_fpkm = df['rnaseq_fpkm'].isna().all()
    datatype="mrna"
    value_column = "rnaseq_fpkm"
    if no_fpkm:
        print("Using Z score: "+path)
        datatype = "Zscore"
        value_column = "z_score"
    df = df[["sample_id", "Hugo_Symbol", value_column]]
    df = df.pivot_table(index='Hugo_Symbol', columns='sample_id', values=value_column, aggfunc='first')
    generate_meta_expression(study, out_path, platform, datatype)
    df.to_csv(join(out_path, "data_expression.txt"), sep="\t")
    
def generate_cna_files(in_path, study, provider, out_path):
    platform = get_platform(join(in_path, provider+"_molecular_metadata-platform.tsv"), "mutation")
    generate_meta_cna(study, out_path, platform)
    generate_cna_data(join(in_path, "cna"), out_path)
def generate_mutation_files(in_path, study, provider, out_path):
    platform = get_platform(join(in_path, provider+"_molecular_metadata-platform.tsv"), "mutation")
    generate_meta_mutation(study, out_path, platform)
    generate_mutation_data(join(in_path, "mut"), out_path)
def generate_expression_files(in_path, study, provider, out_path):
    platform = get_platform(join(in_path, provider+"_molecular_metadata-platform.tsv"), "expression")
    generate_expression_data(join(in_path, "expression"), out_path, platform, study)

### Timeline data

In [None]:
def generate_timeline_data(in_path, out_path, study):
    return None

### Meta files

In [43]:
def get_provider_description(provider_name):
    if provider_name=="DFCI-CPDM":
        return "The Center for Patient Derived Models (CPDM) at Dana-Farber  Cancer Institute (DFCI) is a strategic collaborative research center with the expertise  to generate and characterize patient derived xenografts (PDX), patient derived cell  lines (PDCL - 3D organoid/spheroid and 2D adherent cultures), and acute cell models  drug testing. Through collaboration with major disease groups in the Dana-Farber Cancer Institute, Brigham and Women's Hospital, and Boston Children Hospital Cancer Centers, we have made a large collection of patient derived models of brain tumors, hematologic tumors, and many other solid tumors available to academic and industrial researchers worldwide."
    df = pd.read_json("https://www.cancermodels.org/api/provider_group?abbreviation=eq."+provider_name)    
    description = str(df["description"][0]).replace("\n\n", " ").replace("\n", " ")
    if len(description)>=1024:
        description = description[0:1020]
    return description

def generate_meta_study_file(study, provider, out_path):
    meta_study_df = pd.DataFrame(columns=[0,1])
    meta_study_df.loc[0,:] = ["type_of_cancer:", "mixed"]
    meta_study_df.loc[1,:] = ["cancer_study_identifier:",  study]
    meta_study_df.loc[2,:] = ["name:", provider + " - Patient derived cancer models"]
    meta_study_df.loc[3,:] = ["description:", get_provider_description(provider)]
    meta_study_df.loc[4,:] = ["groups:", "PUBLIC"]
    #meta_study_df.loc[5,:] = ["short_name:", "PDCMs ("+provider+")"]
    meta_study_df.loc[5,:] = ["reference_genome:", "hg38"]
    meta_study_df.loc[6,:] = ["add_global_case_list:", "true"]
    meta_study_df.to_csv(join(out_path, "meta_study.txt"), sep="\t", index=False,header=False)
    
def generate_meta_clinical_files(study, out_path, type):
    meta_df = pd.DataFrame(columns=[0,1])
    meta_df.loc[0,:] = ["cancer_study_identifier:", study]
    meta_df.loc[1,:] = ["genetic_alteration_type:", "CLINICAL"]
    meta_df.loc[2,:] = ["datatype:", type.upper()+"_ATTRIBUTES"]
    meta_df.loc[3,:] = ["data_filename:", "data_clinical_"+type+".txt"]
    meta_df.to_csv(join(out_path, "meta_clinical_"+type+".txt"), sep="\t", index=False,header=False)

def generate_meta_files(out_path, provider, study):
    generate_meta_study_file(study, provider, out_path)
    generate_meta_clinical_files(study, out_path, "sample")
    generate_meta_clinical_files(study, out_path, "patient")

### main()

In [44]:
def generate_c_bio_portal_files(in_path, out_path, provider):
    study = "cm_"+provider.lower()+"_2023"
    out_path = join(out_path, study)
    if not exists(out_path):
        makedirs(out_path)
    generate_meta_files(out_path, provider, study)
    generate_clinical_patient(in_path, out_path, provider)
    generate_clinical_sample(in_path, out_path, provider)
    if exists(join(in_path, "mut")):
        generate_mutation_files(in_path, study, provider, out_path)
    if exists(join(in_path, "expression")):
        generate_expression_files(in_path, study, provider, out_path)
    if exists(join(in_path, "cna")):
        generate_cna_files(in_path, study, provider, out_path)

### Per provider

In [45]:
for i in tqdm(range(0, len(providers)), desc ="Generating cBioPortal data: "): ## get_dirs will get the provider dirs in updog
    provider = providers[i]
    if provider!="CRL" and not provider.__contains__("Curie"):
        generate_c_bio_portal_files(join(home, provider), out_path, provider) 

  temp = pd.read_csv(file, sep="\t").fillna("")
Generating cBioPortal data:  38%|███▊      | 13/34 [1:40:41<1:32:34, 264.50s/it] 

Using Z score: /Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/IRCC-CRC/expression


Generating cBioPortal data:  41%|████      | 14/34 [1:40:43<1:07:01, 201.07s/it]

Using Z score: /Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/JAX/expression


Generating cBioPortal data:  44%|████▍     | 15/34 [1:41:30<51:03, 161.24s/it]  

Using Z score: /Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/LIH/expression


  temp = pd.read_csv(file, sep="\t").fillna("")
  temp = pd.read_csv(file, sep="\t").fillna("")
  temp = pd.read_csv(file, sep="\t").fillna("")
  temp = pd.read_csv(file, sep="\t").fillna("")
Generating cBioPortal data:  85%|████████▌ | 29/34 [1:45:20<00:42,  8.58s/it]

Using Z score: /Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/UOM-BC/expression


Generating cBioPortal data: 100%|██████████| 34/34 [1:45:27<00:00, 186.12s/it]


In [46]:
'''
for i in range(0, len(providers)): ## get_dirs will get the provider dirs in updog
    provider = providers[i]
    print(provider)
    path = join(home, provider, provider+"_molecular_metadata-platform.tsv")
    if exists(path):
        type = "expression"
        print("Expression:")
        print(get_platform(path, type))
        type = "mutation"
        print("Mutation:")
        print(get_platform(path, type))
        type = "copy number alteration"
        print("copy number alteration:")
        print(get_platform(path, type))
'''

'\nfor i in range(0, len(providers)): ## get_dirs will get the provider dirs in updog\n    provider = providers[i]\n    print(provider)\n    path = join(home, provider, provider+"_molecular_metadata-platform.tsv")\n    if exists(path):\n        type = "expression"\n        print("Expression:")\n        print(get_platform(path, type))\n        type = "mutation"\n        print("Mutation:")\n        print(get_platform(path, type))\n        type = "copy number alteration"\n        print("copy number alteration:")\n        print(get_platform(path, type))\n'

In [47]:
provider

'Wistar-MDAnderson-Penn'