In [20]:
import logging
from os import environ, getcwd, listdir, makedirs, rename
from os.path import exists, isdir, isfile, join

import pandas as pd
from PIL import Image
from tqdm import tqdm

### Common functions

In [21]:
def get_dirs(path):
    """Return the names of the entries of `path` that are directories.

    Only the entry names are returned (they are not joined with `path`), in
    the order produced by `listdir`.
    """
    dir_names = []
    for entry in listdir(path):
        if isdir(join(path, entry)):
            dir_names.append(entry)
    return dir_names

def get_files(path):
    """Return the names of the entries of `path` that are regular files.

    Only the entry names are returned (they are not joined with `path`), in
    the order produced by `listdir`.
    """
    file_names = []
    for entry in listdir(path):
        if isfile(join(path, entry)):
            file_names.append(entry)
    return file_names

def read_metadata_without_fields(path):
    """Load a tab-separated metadata sheet and strip its `Field` annotations.

    When the sheet has a `Field` column, rows whose `Field` value starts with
    '#' are dropped, the index is renumbered from 0 and the `Field` column is
    removed. A sheet without a `Field` column is returned exactly as read.
    """
    sheet = pd.read_csv(path, sep='\t', na_values="", low_memory=False)
    if 'Field' not in sheet.columns:
        return sheet
    # Rows with a missing `Field` value are kept: only an explicit '#' prefix
    # marks a row for removal.
    is_annotation = sheet['Field'].str.startswith('#', na=False)
    data_rows = sheet.loc[~is_annotation].reset_index(drop=True)
    return data_rows.drop('Field', axis=1)

def read_metadata_with_fields(path):
    """Load a tab-separated metadata sheet unchanged.

    No rows or columns are filtered, so a `Field` column and any rows it
    annotates are kept in the returned DataFrame.
    """
    return pd.read_csv(path, sep='\t', na_values="", low_memory=False)


def sort_case_insensitive(sort_list):
    """Return a new list with the strings of `sort_list` ordered ignoring case.

    Ordering uses `str.casefold` as the sort key; the input is not modified.
    """
    ordered = list(sort_list)
    ordered.sort(key=str.casefold)
    return ordered


### Common env

In [55]:
# Working directory at the time this cell runs.
start_dir = getcwd()
# Input and output roots. The defaults are the original workstation paths; set
# PDCM_UPDOG_DIR / PDCM_CBIOPORTAL_STUDY_DIR in the environment to run the
# notebook on another machine without editing this cell.
home = environ.get("PDCM_UPDOG_DIR", "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/")
out_path = environ.get("PDCM_CBIOPORTAL_STUDY_DIR", "/Users/tushar/CancerModels/utils/cbioportal/cbioportal-docker-compose/study/")
# Names of the sub-directories of `home`, sorted.
providers = sorted(get_dirs(home))

## Generate data for cBioPortal

In [68]:
def add_headers_clinical_patient():
    """Return the five header rows of the patient clinical data file.

    The result is a list of five parallel lists with one entry per attribute,
    in this order: display names, descriptions, data types, priorities and
    the upper-case attribute identifiers. The first entry of each of the
    first four lists carries a leading '#'.
    """
    # Column headers - The attribute Display Names: The display name for each clinical attribute
    column_headers = ["#Patient Identifier", "Sex", "Diagnosis Age", "Overall Survival (Months)", "Overall Survival Status"]
    # Column descriptions - The attribute Descriptions: Long(er) description of each clinical attribute
    column_description = ["#Identifier to uniquely specify a patient.", "Sex", "Age at which a condition or disease was first diagnosed.", "Overall survival in months since initial diagnosis.", "Overall patient survival status."]
    # Column data type - The attribute Datatype: The datatype of each clinical attribute (must be one of: STRING, NUMBER, BOOLEAN)
    column_data_type = ["#STRING", "STRING", "NUMBER", "NUMBER", "STRING"]
    # Column priority - A number which indicates the importance of each attribute. A higher number indicates a higher priority.
    column_priority = ["#1", "1", "1", "1", "1"]
    # Column headers for validation
    column_header_validation = ["PATIENT_ID", "SEX", "AGE", "OS_MONTHS", "OS_STATUS"]
    return [column_headers, column_description, column_data_type, column_priority, column_header_validation]

def add_headers_clinical_sample():
    """Return the five header rows of the sample clinical data file.

    The result is a list of five parallel lists with one entry per attribute,
    in this order: display names, descriptions, data types, priorities and
    the upper-case attribute identifiers. The first entry of each of the
    first four lists carries a leading '#'.
    """
    # Column headers - The attribute Display Names: The display name for each clinical attribute
    column_headers = ["#Patient Identifier", "Sample Identifier", "Tumor Type", "Cancer Type", "Cancer Type Detailed", "Primary site", "Tumor Grade"]
    # Column descriptions - The attribute Descriptions: Long(er) description of each clinical attribute
    column_description = ["#Identifier to uniquely specify a patient.", "A unique sample identifier.", "The type of tumour sample (i.e., normal, primary, met, recurrence).", "Cancer Type", "Cancer Type Detailed", "Site of the primary tumor where primary cancer is originating from (may not correspond to the site of the current tissue sample). No abbreviations.", "The implanted tumour grade value."]
    # Column data type - The attribute Datatype: The datatype of each clinical attribute (must be one of: STRING, NUMBER, BOOLEAN)
    column_data_type = ["#STRING", "STRING", "STRING", "STRING", "STRING", "STRING", "STRING"]
    # Column priority - A number which indicates the importance of each attribute. A higher number indicates a higher priority.
    column_priority = ["#1", "1", "1", "1", "1", "1", "1"]
    # Column headers for validation
    column_header_validation = ["PATIENT_ID", "SAMPLE_ID", "TUMOR_TYPE", "CANCER_TYPE", "CANCER_TYPE_DETAILED", "PRIMARY_SITE", "TUMOR_GRADE"]
    return [column_headers, column_description, column_data_type, column_priority, column_header_validation]

def generate_clinical_df(headers):
    """Build a DataFrame holding the header rows of a clinical data file.

    `headers[0]` supplies the column labels; `headers[1]` to `headers[4]`
    become rows 0 to 3 of the returned frame, in that order.
    """
    header_frame = pd.DataFrame(columns=headers[0])
    for row_label in range(4):
        # Row N of the frame is header list N + 1.
        header_frame.loc[row_label, :] = headers[row_label + 1]
    return header_frame

def write_clinical_file(out_df, out_path, file_type):
    """Write `out_df` to `<out_path>/data_clinical_<file_type>.txt`.

    The file is tab-separated and the DataFrame index is not written.
    """
    file_name = "data_clinical_" + file_type + ".txt"
    out_df.to_csv(join(out_path, file_name), sep="\t", index=False)

def clinical_patient_data_transformation(data_path, provider):
    """Build the patient clinical table for `provider`.

    Reads `<data_path>/<provider>_metadata-patient.tsv`, renames its columns
    to the patient display names, adds the two survival columns as empty
    strings, and appends the rows below the patient header rows. Cells equal
    to "Not collected" / "Not provided" (either capitalisation shown below)
    are replaced with empty strings.
    """
    sheet_path = join(data_path, provider+"_metadata-patient.tsv")
    header_frame = generate_clinical_df(add_headers_clinical_patient())

    column_names = {
        "patient_id": "#Patient Identifier",
        "sex": "Sex",
        "age_at_initial_diagnosis": "Diagnosis Age",
    }
    patients = read_metadata_without_fields(sheet_path).rename(columns=column_names)

    # The survival columns are always written empty.
    for blank_column in ("Overall Survival (Months)", "Overall Survival Status"):
        patients[blank_column] = ""

    # Select the sheet columns in header order so they line up with the header rows.
    combined = pd.concat([header_frame, patients[header_frame.columns]], ignore_index=True)
    for placeholder in ("Not collected", "Not Collected", "Not provided", "Not Provided"):
        combined = combined.replace(placeholder, "")
    return combined

def clinical_patient_sample_data_transformation(data_path, provider):
    """Build the sample clinical table for `provider`.

    Reads `<data_path>/<provider>_metadata-patient_sample.tsv`, renames its
    columns to the sample display names, left-joins the result of
    `get_cancer_system(provider)` on `model_id` to fill "Cancer Type", and
    appends the rows below the sample header rows. Cells equal to
    "Not collected" / "Not provided" (either capitalisation shown below) are
    replaced with empty strings, and rows repeating a
    (patient identifier, sample identifier) pair are dropped.
    """
    sheet_path = join(data_path, provider+"_metadata-patient_sample.tsv")
    header_frame = generate_clinical_df(add_headers_clinical_sample())

    column_names = {
        "patient_id": "#Patient Identifier",
        "sample_id": "Sample Identifier",
        "tumour_type": "Tumor Type",
        "diagnosis": "Cancer Type Detailed",
        "primary_site": "Primary site",
        "grade": "Tumor Grade",
    }
    samples = read_metadata_without_fields(sheet_path).rename(columns=column_names)
    cancer_systems = get_cancer_system(provider)
    samples = samples.merge(cancer_systems, on="model_id", how="left").reset_index(drop=True)
    samples["Cancer Type"] = samples["cancer_system"]

    # Select the sheet columns in header order so they line up with the header rows.
    combined = pd.concat([header_frame, samples[header_frame.columns]], ignore_index=True)
    for placeholder in ("Not collected", "Not Collected", "Not provided", "Not Provided"):
        combined = combined.replace(placeholder, "")
    return combined.drop_duplicates(subset=["#Patient Identifier", "Sample Identifier"])

def get_cancer_system(provider_name):
    """Fetch the `model_id` -> `cancer_system` table for one data source.

    Reads JSON from the cancermodels.org `model_metadata` endpoint, filtered
    with `data_source=eq.<provider_name>`, and returns a DataFrame with only
    the `model_id` and `cancer_system` columns. Requires network access.
    """
    endpoint = "https://www.cancermodels.org/api/model_metadata?data_source=eq."
    model_metadata = pd.read_json(endpoint+provider_name)
    wanted_columns = ["model_id", "cancer_system"]
    return model_metadata[wanted_columns]

def generate_clinical_patient(in_path, out_path, provider):
    """Build the patient clinical table for `provider` and write it to `out_path`."""
    patient_table = clinical_patient_data_transformation(in_path, provider)
    write_clinical_file(patient_table, out_path, "patient")
    
def generate_clinical_sample(in_path, out_path, provider):
    """Build the sample clinical table for `provider` and write it to `out_path`."""
    sample_table = clinical_patient_sample_data_transformation(in_path, provider)
    write_clinical_file(sample_table, out_path, "sample")
    
## Molecular data
def generate_mutation_files(in_path, out_path):
    """Placeholder for the mutation-data export; currently writes nothing.

    Both arguments are accepted but unused, and the result is always None.
    """
    # TODO: not implemented yet.
    return

def generate_meta_study_file(study, provider, out_path):
    """Write `<out_path>/meta_study.txt` for the given study.

    The file has one `key:<TAB>value` line per entry below, with no header
    row and no index column. `study` is written as the study identifier and
    `provider` is embedded in the name and description.
    """
    entries = [
        ["type_of_cancer:", "mixed"],
        ["cancer_study_identifier:", study],
        ["name:", "Mixed PDX models from "+provider],
        ["description:", "PDX models from "+provider+" for mixed cancer types."],
        ["groups:", "PUBLIC"],
        # ["short_name:", "PDCMs ("+provider+")"],  # disabled in the original
        ["reference_genome:", "hg38"],
        ["add_global_case_list:", "true"],
    ]
    meta_study = pd.DataFrame(entries, columns=[0, 1])
    target = join(out_path, "meta_study.txt")
    meta_study.to_csv(target, sep="\t", index=False, header=False)
    
def generate_meta_clinical_files(study, out_path, type):
    """Write `<out_path>/meta_clinical_<type>.txt` for a clinical data file.

    Parameters:
        study: study identifier written as `cancer_study_identifier`.
        out_path: directory the meta file is written to.
        type: clinical file kind as used in the file names, e.g. "patient"
            or "sample". Its upper-case form prefixes the `datatype` value.

    The file has one `key:<TAB>value` line per entry, with no header row and
    no index column.
    """
    entries = [
        ["cancer_study_identifier:", study],
        ["genetic_alteration_type:", "CLINICAL"],
        ["datatype:", type.upper()+"_ATTRIBUTES"],
        # Must match the name produced by write_clinical_file, which uses the
        # type as given; upper-casing it here pointed at a file that does not
        # exist on case-sensitive filesystems.
        ["data_filename:", "data_clinical_"+type+".txt"],
    ]
    meta_df = pd.DataFrame(entries, columns=[0, 1])
    meta_df.to_csv(join(out_path, "meta_clinical_"+type+".txt"), sep="\t", index=False, header=False)

def generate_meta_files(out_path, provider):
    """Create the study directory for `provider` and write its meta files.

    The study identifier is "cm_<provider lower-cased>_2023" and the study
    directory is `<out_path>/<study identifier>`. The study meta file and the
    patient and sample clinical meta files are written into it.

    Returns the study directory path.
    """
    study = "cm_"+provider.lower()+"_2023"
    study_dir = join(out_path, study)
    # to_csv does not create missing parent directories, so make sure the
    # study directory exists before any meta file is written.
    makedirs(study_dir, exist_ok=True)
    generate_meta_study_file(study, provider, study_dir)
    for clinical_type in ("patient", "sample"):
        generate_meta_clinical_files(study, study_dir, clinical_type)
    return study_dir
def generate_c_bio_portal_files(in_path, out_path, provider):
    """Write every study file for `provider`.

    The meta files are written first; the directory returned by
    `generate_meta_files` then receives the patient and sample clinical
    files, and is passed to the mutation export.
    """
    study_dir = generate_meta_files(out_path, provider)
    generate_clinical_patient(in_path, study_dir, provider)
    generate_clinical_sample(in_path, study_dir, provider)
    generate_mutation_files(in_path, study_dir)

In [69]:
# Providers to export; every other directory found under `home` is skipped.
SELECTED_PROVIDERS = {"JAX"}

# `providers` holds the provider directory names found under `home`.
for provider in tqdm(providers, desc ="Generating cBioPortal data: "):
    if provider not in SELECTED_PROVIDERS:
        continue
    print("Working on provider: "+provider)
    generate_c_bio_portal_files(join(home, provider), out_path, provider)

Generating cBioPortal data:   0%|          | 0/34 [00:00<?, ?it/s]

Working on provider: JAX


Generating cBioPortal data: 100%|██████████| 34/34 [00:02<00:00, 16.03it/s]
