In [1]:
import pandas as pd
from utils import *

In [2]:
# Reference URL for the Ensembl human JSON dump.
# NOTE(review): presumably the source of the gene-location table used by the
# mapping helpers further down — confirm.
#https://ftp.ensembl.org/pub/current_json/homo_sapiens/homo_sapiens.json
# Directory the kernel was started from. `getcwd` is not imported explicitly in
# this notebook, so it is expected to arrive via `from utils import *`.
start_dir = getcwd()
# Local pdxfinder-data directory. The sheet builders below read this
# module-level `home` directly when they locate their template files.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
home = "/Users/tushar/CancerModels/pdxfinder-data/"
# Default output directory (the functions below receive their own `out_path`
# argument, which shadows this global inside them).
out_path = "/Users/tushar/CancerModels/submission/"
# Entries returned by `get_dirs(home)`, passed through `sort_case_insensitive`.
# Both helpers are defined outside this notebook; their exact behaviour is not
# visible here.
providers = sort_case_insensitive(get_dirs(home))

# Data from cBioPortal

In [3]:
def map_gene_to_location(row):
    """Fill genomic coordinates and gene identifiers on one molecular-data row.

    The row's ``symbol`` is looked up (exact match) in the ``Symbol`` column of
    the notebook-level ``geneSymbol_location`` table. When at least one record
    matches, the first one populates ``chromosome``, ``strand``,
    ``seq_start_position``, ``seq_end_position``, ``ncbi_gene_id`` and
    ``ensembl_gene_id``. When nothing matches, the row is handed to
    ``get_location_from_synonym`` and its result is returned.

    Designed to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A record that contains a ``symbol`` entry.

    Returns
    -------
    pandas.Series
        The same row with the location fields filled in.
    """
    hits = geneSymbol_location[geneSymbol_location['Symbol'] == row['symbol']]
    if len(hits) == 0:
        return get_location_from_synonym(row)

    # Reduce to a single record up front and read scalars from it. Indexing a
    # one-record Series a second time with [0] (as happened for duplicated
    # symbols) returns the first character of string fields and fails on
    # numeric ones.
    first = hits.iloc[0]
    row['chromosome'] = first['chromosome']
    row['strand'] = first['strand']
    row['seq_start_position'] = first['start']
    row['seq_end_position'] = first['end']
    row['ncbi_gene_id'] = first['GeneID']
    row['ensembl_gene_id'] = first['ensembl_id']
    return row

def process_cbioportal_files(params, desc):
    """Convert one cBioPortal study folder into submission sheets.

    Parameters
    ----------
    params : sequence
        Positional bundle ``[home, in_path, out_path, study]``.
    desc : str
        Progress message printed before any work starts.

    The clinical sheets are written first; the sample identifiers found there
    are then passed on to the molecular-data export. Always returns ``None``.
    """
    print(desc)
    home = params[0]
    in_path = params[1]
    out_path = params[2]
    study = params[3]

    # The output folder is only prepared when the input folder is present;
    # processing continues either way.
    input_present = exists(in_path)
    if input_present:
        dir_check(out_path)

    files = get_files(in_path)
    # The study meta file is parsed here, but the resulting frame is not used
    # afterwards.
    _ = pd.read_csv(join(in_path, 'meta_study.txt'), sep='\t')

    clinical = read_clinical_files(home, in_path, out_path, study)
    unique_samples = clinical['#Sample Identifier'].unique()
    read_mol_files(in_path, out_path, files, study, list(unique_samples))

def get_template(path):
    """Load a template sheet and return it together with its columns.

    The file at ``path`` is read with ``read_metadata_with_fields``; the result
    is returned as ``(template_frame, template_frame.columns)``.
    """
    template = read_metadata_with_fields(path)
    return template, template.columns
    
def read_clinical_files(home, in_path, out_path, study):
    """Read the clinical patient/sample files and write the metadata sheets.

    Samples are restricted to those whose ``Sample Class`` is ``Organoid`` and
    whose ``#Sample Identifier`` contains ``Organoid``; patient attributes are
    then left-joined onto them. Five TSV files prefixed with ``study`` are
    written into ``out_path``: patient, patient-sample, cell_model,
    model_validation (the unmodified template) and sharing.

    Returns the merged clinical frame. The sheet builders add their output
    columns to this frame in place, so the export order below is significant.
    """
    patient = read_metadata_with_fields(join(in_path, 'data_clinical_patient.txt'))
    sample = read_metadata_with_fields(join(in_path, 'data_clinical_sample.txt'))

    organoid_class = sample['Sample Class'] == 'Organoid'
    sample = sample[organoid_class]
    organoid_named = sample['#Sample Identifier'].str.contains('Organoid')
    sample = sample[organoid_named].reset_index(drop=True)

    clinical = sample.merge(
        patient,
        left_on='Patient Identifier',
        right_on='#Patient Identifier',
        how='left',
    ).reset_index(drop=True)
    # Keep rows whose patient id is among the ids present in the merged frame.
    known_patients = list(clinical['#Patient Identifier'].unique())
    clinical = clinical[clinical['#Patient Identifier'].isin(known_patients)]

    def export(frame, suffix):
        # Shared writer: tab-separated, no index column.
        frame.to_csv(join(out_path, study + suffix), sep='\t', index=False)

    export(generate_patient_sheet(clinical), '_metadata-patient.tsv')
    export(generate_patient_sample_sheet(clinical), '_metadata-patient-sample.tsv')
    export(generate_cell_sheet(clinical), '_metadata-cell_model.tsv')
    validation_template = read_metadata_with_fields(
        join(home, "template/active_templates/metadata/metadata_template-model_validation.tsv")
    )
    export(validation_template, '_metadata-model_validation.tsv')
    export(generate_sharing_sheet(clinical), '_metadata-sharing.tsv')
    return clinical

def generate_patient_sheet(p):
    """Build the patient metadata sheet from the merged clinical frame.

    Adds the patient template fields to ``p`` in place, then returns the
    patient template rows followed by ``p`` restricted to the template columns.
    The template is located under the module-level ``home`` directory.
    """
    patient_ids, sexes = p['#Patient Identifier'], p['Sex']
    p['patient_id'] = patient_ids
    p['sex'] = sexes

    smoking_history = 'Smoking Status: ' + p['Smoking Status']
    p['Field'] = ''
    p['history'] = smoking_history

    # 'Ethinicity' is the column name as spelled in the input data.
    ethnicity = p['Ethinicity'].replace('African-American', 'African American')
    p['ethnicity'] = ethnicity
    p['ethnicity_assessment_method'] = 'Not provided'

    for missing_field in ('initial_diagnosis', 'age_at_initial_diagnosis'):
        p[missing_field] = 'Not provided'

    template_path = join(home, "template/active_templates/metadata/", "metadata_template-patient.tsv")
    temp_patient, cols = get_template(template_path)

    sheet_rows = p[cols]
    return pd.concat([temp_patient, sheet_rows]).reset_index(drop=True)

def generate_patient_sample_sheet(ps):
    """Build the patient-sample metadata sheet from the merged clinical frame.

    Adds the patient-sample template fields to ``ps`` in place, then returns
    the template rows followed by ``ps`` restricted to the template columns.
    The template is located under the module-level ``home`` directory.

    NOTE(review): the two treatment flags are derived from the length of
    ``Prior Intravesical Therapy`` (missing values count as the one-character
    string ``'N'``). A two-character value yields ``'No'`` for both
    ``treatment_naive_at_collection`` and ``treated_prior_to_collection`` —
    confirm that this is intended.
    """
    template_path = join(home, "template/active_templates/metadata/", "metadata_template-patient_sample.tsv")
    temp_ps, cols = get_template(template_path)

    patient_ids, sample_ids = ps['Patient Identifier'], ps['#Sample Identifier']
    ps['Field'] = ''
    ps['patient_id'] = patient_ids
    ps['sample_id'] = sample_ids

    ages, diagnoses = ps['Age'], ps['Cancer Type Detailed']
    ps['age_in_years_at_collection'] = ages
    ps['diagnosis'] = diagnoses

    ps['tumour_type'] = ps['Sample Type'].replace('Unknown_Tumor', 'Not provided')
    site = ps['Tissue Site'].fillna('Not provided')
    ps['primary_site'] = site.str.replace('_', ' ', regex=True)
    ps['collection_site'] = ps['primary_site']

    stages = ps['Tumor Stage']
    ps['stage'] = stages
    ps['staging_system'] = 'TNM staging system'
    ps['grade'] = ps['Pathological Classification of Parental Tumor Sample']
    ps['sharable'] = 'Yes'

    def therapy_flag(is_yes):
        # 'Yes'/'No' per row, decided by the length of the therapy text.
        therapy = ps['Prior Intravesical Therapy'].fillna('N')
        return ['Yes' if is_yes(len(text)) else 'No' for text in therapy]

    ps['treatment_naive_at_collection'] = therapy_flag(lambda size: size < 2)
    ps['treated_at_collection'] = 'No'
    ps['treated_prior_to_collection'] = therapy_flag(lambda size: size > 2)

    for col_name in ('collection_date', 'collection_event', 'months_since_collection_1',
                     'grading_system', 'virology_status'):
        ps[col_name] = 'Not provided'
    ps['model_id'] = ps['#Sample Identifier']

    sheet_rows = ps[cols]
    return pd.concat([temp_ps, sheet_rows]).reset_index(drop=True)

def generate_cell_sheet(c):
    """Build the cell-model metadata sheet from the merged clinical frame.

    Adds the cell-model template fields to ``c`` in place, then returns the
    template rows followed by ``c`` restricted to the template columns. The
    template is located under the module-level ``home`` directory.
    """
    template_path = join(home, "template/active_templates/metadata/", "metadata_template-cell_model.tsv")
    temp_c, cols = get_template(template_path)

    model_ids = c['#Sample Identifier']
    c['Field'] = ""
    c["model_id"] = model_ids
    c['type'] = "Organoid"
    c["publications"] = "PMID: 29625057"

    preservation_note = "Specimen Preservation Type: " + c["Specimen Preservation Type"]
    tmb_note = "; TMB (nonsynonymous): " + c["TMB (nonsynonymous)"]
    c["comments"] = preservation_note + tmb_note

    for blank_field in ("parent_id", "origin_patient_sample_id"):
        c[blank_field] = ""
    for col_name in ("name", "growth_properties", "supplier", "external_ids"):
        c[col_name] = 'Not provided'

    sheet_rows = c[cols]
    return pd.concat([temp_c, sheet_rows]).reset_index(drop=True)

def generate_sharing_sheet(s):
    """Build the sharing metadata sheet from the merged clinical frame.

    Every model receives the same contact, accessibility, licence and database
    URL; ``model_id`` is taken from ``#Sample Identifier``. The fields are
    added to ``s`` in place, then the template rows followed by ``s``
    restricted to the template columns are returned. The template is located
    under the module-level ``home`` directory.
    """
    # Values shared by every row, in the order the columns are created.
    shared_values = {
        'accessibility': "academia and industry",
        'europdx_access_modality': '',
        'email': "mshen@columbia.edu",
        'name': "Michael M. Shen",
        'form_url': '',
        'database_url': "https://www.cuimc.columbia.edu/news/organoids-created-patients-bladder-cancers-could-guide-treatment#:~:text=Organoids%20created%20from%20the%20bladder,College%20of%20Physicians%20and%20Surgeons.",
        'license': "CC0",
    }

    template_path = join(home, "template/active_templates/metadata/", "metadata_template-sharing.tsv")
    temp_s, cols = get_template(template_path)

    s['model_id'] = s['#Sample Identifier']
    for field, value in shared_values.items():
        s[field] = value

    sheet_rows = s[cols]
    return pd.concat([temp_s, sheet_rows]).reset_index(drop=True)
        
def convert_matrix_to_df(df, ids, var_name, value_name):
    """Unpivot a wide matrix into long form.

    ``ids`` names the single identifier column that is kept; every other
    column becomes one row per original row, with the former column label in
    ``var_name`` and the cell value in ``value_name``.
    """
    long_form = df.melt(id_vars=[ids], var_name=var_name, value_name=value_name)
    return long_form

def extract_mol(df, params, type, mol):
    """Reshape a cBioPortal molecular table into the matching template layout.

    Parameters
    ----------
    df : pandas.DataFrame
        For copy-number data, a gene-by-sample matrix with a ``Hugo_Symbol``
        column. For mutation data, a table providing the columns read in the
        mutation branch below (``Tumor_Sample_Barcode``, ``Hugo_Symbol``,
        ``Chromosome``, ...).
    params : any
        Not used; kept so existing call sites keep working.
    type : str
        The ``datatype`` of the profile. Copy-number data is only converted
        when this equals ``"DISCRETE"``.
    mol : str
        The ``genetic_alteration_type`` of the profile; the branch is chosen
        by whether it contains ``"COPY"`` or ``"MUTATION"``.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to the template columns. When no branch applies
        (non-discrete copy-number data or another alteration type) ``df`` is
        returned unchanged. The caller's frame is never modified.
    """
    if "COPY" in mol:
        if type == "DISCRETE":
            df = convert_matrix_to_df(df, "Hugo_Symbol", "sample_id", "gistic_value")
            _, col = get_template(join(home, "template/active_templates/cna/cna_template-sheet.tsv"))
            df['platform_id'] = "cna_WES_Illumina_2500"
            df['symbol'] = df['Hugo_Symbol']
            # Drop cells without a call. The explicit copy makes the following
            # column assignment operate on an independent frame rather than on
            # a filtered slice (avoids SettingWithCopyWarning).
            df = df[df['gistic_value'].fillna('') != ''].copy()
            df['gistic_value'] = df['gistic_value'].astype(int)
            # Template columns that have no source in the matrix are emitted
            # empty. Columns listed in the original notebook comment:
            # sample_id chromosome strand seq_start_position seq_end_position
            # symbol ucsc_gene_id ncbi_gene_id ensembl_gene_id log10r_cna
            # log2r_cna fold_change copy_number_status gistic_value
            # picnic_value platform_id
            for col_name in col:
                if col_name not in df.columns:
                    df[col_name] = ''
            df = df[col]
    elif "MUTATION" in mol:
        _, col = get_template(join(home, "template/active_templates/mut/mutation_template_external.tsv"))
        # assign() returns a new frame, so the table passed in by the caller
        # keeps its original columns.
        df = df.assign(
            sample_id=df['Tumor_Sample_Barcode'],
            symbol=df['Hugo_Symbol'],
            read_depth='',
            allele_frequency='',
            chromosome=df['Chromosome'],
            strand=df['Strand'],
            seq_start_position=df['Start_Position'],
            seq_end_position=df['End_Position'],
            ref_allele=df['Reference_Allele'],
            alt_allele=df['Tumor_Seq_Allele2'],
            variation_id=df['dbSNP_RS'],
            platform_id="mutation_WES_Illumina_2500",
        )
        df = df[col]
    return df
        
def read_meta_files(file_path):
    """Parse a ``key: value`` meta file into a dictionary.

    Each line is stripped and split at the first ``': '``; the part before it
    is the key and everything after it is the value, kept verbatim (so a value
    may itself contain ``': '``). Blank lines and lines without the separator
    are skipped instead of aborting the whole read. When a key occurs more
    than once, the last occurrence wins.

    Parameters
    ----------
    file_path : str
        Path of the meta file to read.

    Returns
    -------
    dict
        Mapping of key to value, both strings.
    """
    data = {}
    with open(file_path, 'r') as file:
        for raw_line in file:
            key, separator, value = raw_line.strip().partition(': ')
            if not separator:
                # Nothing to record: empty line or a line that is not a
                # "key: value" pair.
                continue
            data[key] = value
    return data

def read_mol_files(in_path, out_path, files, study, sample_ids):
    """Export the molecular data of a study and its sample/platform sheet.

    Parameters
    ----------
    in_path : str
        Study folder holding the ``meta_*.txt`` / ``data_*.txt`` files.
    out_path : str
        Folder receiving the output; ``cna`` and ``mut`` sub-folders are
        prepared through ``dir_check`` when the corresponding data exists.
    files : collection of str
        File names available in ``in_path``; decides which data types are
        processed.
    study : str
        Prefix of every output file name.
    sample_ids : list of str
        Samples to keep.

    Copy-number data is processed only when ``meta_cna.txt`` is listed in
    ``files`` and mutation data only when ``meta_mutations.txt`` is listed.
    The molecular-metadata sample sheet is always written; it holds the
    template rows plus one row per sample for each processed data type.
    The template is located under the module-level ``home`` directory.
    Always returns ``None``.
    """
    mol_plat_sample, mol_cols = get_template(join(home, "template/active_templates/molecular_metadata/", "molecular_metadata-sample.tsv"))

    def platform_rows(platform_id):
        # One sheet row per sample; the sample id is used for both id columns.
        # The row layout has ten positions, so the template is expected to
        # have ten columns.
        rows = [["", sample, sample, "cell", "", "", "", "", "", platform_id] for sample in sample_ids]
        return pd.DataFrame(rows, columns=mol_cols)

    if 'meta_cna.txt' in files:
        meta = read_meta_files(join(in_path, 'meta_cna.txt'))
        data = pd.read_csv(join(in_path, 'data_cna.txt'), sep='\t')
        data = data[['Hugo_Symbol'] + sample_ids]
        data = extract_mol(data, [], meta["datatype"], meta["genetic_alteration_type"])
        data = data.apply(map_gene_to_location, axis=1)
        cna_path = join(out_path, "cna")
        dir_check(cna_path)
        data.to_csv(join(cna_path, study+'_cna.tsv'), sep='\t', index=False)
        mol_plat_sample = pd.concat([mol_plat_sample, platform_rows("cna_WES_Illumina_2500")]).reset_index(drop=True)

    # Membership test against `files`: a bare non-empty string is always
    # truthy, which made this branch run (and fail on the missing file) for
    # studies without mutation data.
    if 'meta_mutations.txt' in files:
        meta = read_meta_files(join(in_path, 'meta_mutations.txt'))
        data = pd.read_csv(join(in_path, 'data_mutations.txt'), sep='\t')
        data = extract_mol(data, [], meta["datatype"], meta["genetic_alteration_type"])
        data = data[data['sample_id'].isin(sample_ids)]
        mutation_path = join(out_path, "mut")
        dir_check(mutation_path)
        data.to_csv(join(mutation_path, study+'_mut.tsv'), sep='\t', index=False)
        mol_plat_sample = pd.concat([mol_plat_sample, platform_rows("mutation_WES_Illumina_2500")]).reset_index(drop=True)

    mol_plat_sample.to_csv(join(out_path, study+'_molecular_metadata-sample.tsv'), sep='\t', index=False)
    return None

# Data from Excel files

In [4]:
def process_excel_files(params=None, desc=None):
    """Placeholder for the Excel-based import path (not implemented yet).

    Accepts the same ``(params, desc)`` pair that ``main`` passes to its
    processing function, so selecting the ``"excel"`` run type no longer fails
    with a ``TypeError``. Both arguments are optional, which keeps the
    previous zero-argument call working.

    Parameters
    ----------
    params : any, optional
        Run parameters; currently ignored.
    desc : str, optional
        Progress message; printed when given, like the cBioPortal processor.

    Returns
    -------
    None
    """
    if desc is not None:
        print(desc)
    return None

# Main run

In [5]:
def main(params, run_type):
    """Dispatch a run to the processor that matches ``run_type``.

    ``"cbio"`` selects the cBioPortal processor and ``"excel"`` the Excel
    processor; the chosen function is called with ``params`` and a progress
    message. Any other value prints a notice and returns ``None`` without
    doing any work.
    """
    if run_type not in ("cbio", "excel"):
        print("No run type found")
        return None

    if run_type == "cbio":
        func, desc = process_cbioportal_files, "Processing cBioPortal files ..."
    else:
        func, desc = process_excel_files, "Processing excel files ..."
    func(params, desc)

## Bladder Columbia 2018 - Organoids
Columbia University Medical Center

In [6]:
# Inputs for the Bladder Columbia 2018 organoid run.
# NOTE(review): absolute, user-specific paths — adjust before running elsewhere.
input_path = "/Users/tushar/CancerModels/submission/bladder_columbia_msk_2018/"
# Same directory string as the `out_path` defined in the configuration cell.
output_path = "/Users/tushar/CancerModels/submission/"
# Used both as the output sub-folder name and as the prefix of every output file.
study = "CUMC"
# Positional layout unpacked by process_cbioportal_files: [home, in_path, out_path, study].
params = [home, input_path, join(output_path, study), study]
# Runs the cBioPortal conversion; this writes the submission sheets to disk.
main(params, 'cbio')

Processing cBioPortal files ...


  match = geneSymbol_location.loc[geneSymbol_location.Synonyms.str.contains(pattern)].reset_index(drop=True)


BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap16INK4A
D17S250
D2S123
NR-21
NR-24
BAT-25
BAT-26
CDKN2Ap14ARF
CDKN2Ap

In [3]:
# Expression table (normalised counts, judging by the file name) read from a
# local download.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
expression = pd.read_csv("/Users/tushar/Downloads/GSE103990_Normalized_counts.txt", sep='\t')
# Patient-sample sheet used to decide which samples to keep. Relies on `home`
# and `join` from earlier cells.
# NOTE(review): this sheet lives under "CUIMC", while the conversion cell above
# writes under the study name "CUMC" — confirm which spelling is intended.
sample_ids = pd.read_csv(join(home, 'data/UPDOG/CUIMC/CUIMC_metadata-patient_sample.tsv'),sep='\t')

In [6]:
# Sample ids from the sheet, skipping its first four rows.
# NOTE(review): presumably those rows are the template's descriptive header rows — confirm.
samples = list(sample_ids['sample_id'].iloc[4:])

In [7]:
# Long format: one row per (gene label, sample) pair with its count.
df = expression.melt(id_vars=['Unnamed: 0'], var_name='sample_id', value_name='rnaseq_count')

In [10]:
# Inspection only: shows how the gene labels in 'Unnamed: 0' break apart on '_'.
# The captured output below shows an "<Ensembl gene id>_<symbol>" label.
df['Unnamed: 0'].str.split('_')

['ENSG00000000003', 'TSPAN6']

In [11]:
# Gene labels look like "<Ensembl gene id>_<symbol>". Split only at the first
# underscore so that symbols which themselves contain an underscore (for
# example "Y_RNA") are kept whole instead of being cut at their first part.
# The split is computed once and reused for both columns.
_gene_label_parts = df['Unnamed: 0'].str.split('_', n=1)
df['ensembl_gene_id'] = _gene_label_parts.str[0]
df['symbol'] = _gene_label_parts.str[1]

In [15]:
# Keep only the selected samples and write them as TSV into the current
# working directory. index=False matches the other TSV exports in this
# notebook and avoids an unnamed leading column of row numbers in the file.
df[df['sample_id'].isin(samples)].reset_index(drop=True).to_csv('CUIMC_expression.tsv', sep='\t', index=False)

In [16]:
# Sanity check: which of the requested sample ids actually occur in the
# expression table (only these end up in the exported file).
df[df['sample_id'].isin(samples)].sample_id.unique()

array(['SCBO-1_Organoid_P7', 'SCBO-9_Organoid_P0', 'SCBO-7_Organoid_P7',
       'SCBO-8_Organoid_P11', 'SMBO-1_Organoid_P0', 'SCBO-15_Organoid_P8',
       'SCBO-3_Organoid_P14', 'SCBO-7_2_Organoid_P1'], dtype=object)