In [3]:
import logging
import os
import re
from math import isnan
from os import remove

import pandas as pd
import requests
from tqdm import tqdm

# Route INFO-and-above records to a log file in the current working directory.
# Each record is written as "LEVEL:dd/mm/YYYY hh:MM AM/PM: message".
logging.basicConfig(
    filename='Fetch_RNASEQ_expression_PDMR.log',
    level=logging.INFO,
    format='%(levelname)s:%(asctime)s: %(message)s',
    datefmt='%d/%m/%Y %I:%M %p',
)
# Module-level logger shared by the functions and cells below.
log = logging.getLogger(__name__)

In [2]:
def extract_ensembleid_from_dbxrefs(row):
    """Return the Ensembl identifier found in a ``dbXrefs`` string.

    Parameters
    ----------
    row : str
        "|"-separated cross references, each of the form ``source:identifier``
        (for example ``"MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410"``).

    Returns
    -------
    str
        The text after the first ":" of the first entry that mentions
        "Ensembl", or ``""`` when there is no such entry or ``row`` is not a
        string (e.g. NaN for a missing cell).
    """
    if not isinstance(row, str):
        # Missing cells arrive as float NaN, which has no .split().
        return ""
    for entry in row.split("|"):
        if "Ensembl" not in entry:
            continue
        parts = entry.split(":")
        if len(parts) > 1:
            return parts[1]
        # An entry without ":" carries no identifier; keep looking.
    return ""

def extract_hgncid_from_dbxrefs(row):
    """Return the HGNC identifier found in a ``dbXrefs`` string as ``"HGNC:<n>"``.

    Parameters
    ----------
    row : str
        "|"-separated cross references such as
        ``"MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410"``.

    Returns
    -------
    str
        ``"HGNC:"`` followed by the third ":"-separated field of the first
        entry mentioning "HGNC" (so ``"HGNC:HGNC:5"`` gives ``"HGNC:5"``).
        The two-field form ``"HGNC:5"`` is accepted as well. Returns ``""``
        when nothing usable is found or ``row`` is not a string.
    """
    if not isinstance(row, str):
        # Missing cells arrive as float NaN, which has no .split().
        return ""
    for entry in row.split("|"):
        if "HGNC" not in entry:
            continue
        parts = entry.split(":")
        if len(parts) >= 3:
            return "HGNC:" + parts[2]
        if len(parts) == 2 and parts[0] == "HGNC":
            # Short form without the repeated prefix.
            return "HGNC:" + parts[1]
    return ""

def convert_cytoband2coord(row, cytobands):
    """Fill in ``start``/``end``/``strand`` of a gene row from its cytoband.

    Rows that already have a positive ``start`` are returned untouched. For
    the others, the row's ``map_location`` is compared with
    ``<chromosome without "chr"><cytoband>`` for every band on the row's
    chromosome; when exactly one band matches, its ``start_pos`` and
    ``end_pos`` are copied into the row and ``strand`` is set to ``1.0``.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``start``, ``chromosome`` and ``map_location``.
    cytobands : pandas.DataFrame
        Must provide ``chromosome`` (values like ``"chr1"``), ``cytoband``,
        ``start_pos`` and ``end_pos``. It is not modified.

    Returns
    -------
    pandas.Series
        The same ``row`` object, possibly updated.
    """
    # "start > 0" is False for NaN as well, so rows with a missing start fall
    # through to the cytoband lookup.
    if row.start > 0:
        return row

    on_chromosome = cytobands[cytobands.chromosome == "chr" + str(row.chromosome)]
    # Build the comparison key as a separate Series rather than assigning a
    # column on the filtered slice (which triggers SettingWithCopyWarning).
    band_labels = on_chromosome.chromosome.replace("chr", "", regex=True) + on_chromosome.cytoband
    hits = on_chromosome[band_labels == row.map_location]

    # Only an unambiguous single band is trusted.
    if len(hits) == 1:
        row['start'] = hits['start_pos'].iloc[0]
        row['end'] = hits['end_pos'].iloc[0]
        row['strand'] = 1.0
    return row

def get_geneSymbol_locations(
    ensembl_genes_path="/Users/tushar/Downloads/homo_sapiens_genes.json",
    ncbi_gene_info_path="/Users/tushar/Downloads/Homo_sapiens.gene_info",
    cytoband_path="/Users/tushar/pdx/update-data/cytoBand.txt",
):
    """Build a gene-symbol to genomic-location lookup table.

    Steps:
      1. Load the gene JSON, keep entries that have a name and whose
         ``version`` equals "GRCh38".
      2. Load the tab-separated gene-info table and extract an Ensembl ID
         from each ``dbXrefs`` value.
      3. Left-join (2) onto (1) by Ensembl ID, then let
         ``convert_cytoband2coord`` fill coordinates from the cytoband table
         for rows that still lack a positive ``start``.
      4. Drop rows without a ``start`` and keep the first row per ``Symbol``.

    Parameters
    ----------
    ensembl_genes_path, ncbi_gene_info_path, cytoband_path : str
        Input file locations. The defaults are the machine-specific paths
        this notebook was originally run with.

    Returns
    -------
    pandas.DataFrame
        Columns: Symbol, Synonyms, chromosome, strand, start, end, GeneID,
        ensembl_id.
    """
    Reference = pd.read_json(ensembl_genes_path)
    Reference = Reference[["id", "name", "seq_region_name", "strand", "start", "end", "coord_system", "synonyms"]]
    # Expand the coord_system values into their own columns
    # (presumably dicts that carry the "version" key filtered on below — TODO confirm).
    Reference = pd.concat([Reference, Reference.coord_system.apply(pd.Series)], axis=1).drop("coord_system", axis=1)
    # Positional on purpose: column 1 is the gene "name"; the expanded
    # coord_system may add a second column with the same label — verify.
    Reference["symbol"] = Reference.iloc[:, 1]
    Reference = Reference[Reference["symbol"].notna()]
    Reference = Reference[Reference["version"] == "GRCh38"]

    NCBI_ref = pd.read_csv(ncbi_gene_info_path, sep='\t')
    NCBI_ref['ensembl_id'] = NCBI_ref.dbXrefs.apply(extract_ensembleid_from_dbxrefs)
    NCBI_ref = NCBI_ref[["Symbol", "Synonyms", "chromosome", "map_location", "GeneID", "ensembl_id"]]

    cyto2coordinates = pd.read_csv(
        cytoband_path,
        sep='\t',
        names=["chromosome", "start_pos", "end_pos", "cytoband", "info"],
    )
    cyto2coordinates = cyto2coordinates[cyto2coordinates.cytoband.notna()]

    GeneSymbol_Locations = NCBI_ref.merge(Reference, left_on="ensembl_id", right_on="id", how="left")
    GeneSymbol_Locations = GeneSymbol_Locations.apply(convert_cytoband2coord, cytobands=cyto2coordinates, axis=1)
    GeneSymbol_Locations = GeneSymbol_Locations[["Symbol", "Synonyms", "chromosome", "strand", "start", "end", "GeneID", "ensembl_id"]]
    GeneSymbol_Locations = GeneSymbol_Locations[GeneSymbol_Locations.start.notna()]
    GeneSymbol_Locations = GeneSymbol_Locations.drop_duplicates(subset=['Symbol'])
    return GeneSymbol_Locations

In [3]:
def get_location_from_synonym(row, GeneSymbol_Locations):
    """Annotate a row that has no ``Symbol`` by looking its gene up as a synonym.

    When ``row['Symbol']`` is missing, ``row['gene_id']`` is searched as a
    whole "|"-delimited token in ``GeneSymbol_Locations.Synonyms``. The first
    matching entry supplies Symbol, chromosome, strand, start, end, GeneID
    and ensembl_id. Rows that already have a Symbol, or for which nothing
    matches, are returned unchanged.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``Symbol`` and ``gene_id``.
    GeneSymbol_Locations : pandas.DataFrame
        Lookup table with a "|"-separated ``Synonyms`` column plus the seven
        columns listed above.

    Returns
    -------
    pandas.Series
        The same ``row`` object, possibly updated.
    """
    if not pd.isna(row['Symbol']):
        return row

    # Raw strings and re.escape: the gene name is matched literally, and only
    # as a complete token between "|" separators (or the string boundaries).
    # Non-capturing groups avoid pandas' "pattern has match groups" warning.
    pattern = r"(?:^|\|)" + re.escape(str(row["gene_id"])) + r"(?:$|\|)"
    # na=False: entries without synonyms simply do not match.
    is_synonym = GeneSymbol_Locations.Synonyms.str.contains(pattern, regex=True, na=False)
    hits = GeneSymbol_Locations.loc[is_synonym]

    if len(hits) >= 1:
        # With several candidates the first one wins.
        first = hits.iloc[0]
        for column in ('Symbol', 'chromosome', 'strand', 'start', 'end', 'GeneID', 'ensembl_id'):
            row[column] = first[column]
    return row

def process_RSEM(sample_id, out_path, GeneSymbol_Locations):
    """Convert one sample's raw RSEM table into the expression template layout.

    Reads ``<out_path>/raw_exp/PDMR_expression_<sample_id>.tsv`` and the
    template ``expression_template-sheet.tsv`` (from the current working
    directory), fills the template's symbol, sample_id, rnaseq_tpm,
    rnaseq_fpkm, rnaseq_count and platform_id columns, and writes the result
    to ``<out_path>/expression/PDMR_expression_<sample_id>.tsv``.

    Parameters
    ----------
    sample_id : str
        Sample identifier used in the file names and the ``sample_id`` column.
    out_path : str
        Base output directory; a trailing separator is optional.
    GeneSymbol_Locations : object
        Currently unused. Gene-location annotation (joining this lookup on
        symbol/synonym to fill chromosome, strand, start/end, NCBI and
        Ensembl IDs) is disabled in this version.
    """
    file_name = "PDMR_expression_" + sample_id + ".tsv"
    # os.path.join instead of "+" so out_path works with or without a
    # trailing slash.
    RSEM = pd.read_csv(os.path.join(out_path, "raw_exp", file_name), sep="\t")
    template = pd.read_csv("expression_template-sheet.tsv", sep="\t")

    template["symbol"] = RSEM["gene_id"]
    template["sample_id"] = sample_id
    template["rnaseq_tpm"] = RSEM["TPM"]
    template["rnaseq_fpkm"] = RSEM["FPKM"]
    template["rnaseq_count"] = RSEM["expected_count"]
    template["platform_id"] = "expression_RNASEQ_Illumina_NGS"

    expression_dir = os.path.join(out_path, "expression")
    os.makedirs(expression_dir, exist_ok=True)
    template.to_csv(os.path.join(expression_dir, file_name), sep="\t", index=False)

def fetch_rnaseq_RSEM(URL, sample_id, out_path):
    """Download one RSEM gene table from PDMR and store it under ``out_path``.

    The response body is written to the scratch file ``gene_rnaseq.tsv`` in
    the current working directory, handed to ``merge_raw_RSEM_files`` and
    then deleted again, also when a step in between fails.

    Parameters
    ----------
    URL : str
        Path of the file relative to ``https://pdmdb.cancer.gov/``.
    sample_id : str
        Sample identifier passed on to ``merge_raw_RSEM_files``.
    out_path : str
        Output base directory passed on to ``merge_raw_RSEM_files``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server stops responding.
    """
    PDMR_domain = "https://pdmdb.cancer.gov/"
    # merge_raw_RSEM_files reads this exact file name.
    scratch_file = "gene_rnaseq.tsv"

    # The timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(PDMR_domain + URL, timeout=120)
    # Fail here rather than saving an HTML error page as if it were a TSV.
    response.raise_for_status()
    try:
        with open(scratch_file, "wb") as handle:
            handle.write(response.content)
        merge_raw_RSEM_files(sample_id, out_path)
    finally:
        if os.path.exists(scratch_file):
            remove(scratch_file)

def merge_raw_RSEM_files(sample_id, out_path):
    """Tag the downloaded RSEM table with its sample and save the key columns.

    Reads ``gene_rnaseq.tsv`` from the current working directory, adds a
    ``sample_id`` column and writes sample_id, gene_id, TPM, FPKM and
    expected_count to ``<out_path>/raw_exp/PDMR_expression_<sample_id>.tsv``.

    Parameters
    ----------
    sample_id : str
        Value for the ``sample_id`` column; also part of the output file name.
    out_path : str
        Base output directory; a trailing separator is optional.
    """
    RSEM = pd.read_csv("gene_rnaseq.tsv", sep="\t")
    RSEM["sample_id"] = sample_id

    # os.path.join instead of "+" so out_path works with or without a
    # trailing slash; create the folder on first use.
    raw_dir = os.path.join(out_path, "raw_exp")
    os.makedirs(raw_dir, exist_ok=True)
    destination = os.path.join(raw_dir, "PDMR_expression_" + sample_id + ".tsv")
    RSEM[['sample_id', 'gene_id', 'TPM', 'FPKM', 'expected_count']].to_csv(destination, sep="\t", index=False)

def PDMR_RNASEQ(path, sample_path, mol_sample_path, out_path, GeneSymbol_Locations):
    """Download and convert RNA-seq expression for every known PDMR sample.

    For each row of the RNA-seq sheet whose patient-sample ID is present in
    the sample sheet, the RSEM gene table is fetched, converted to the
    expression template layout and a record is appended to the molecular
    sample sheet. The extended molecular sample sheet is finally written to
    ``<out_path>PDMR_molecular_metadata-sample.tsv``.

    Parameters
    ----------
    path : str
        CSV with the columns "RSEM(genes)", "Patient ID", "Specimen ID",
        "Sample ID" and "PDM Type".
    sample_path : str
        Tab-separated sample sheet with a ``sample_id`` column.
    mol_sample_path : str
        Tab-separated molecular sample sheet to extend.
    out_path : str
        Output base directory (used as a string prefix for the final sheet).
    GeneSymbol_Locations : object
        Forwarded to ``process_RSEM``.
    """
    PDMR_rnaseq_sheet = pd.read_csv(path, header=0)
    sample_sheet = pd.read_csv(sample_path, sep="\t")
    mol_sample_sheet = pd.read_csv(mol_sample_path, sep="\t")

    # Built once: membership tests against a set are O(1), whereas the list
    # was previously rebuilt and scanned on every iteration.
    known_sample_ids = set(sample_sheet["sample_id"])

    urls = PDMR_rnaseq_sheet["RSEM(genes)"]
    patient_ids = PDMR_rnaseq_sheet["Patient ID"]
    specimen_ids = PDMR_rnaseq_sheet["Specimen ID"]
    sample_labels = PDMR_rnaseq_sheet["Sample ID"]
    model_types = PDMR_rnaseq_sheet["PDM Type"]

    for i in tqdm(range(PDMR_rnaseq_sheet.shape[0])):
        URL = urls.iloc[i]
        model_type = model_types.iloc[i]
        specimen_key = str(patient_ids.iloc[i]) + "-" + str(specimen_ids.iloc[i])
        sample_label = str(sample_labels.iloc[i])

        # Organoid/PDC models are listed in the sample sheet with the sample
        # label appended; everything else is looked up by patient-specimen.
        if "Organoid" in model_type or "PDC" in model_type:
            ps_id = specimen_key + "-" + sample_label
        else:
            ps_id = specimen_key

        # Originator specimens keep the bare patient-specimen ID.
        if sample_label != "ORIGINATOR":
            sample_id = specimen_key + "-" + sample_label
        else:
            sample_id = specimen_key

        if ps_id in known_sample_ids:
            log.info("Fetching expression data for " + sample_id + ".")
            fetch_rnaseq_RSEM(URL, sample_id, out_path)
            process_RSEM(sample_id, out_path, GeneSymbol_Locations)
            mol_sample_sheet = add_to_mol_sample(sample_id, model_type, mol_sample_sheet)

    mol_sample_sheet.to_csv(out_path + "PDMR_molecular_metadata-sample.tsv", sep="\t", index=False)

def add_to_mol_sample(sample_id, model_type, df):
    """Append one RNA-seq record to the molecular sample sheet.

    Parameters
    ----------
    sample_id : str
        Sample identifier of the new record.
    model_type : str
        Decides ``sample_origin`` and ``model_id``:
        contains "Organoid" or "PDC" -> origin "cell", model = sample_id;
        "Patient/Originator Specimen" -> origin "patient", model = sample_id;
        "PDX" -> origin "xenograft", model = sample_id without its last
        "-"-separated part.
    df : pandas.DataFrame
        Existing sheet with the columns Field, model_id, sample_id,
        sample_origin, passage, host_strain_name, host_strain_nomenclature,
        engrafted_tumor_collection_site, raw_data_url, platform_id.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus exactly one appended row, index reset.

    Raises
    ------
    ValueError
        If ``model_type`` is none of the recognised kinds.
    """
    if "Organoid" in model_type or "PDC" in model_type:
        sample_origin = "cell"
        model = sample_id
    elif model_type == "Patient/Originator Specimen":
        sample_origin = "patient"
        model = sample_id
    elif model_type == "PDX":
        sample_origin = "xenograft"
        model = "-".join(sample_id.split("-")[:-1])
    else:
        raise ValueError("Unsupported model type for sample " + str(sample_id) + ": " + repr(model_type))

    # One record keyed by column name. Wrapping it in a list makes a single
    # ROW; a bare list of values would become a single column of ten rows.
    record = {
        "Field": "",
        "model_id": model,
        "sample_id": sample_id,
        "sample_origin": sample_origin,
        "passage": "",
        "host_strain_name": "",
        "host_strain_nomenclature": "",
        "engrafted_tumor_collection_site": "",
        "raw_data_url": "",
        "platform_id": "expression_RNASEQ_Illumina_NGS",
    }
    return pd.concat([df, pd.DataFrame([record])]).reset_index(drop=True)

    

In [4]:
# Run configuration for the PDMR RNA-seq fetch.
# NOTE(review): the absolute paths below are specific to one machine and must
# be adapted before running elsewhere.
# RNA-seq sheet, read from the current working directory.
PDMR_rnaseq_sheet_path = 'rnasequence.csv'
# Existing sample sheet; only samples listed here are processed.
sample_path = '/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/PDMR/PDMR_metadata-patient_sample.tsv'
# Existing molecular sample sheet that new RNA-seq records are appended to.
mol_sample_path = '/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/PDMR/PDMR_molecular_metadata-sample.tsv'
# Output base; used as a plain string prefix, so the trailing "/" matters.
out_path = "/Users/tushar/CancerModels/submission/PDMR/"
# Placeholder: building the gene-location table is switched off here, so an
# empty string is passed through instead.
GeneSymbol_Locations = ""#get_geneSymbol_locations()
log.info("Fetching RNASeq RSEM data from PDMR...\n")

In [5]:
# Full fetch-and-convert run over every row of the RNA-seq sheet. This is a
# long-running cell: the recorded output below shows 5561 rows processed in
# about 2 h 28 min.
PDMR_RNASEQ(PDMR_rnaseq_sheet_path, sample_path, mol_sample_path, out_path, GeneSymbol_Locations)

100%|██████████| 5561/5561 [2:27:43<00:00,  1.59s/it]  


In [21]:
# Reload the molecular sample sheet from out_path, plus the table that holds
# the passage of each sample.
mol_sample_sheet = pd.read_csv(out_path + "PDMR_molecular_metadata-sample.tsv", sep="\t")
sample_passage = pd.read_csv('pdm_samples.csv')
# Same "<patient>-<specimen>-<sample>" key that PDMR_RNASEQ builds. The parts
# are cast to str first (as PDMR_RNASEQ does with str()) so numeric-looking
# IDs do not break the concatenation; missing parts become the text "nan".
sample_passage['sample_id'] = (
    sample_passage['Patient ID'].astype(str)
    + '-' + sample_passage["Specimen ID"].astype(str)
    + "-" + sample_passage['Sample ID'].astype(str)
)

In [22]:
def get_passage(row):
    """Fill ``passage`` for RNA-seq xenograft rows of the molecular sample sheet.

    The passage is taken from the first ``sample_passage`` entry whose
    ``sample_id`` equals the row's. Rows of other platforms or origins are
    returned unchanged. If no entry matches, a warning is logged and the
    row's passage is left as it was.
    """
    if row['platform_id'] == "expression_RNASEQ_Illumina_NGS" and row['sample_origin'] == "xenograft":
        passages = sample_passage.loc[sample_passage['sample_id'] == row['sample_id'], 'Passage']
        if passages.empty:
            # Previously an unmatched sample raised KeyError and aborted the
            # whole apply(); report it and carry on instead.
            log.warning("No passage found for sample " + str(row['sample_id']) + "; leaving passage unchanged.")
        else:
            row['passage'] = passages.iloc[0]
    return row
mol_sample_sheet.apply(get_passage, axis=1).to_csv(out_path + "PDMR_molecular_metadata-sample.tsv", sep="\t", index=False)

In [26]:
# Reload the molecular sample sheet from out_path, plus the RNA-seq sheet
# that holds the download path of each sample.
mol_sample_sheet = pd.read_csv(out_path + "PDMR_molecular_metadata-sample.tsv", sep="\t")
rnaseq = pd.read_csv(PDMR_rnaseq_sheet_path)
# Same "<patient>-<specimen>-<sample>" key that PDMR_RNASEQ builds. The parts
# are cast to str first (as PDMR_RNASEQ does with str()) so numeric-looking
# IDs do not break the concatenation; missing parts become the text "nan".
rnaseq['sample_id'] = (
    rnaseq['Patient ID'].astype(str)
    + '-' + rnaseq["Specimen ID"].astype(str)
    + "-" + rnaseq['Sample ID'].astype(str)
)

In [27]:
def add_raw_data_url(row):
    """Fill ``raw_data_url`` for non-patient RNA-seq rows of the molecular sample sheet.

    The URL is "https://pdmdb.cancer.gov/" followed by the "RSEM(genes)"
    value of the first ``rnaseq`` entry whose ``sample_id`` equals the row's.
    Rows of other platforms, or with origin "patient", are returned
    unchanged. If no entry matches, a warning is logged and the row's URL is
    left as it was.
    """
    if row['platform_id'] == "expression_RNASEQ_Illumina_NGS" and row['sample_origin'] != "patient":
        paths = rnaseq.loc[rnaseq['sample_id'] == row['sample_id'], 'RSEM(genes)']
        if paths.empty:
            # Previously an unmatched sample raised KeyError and aborted the
            # whole apply(); report it and carry on instead.
            log.warning("No RSEM file found for sample " + str(row['sample_id']) + "; leaving raw_data_url unchanged.")
        else:
            row['raw_data_url'] = "https://pdmdb.cancer.gov/" + paths.iloc[0]
    return row

# This cell used to define the function under the name get_passage, which
# did not describe what it does; the old name stays bound for compatibility.
get_passage = add_raw_data_url
mol_sample_sheet.apply(add_raw_data_url, axis=1).to_csv(out_path + "PDMR_molecular_metadata-sample.tsv", sep="\t", index=False)

In [6]:
# Project-local helpers. NOTE(review): only get_files is used in the code
# visible below; this import sits mid-notebook rather than with the others.
from utils import get_files, get_dirs
# Root of the provider data folders. NOTE(review): machine-specific absolute path.
home = "/Users/tushar/CancerModels/pdxfinder-data/data/UPDOG/"

In [11]:
# Rewrite every PDMR expression file in place so that it holds exactly the
# template's columns, in the template's order.
expression_dir = os.path.join(home, 'PDMR', 'expression')
expression_files = get_files(expression_dir)
template = pd.read_csv('expression_template-sheet.tsv', sep='\t')
for f in expression_files:
    file_path = os.path.join(expression_dir, f)
    df = pd.read_csv(file_path, sep='\t')[template.columns]
    df.to_csv(file_path, sep='\t', index=False)