In [2]:
import os

import pandas as pd
import requests
from os import remove
import logging
from math import isnan
from tqdm import tqdm

# Module-level logger used by the functions in this notebook.
log = logging.getLogger(__name__)
# Send INFO-and-above records to a log file instead of the cell output.
# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(filename='Fetch_RNASEQ_expression_PDMR.log', level=logging.INFO, format='%(levelname)s:%(asctime)s: %(message)s', datefmt='%d/%m/%Y %I:%M %p')

In [3]:
def extract_ensembleid_from_dbxrefs(row):
    """Return the first Ensembl id found in a pipe-delimited cross-reference string.

    Parameters
    ----------
    row : str
        Cross-reference text such as
        ``"MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410"``. Any non-string
        value (for example NaN from a missing cell) is treated as having no
        cross-references.

    Returns
    -------
    str
        The text between the first and second ``:`` of the first entry that
        contains ``"Ensembl"``, or ``""`` when no usable entry exists.
    """
    if not isinstance(row, str):
        return ""
    for xref in row.split("|"):
        if "Ensembl" not in xref:
            continue
        parts = xref.split(":")
        # An entry without a ':' carries no identifier; keep looking.
        if len(parts) > 1:
            return parts[1]
    return ""

def extract_hgncid_from_dbxrefs(row):
    """Return the first HGNC id found in a pipe-delimited cross-reference string.

    Parameters
    ----------
    row : str
        Cross-reference text such as ``"MIM:138670|HGNC:HGNC:5"``. Any
        non-string value (for example NaN from a missing cell) is treated as
        having no cross-references.

    Returns
    -------
    str
        ``"HGNC:<number>"`` built from the first entry containing ``"HGNC"``,
        or ``""`` when no usable entry exists. Both the three-part form
        (``HGNC:HGNC:5``, ``AllianceGenome:HGNC:5``) and the two-part form
        (``HGNC:5``) are accepted.
    """
    if not isinstance(row, str):
        return ""
    for xref in row.split("|"):
        if "HGNC" not in xref:
            continue
        parts = xref.split(":")
        if len(parts) > 2:
            # Three-part entries keep the number in the third field.
            return "HGNC:" + str(parts[2])
        if len(parts) == 2:
            # Two-part entries ("HGNC:5") keep it in the second field.
            return "HGNC:" + str(parts[1])
    return ""

def convert_cytoband2coord(row, cytobands):
    """Fill in start/end/strand from a cytoband table when a row has no start coordinate.

    Parameters
    ----------
    row : pandas.Series
        Must expose ``start``, ``chromosome`` and ``map_location``. Rows whose
        ``start`` is greater than 0 are returned untouched.
    cytobands : pandas.DataFrame
        Must have ``chromosome`` (values like ``"chr1"``), ``cytoband``,
        ``start_pos`` and ``end_pos`` columns. It is only read, never modified.

    Returns
    -------
    pandas.Series
        The same ``row``. When exactly one band on the row's chromosome has
        ``<chromosome without "chr"><cytoband>`` equal to ``row.map_location``,
        ``start`` and ``end`` are set from that band and ``strand`` is set to
        ``1.0``. Otherwise the row is left as it was.
    """
    if row.start > 0:
        return row

    chrom = str(row.chromosome)
    on_chrom = cytobands[cytobands.chromosome == "chr" + chrom]
    # Every row of `on_chrom` has chromosome == "chr" + chrom, so stripping
    # "chr" from that column equals stripping it from `chrom`. Building the
    # labels as a separate Series avoids writing into a filtered slice.
    labels = chrom.replace("chr", "") + on_chrom.cytoband
    hits = on_chrom[labels == row.map_location]

    if len(hits) == 1:
        row['start'] = hits['start_pos'].iloc[0]
        row['end'] = hits['end_pos'].iloc[0]
        row['strand'] = 1.0
    return row

def get_geneSymbol_locations(
    ensembl_genes_json="/Users/tushar/Downloads/homo_sapiens_genes.json",
    ncbi_gene_info="/Users/tushar/Downloads/Homo_sapiens.gene_info",
    cytoband_table="/Users/tushar/pdx/update-data/cytoBand.txt",
):
    """Build a gene-symbol to genomic-location lookup table.

    Parameters
    ----------
    ensembl_genes_json : str
        Path to a JSON file readable by ``pandas.read_json`` that provides the
        ``id``, ``name``, ``seq_region_name``, ``strand``, ``start``, ``end``,
        ``coord_system`` and ``synonyms`` fields.
    ncbi_gene_info : str
        Path to a tab-separated file with at least the ``Symbol``,
        ``Synonyms``, ``chromosome``, ``map_location``, ``GeneID`` and
        ``dbXrefs`` columns.
    cytoband_table : str
        Path to a headerless tab-separated file whose five columns are read as
        chromosome, start_pos, end_pos, cytoband and info.

    The defaults are the locations used when this notebook was written; pass
    explicit paths to run it elsewhere.

    Returns
    -------
    pandas.DataFrame
        Columns ``Symbol``, ``Synonyms``, ``chromosome``, ``strand``,
        ``start``, ``end``, ``GeneID`` and ``ensembl_id``. Rows without a
        ``start`` are dropped and only the first row per ``Symbol`` is kept.
    """
    Reference = pd.read_json(ensembl_genes_json)
    Reference = Reference[["id","name", "seq_region_name", "strand", "start", "end", "coord_system", "synonyms"]]
    Reference = pd.concat([Reference, Reference.coord_system.apply(pd.Series)], axis=1).drop("coord_system", axis=1)
    # Taken by position, not by label: column 1 is the gene "name" selected
    # above. NOTE(review): the expanded coord_system presumably adds another
    # column labelled "name" as well - confirm against the JSON schema.
    Reference["symbol"] = Reference.iloc[:,1]
    Reference = Reference[Reference["symbol"].notna()]
    # "version" is expected to come from the expanded coord_system column.
    Reference = Reference[Reference["version"] == "GRCh38"]

    NCBI_ref = pd.read_csv(ncbi_gene_info, sep='\t')
    NCBI_ref['ensembl_id'] = NCBI_ref.dbXrefs.apply(extract_ensembleid_from_dbxrefs)
    # No hgnc_id column is derived here: it is not among the columns kept below.
    NCBI_ref = NCBI_ref[["Symbol", "Synonyms", "chromosome", "map_location", "GeneID", "ensembl_id"]]

    cyto2coordinates = pd.read_csv(cytoband_table, sep='\t', names=["chromosome", "start_pos", "end_pos", "cytoband", "info"])
    cyto2coordinates = cyto2coordinates[cyto2coordinates.cytoband.notna()]

    # Left join keeps every NCBI gene; genes without coordinates fall back
    # to a cytoband lookup row by row.
    GeneSymbol_Locations = NCBI_ref.merge(Reference, left_on="ensembl_id", right_on="id", how="left")
    GeneSymbol_Locations = GeneSymbol_Locations.apply(convert_cytoband2coord, cytobands=cyto2coordinates, axis=1)
    GeneSymbol_Locations = GeneSymbol_Locations[["Symbol", "Synonyms", "chromosome", "strand", "start", "end", "GeneID", "ensembl_id"]]
    GeneSymbol_Locations = GeneSymbol_Locations[GeneSymbol_Locations.start.notna()]
    GeneSymbol_Locations = GeneSymbol_Locations.drop_duplicates(subset=['Symbol'])
    return GeneSymbol_Locations

In [6]:
def get_location_from_synonym(row, GeneSymbol_Locations):
    """Fill a row's location fields by looking its ``gene_id`` up among gene synonyms.

    Parameters
    ----------
    row : pandas.Series
        Must contain ``Symbol`` and ``gene_id``. Rows whose ``Symbol`` is not
        missing are returned untouched.
    GeneSymbol_Locations : pandas.DataFrame
        Lookup table with a pipe-delimited ``Synonyms`` column plus ``Symbol``,
        ``chromosome``, ``strand``, ``start``, ``end``, ``GeneID`` and
        ``ensembl_id``.

    Returns
    -------
    pandas.Series
        The same ``row``. When ``gene_id`` equals one whole pipe-delimited
        token of some ``Synonyms`` value, the seven fields above are copied
        from the first such table row. Otherwise the row is left as it was.
    """
    if not pd.isna(row['Symbol']):
        return row

    # Wrapping both sides in the delimiter turns "is gene_id one of the
    # tokens" into a literal substring test. No regular expression is
    # involved, so characters such as "(" or "." in gene_id are harmless.
    needle = "|" + str(row["gene_id"]) + "|"
    delimited = "|" + GeneSymbol_Locations.Synonyms.fillna("").astype(str) + "|"
    hits = GeneSymbol_Locations.loc[delimited.str.contains(needle, regex=False, na=False)]

    if len(hits) > 0:
        # With several candidates the first one wins.
        for column in ('Symbol', 'chromosome', 'strand', 'start', 'end', 'GeneID', 'ensembl_id'):
            row[column] = hits[column].iloc[0]
    return row

def process_RSEM(sample_id, GeneSymbol_Locations):
    """Annotate one sample's raw RSEM table with gene locations and write its expression sheet.

    Reads ``raw_exp/PDMR_expression_<sample_id>.tsv`` and the column template
    ``expression_template-sheet.tsv``. It left-joins the raw table to
    ``GeneSymbol_Locations`` on ``gene_id`` == ``Symbol`` and passes every row
    through ``get_location_from_synonym``. The result is copied into the
    template's columns and written to
    ``expression/PDMR_expression_<sample_id>.tsv``, keeping only rows whose
    ``chromosome`` is not the empty string. Nothing is returned.
    """
    expression = pd.read_csv("raw_exp/PDMR_expression_"+sample_id+".tsv", sep="\t")
    sheet = pd.read_csv("expression_template-sheet.tsv", sep="\t")

    annotated = expression.merge(GeneSymbol_Locations, how='left', left_on='gene_id', right_on='Symbol')
    annotated = annotated.apply(get_location_from_synonym, axis=1, GeneSymbol_Locations=GeneSymbol_Locations)
    kept_columns = ['gene_id','expected_count','TPM','FPKM','chromosome','strand','start','end','GeneID','ensembl_id']
    # Missing values become '' so that unlocated genes can be filtered out below.
    annotated = annotated[kept_columns].fillna('')

    sheet["symbol"] = annotated["gene_id"]
    sheet["sample_id"] = sample_id
    # (template column, annotated column), in the order the columns are filled.
    column_map = (
        ("rnaseq_tpm", "TPM"),
        ("rnaseq_fpkm", "FPKM"),
        ("rnaseq_count", "expected_count"),
        ("chromosome", "chromosome"),
        ("strand", "strand"),
        ("ncbi_gene_id", "GeneID"),
        ("ensembl_gene_id", "ensembl_id"),
        ("seq_start_position", "start"),
        ("seq_end_position", "end"),
    )
    for sheet_column, source_column in column_map:
        sheet[sheet_column] = annotated[source_column]
    sheet["platform_id"] = "expression_RNASEQ_Illumina_NGS"

    has_location = sheet["chromosome"] != ""
    sheet[has_location].to_csv("expression/PDMR_expression_"+sample_id+".tsv", sep="\t", index=False)

def fetch_rnaseq_RSEM(URL, sample_id, GeneSymbol_Locations):
    """Process the raw RSEM table for ``sample_id``.

    Only ``process_RSEM`` is called. The download, raw-file merge and cleanup
    steps are commented out below, so ``URL`` is currently unused.
    NOTE(review): presumably the raw files under ``raw_exp/`` were produced by
    an earlier run with those steps enabled - confirm before a fresh run.
    """
    #PDMR_domain = "https://pdmdb.cancer.gov/"
    #URL = PDMR_domain + URL #PDMR_rnaseq_sheet["RSEM(genes)"].iloc[0]
    #response = requests.get(URL)
    #open("gene_rnaseq.tsv", "wb").write(response.content)
    #merge_raw_RSEM_files(sample_id)
    process_RSEM(sample_id, GeneSymbol_Locations)
    #remove("gene_rnaseq.tsv")

def merge_raw_RSEM_files(sample_id):
    """Tag ``gene_rnaseq.tsv`` with ``sample_id`` and save the per-sample raw table.

    Reads ``gene_rnaseq.tsv`` from the working directory, sets a ``sample_id``
    column to the given value, and writes the ``sample_id``, ``gene_id``,
    ``TPM``, ``FPKM`` and ``expected_count`` columns to
    ``raw_exp/PDMR_expression_<sample_id>.tsv`` (tab-separated, no index).
    Nothing is returned.
    """
    downloaded = pd.read_csv("gene_rnaseq.tsv", sep="\t")
    tagged = downloaded.assign(sample_id=sample_id)
    wanted = ['sample_id', 'gene_id', 'TPM', 'FPKM', 'expected_count']
    destination = "raw_exp/PDMR_expression_"+sample_id+".tsv"
    tagged[wanted].to_csv(destination, sep="\t", index=False)

def PDMR_RNASEQ(path, mol_meta_sample_path, GeneSymbol_Locations):
    """Process RNA-seq expression for every sheet row whose sample is in the sample sheet.

    Parameters
    ----------
    path : str
        CSV file with the ``RSEM(genes)``, ``Patient ID``, ``Specimen ID`` and
        ``Sample ID`` columns.
    mol_meta_sample_path : str
        Tab-separated file with a ``sample_id`` column.
    GeneSymbol_Locations : pandas.DataFrame
        Lookup table handed through to ``fetch_rnaseq_RSEM``.

    For each row the sample id is ``<Patient ID>-<Specimen ID>``, with
    ``-<Sample ID>`` appended unless ``Sample ID`` is ``ORIGINATOR``. The row
    is processed when that id occurs as a literal substring of any
    ``sample_id`` value in the sample sheet. Nothing is returned.
    """
    PDMR_rnaseq_sheet = pd.read_csv(path, header=0)
    sample_sheet = pd.read_csv(mol_meta_sample_path, sep="\t")

    records = zip(
        PDMR_rnaseq_sheet["RSEM(genes)"],
        PDMR_rnaseq_sheet["Patient ID"],
        PDMR_rnaseq_sheet["Specimen ID"],
        PDMR_rnaseq_sheet["Sample ID"],
    )
    # zip has no length, so tell tqdm the total to keep the progress bar.
    for URL, patient, specimen, sample in tqdm(records, total=PDMR_rnaseq_sheet.shape[0]):
        sample_id = str(patient)+"-"+str(specimen)
        if str(sample) != "ORIGINATOR":
            sample_id = sample_id+"-"+str(sample)
        # regex=False: the id is plain text, not a pattern.
        # na=False: a missing sample_id cell counts as "no match" instead of
        # putting NaN into the boolean mask.
        in_sample_sheet = sample_sheet["sample_id"].str.contains(sample_id, regex=False, na=False)
        if in_sample_sheet.any():
            log.info("Fetching expression data for "+sample_id + ".")
            fetch_rnaseq_RSEM(URL, sample_id, GeneSymbol_Locations)

In [5]:
# Inputs for the PDMR run. Both are absolute paths on the author's machine and
# must be adjusted before running elsewhere.
PDMR_rnaseq_sheet_path = '/Users/tushar/pdx/update-data/data-repo/PCMEXPORT_09062022/rnasequence.csv'
mol_meta_sample_path = '/Users/tushar/pdx/pdxfinder-data/data/UPDOG/PDMR/PDMR_molecular_metadata-sample.tsv'
# Build the gene location lookup once; it is reused for every sample.
GeneSymbol_Locations = get_geneSymbol_locations()
log.info("Fetching RNASeq RSEM data from PDMR...\n")

In [7]:
# Long-running cell: the recorded run below took about 1h47m for 4431 sheet rows.
PDMR_RNASEQ(PDMR_rnaseq_sheet_path, mol_meta_sample_path, GeneSymbol_Locations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 4431/4431 [1:47:06<00:00,  1.45s/it]  


In [7]:
# Same call as the previous cell; the recorded run below was stopped by a
# KeyboardInterrupt after 22 of 4431 sheet rows.
PDMR_RNASEQ(PDMR_rnaseq_sheet_path, mol_meta_sample_path, GeneSymbol_Locations)

  after removing the cwd from sys.path.
  0%|          | 22/4431 [1:10:24<235:09:12, 192.01s/it]


KeyboardInterrupt: 

In [9]:
import os

In [11]:
# Root directory with one sub-directory per data provider (absolute path on
# the author's machine; the trailing "/" is relied on when paths are built).
updog = "/Users/tushar/pdx/pdxfinder-data/data/UPDOG/"
# Number of directory entries; this count includes any non-provider entries
# such as ".DS_Store".
len(os.listdir(updog))

28

In [28]:
# Gather the sharing contacts of every provider directory under `updog`.
# Each provider's frame is collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
contact_frames = []
for provider in os.listdir(updog):
    if provider == ".DS_Store":
        continue
    sharing_path = os.path.join(updog, provider, provider + "_metadata-sharing.tsv")
    # Rows 1-4 of the file are skipped; row 0 is used as the header.
    contact = pd.read_csv(sharing_path, sep='\t', skiprows=[1,2,3,4])
    contact = contact.assign(provider=provider)
    contact_frames.append(contact[['provider','email', 'name', 'europdx_access_modality']])
# Each provider keeps its own row index, as it did with the append-based loop.
# pd.concat refuses an empty list, so fall back to an empty frame.
contact_sheet = pd.concat(contact_frames) if contact_frames else pd.DataFrame()
contact_sheet

Unnamed: 0,provider,email,name,europdx_access_modality
0,VHIO-PC,jarribas@vhio.net; earenas@vhio.net,Joaquin Arribas; Enrique Arenas,transnational access
1,VHIO-PC,jarribas@vhio.net; earenas@vhio.net,Joaquin Arribas; Enrique Arenas,transnational access
2,VHIO-PC,jarribas@vhio.net; earenas@vhio.net,Joaquin Arribas; Enrique Arenas,transnational access
3,VHIO-PC,jarribas@vhio.net; earenas@vhio.net,Joaquin Arribas; Enrique Arenas,transnational access
4,VHIO-PC,jarribas@vhio.net; earenas@vhio.net,Joaquin Arribas; Enrique Arenas,transnational access
...,...,...,...,...
634,IRCC-CRC,andrea.bertotti@ircc.it,andrea bertotti,transnational access
635,IRCC-CRC,andrea.bertotti@ircc.it,andrea bertotti,transnational access
636,IRCC-CRC,andrea.bertotti@ircc.it,andrea bertotti,transnational access
637,IRCC-CRC,andrea.bertotti@ircc.it,andrea bertotti,transnational access


In [29]:
# Show the distinct contact rows. The result is not stored, so contact_sheet
# itself still holds the duplicates.
contact_sheet.drop_duplicates()

Unnamed: 0,provider,email,name,europdx_access_modality
0,VHIO-PC,jarribas@vhio.net; earenas@vhio.net,Joaquin Arribas; Enrique Arenas,transnational access
0,HCMI,ocg@mail.nih.gov,,
0,UOM-BC,robert.clarke@manchester.ac.uk,Robert Clarke,transnational access
0,CMP,depmap@sanger.ac.uk,,
0,PIVOT,RLock@ccia.org.au,Richard Lock,
64,PIVOT,HoughtonP@uthscsa.edu,Peter J Houghton,
67,PIVOT,"patrick.reynolds@ttuhsc.edu,maris@email.chop.edu","Reynolds, Maris",
108,PIVOT,xli@luriechildrens.org,Li Xiao-Nan,
205,PIVOT,RGorlick@mdanderson.org,Richard Gorlick,
0,Curie-BC,elisabetta.marangoni@curie.fr,Elisabetta Marangoni,transnational access


In [32]:
# Write the distinct contact rows to a tab-separated file in the working
# directory, without the index column.
contact_sheet.drop_duplicates().to_csv("Contact_sheet.tsv", sep='\t', index=False)