In [1]:
import requests, sys
from os import listdir
from os.path import isfile, join, isdir, exists
import pandas as pd
import logging
import pickle
from tqdm.notebook import tqdm, trange

# Notebook-wide logger. basicConfig installs a file handler on the root logger,
# so INFO-and-above records are written to 'Lift_from_esembl_id.log'.
# NOTE(review): per the logging docs, later basicConfig() calls are ignored once the
# root logger has handlers, so the per-provider basicConfig calls further down
# do not redirect output to their own files.
log = logging.getLogger(__name__)
logging.basicConfig(filename='Lift_from_esembl_id.log', level=logging.INFO, format='%(levelname)s:%(asctime)s: %(message)s', datefmt='%d/%m/%Y %I:%M %p')

In [11]:
def get_location_from_ENSEMBL(ensembl_id):
    """Look up the GRCh38 coordinates of an Ensembl ID via the Ensembl REST API.

    Parameters
    ----------
    ensembl_id : str
        Identifier appended to the ``/lookup/id/`` endpoint.

    Returns
    -------
    list
        ``[ensembl_id, chromosome, start, end, strand]`` when the response
        describes a ``homo_sapiens`` feature on assembly ``GRCh38`` and carries
        all four coordinate fields; otherwise ``[ensembl_id, "", "", "", ""]``.
        Network, HTTP and JSON-decoding failures are logged, never raised.
    """
    ext = "/lookup/id/"+ensembl_id+"?"
    server = "https://rest.ensembl.org"
    try:
        # The timeout keeps one stalled connection from hanging a whole batch.
        r = requests.get(server+ext, headers={"Content-Type": "application/json"}, timeout=30)
        # Raises requests.HTTPError for any 4xx/5xx status.
        r.raise_for_status()
        decoded = r.json()
    except (requests.RequestException, ValueError):
        # Only request/decoding problems are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        log.info("Request failed for ID: "+ensembl_id)
        return [ensembl_id, "", "", "", ""]

    coordinate_keys = ("seq_region_name", "start", "end", "strand")
    if (isinstance(decoded, dict)
            and decoded.get("species") == "homo_sapiens"
            and decoded.get("assembly_name") == "GRCh38"
            and all(key in decoded for key in coordinate_keys)):
        return [ensembl_id] + [decoded[key] for key in coordinate_keys]

    log.info("No coordinates for ID: "+ ensembl_id)
    return [ensembl_id, "", "", "", ""]


In [19]:
# Root of the UPDOG data tree; its sub-directories are treated as provider folders below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
home = "/Users/tushar/pdx/pdxfinder-data/data/UPDOG"

def get_dirs(path):
    """Return the names (not full paths) of the directories directly under ``path``."""
    subdirs = []
    for entry in listdir(path):
        if isdir(join(path, entry)):
            subdirs.append(entry)
    return subdirs
def get_files(path, files=None):
    """Recursively collect the paths of all regular files below ``path``.

    Parameters
    ----------
    path : str
        Directory to walk.
    files : list, optional
        Accumulator that found paths are appended to (mutated in place).
        A fresh list is created when omitted.

    Returns
    -------
    list
        The accumulator, holding ``join``-ed paths of every file found.
    """
    if files is None:
        files = []
    for entry in listdir(path):
        full_path = join(path, entry)
        if isfile(full_path):
            files.append(full_path)
        elif isdir(full_path):
            # Only descend into real directories; dangling symlinks and other
            # special entries are skipped instead of crashing listdir().
            get_files(full_path, files)
    return files
def flatten_list(list_of_list):
    """Concatenate the items of every inner iterable into one flat list (one level deep)."""
    flattened = []
    for inner in list_of_list:
        flattened.extend(inner)
    return flattened

In [3]:
def get_gene_ids(path, total_gene_ids):
    """Collect the ``ensembl_gene_id`` values of every ``.tsv`` file below ``path``.

    For each file, missing values are replaced by ``'****'``; when a ``Field``
    column exists, rows whose ``Field`` starts with ``#`` are removed and the
    column is dropped. The file's unique gene ids (as strings) are appended to
    ``total_gene_ids`` as one list, i.e. the argument is mutated in place.

    Returns the de-duplicated, flattened content of ``total_gene_ids``
    (including whatever the caller had already put in it).
    """
    for tsv_path in get_files(path, []):
        if not tsv_path.endswith('.tsv'):
            continue
        table = pd.read_csv(tsv_path, sep='\t', na_values="", low_memory=False).fillna('****')
        if 'Field' in table.columns:
            is_comment = table.Field.str.startswith('#')
            table = table.loc[is_comment != True,].reset_index(drop=True)
            table = table.drop('Field', axis=1)
        gene_column = table.ensembl_gene_id.to_numpy().flatten().astype(str)
        total_gene_ids.append(list(pd.unique(gene_column)))
    return list(set(flatten_list(total_gene_ids)))

In [4]:
def construct_gene_ids(home):
    """Gather the unique, version-less Ensembl ids used in all providers' cna files.

    Every directory under ``home`` is treated as a provider; providers without
    a ``cna`` sub-directory are skipped. Only ids containing ``"ENS"`` are kept,
    and anything after the first ``.`` (the version suffix) is stripped.

    Returns a list of unique ids (order is not defined).
    """
    total_gene_ids = list()
    for provider in get_dirs(home): ## get_dirs will get the provider dirs in updog
        cna_path = join(home, provider, 'cna')
        if exists(cna_path):
            log.info("Working on provider: "+provider)
            # get_gene_ids appends its per-file id lists to total_gene_ids in
            # place, so its (cumulative) return value must not be appended
            # again - that only duplicated everything once per provider.
            get_gene_ids(cna_path, total_gene_ids)
    ensembl = "ENS"
    unique_ids = set(flatten_list(total_gene_ids))
    return list({sample.split(".")[0] for sample in unique_ids if ensembl in sample})

def convert_geneList_to_df(flat_gene_list):
    """Query Ensembl for each id and return the results as a list of rows.

    Despite the name, the result is a plain list with one
    ``get_location_from_ENSEMBL`` result per input id (in input order),
    not a DataFrame.
    """
    return [get_location_from_ENSEMBL(gene_id) for gene_id in flat_gene_list]

In [17]:
# Scan every provider once and cache the resulting id list on disk so the
# scan does not have to be repeated (the next cell reloads it).
# NOTE(review): absolute, machine-specific output path.
flat_gene_list = construct_gene_ids(home)
with open('/Users/tushar/pdx/update-data/outfile', 'wb') as fp:
    pickle.dump(flat_gene_list, fp)
# Per-id REST lookup, currently disabled:
#df = convert_geneList_to_df(flat_gene_list)

In [5]:
# Reload the cached gene-id list written by the previous cell.
# NOTE(review): pickle.load executes arbitrary code from the file - only load files you created yourself.
with open ('/Users/tushar/pdx/update-data/outfile', 'rb') as fp:
    flat_gene_list = pickle.load(fp)

In [2]:
# Gene table loaded from a local JSON dump; later cells read its id, name, seq_region_name,
# strand, start, end, coord_system, synonyms and xrefs columns.
# NOTE(review): absolute, machine-specific path.
Reference = pd.read_json("/Users/tushar/Downloads/homo_sapiens_genes.json")

In [15]:
Reference

Unnamed: 0,source,seq_region_synonyms,seq_region_name,analysis_display,strand,ensembl_object_type,id,transcripts,version,name,...,analysis,end,biotype,so_term,coord_system,xrefs,start,previous_ids,synonyms,is_haplotype
0,havana,"[{'db': 'INSDC', 'id': 'CM000682.2'}, {'db': '...",20,Havana,1,gene,ENSG00000275852,"[{'seq_region_synonyms': [{'db': 'INSDC', 'id'...",1,LINC01742,...,havana_homo_sapiens,58004648,lncRNA,ncRNA_gene,"{'seq_length': 64444167, 'version': 'GRCh38', ...","[{'display_id': 'ENSG00000275852', 'primary_id...",58003904,,,
1,ensembl_havana,"[{'db': 'INSDC', 'id': 'CM000685.2'}, {'db': '...",X,Ensembl/Havana merge,-1,gene,ENSG00000224440,"[{'previous_ids': ['ENST00000593386'], 'seq_re...",2,CXorf51A,...,ensembl_havana_gene_homo_sapiens,146814744,protein_coding,protein_coding_gene,"{'seq_length': 156040895, 'version': 'GRCh38',...","[{'display_id': 'ENSG00000224440', 'primary_id...",146814106,[ENSG00000268774],[CXorf51],
2,havana,"[{'db': 'ensembl_internal_synonym', 'id': 'HSC...",CHR_HSCHR19KIR_ABC08_AB_HAP_T_P_CTG3_1,Havana,1,gene,ENSG00000276974,[{'seq_region_synonyms': [{'db': 'ensembl_inte...,1,,...,havana_homo_sapiens,54704235,unprocessed_pseudogene,pseudogene,"{'seq_length': 58617636, 'version': 'GRCh38', ...","[{'display_id': 'ENSG00000276974', 'primary_id...",54703952,,,1.0
3,havana,"[{'db': 'INSDC', 'id': 'CM000678.2'}, {'db': '...",16,Havana,-1,gene,ENSG00000261140,"[{'seq_region_synonyms': [{'db': 'INSDC', 'id'...",1,,...,havana_homo_sapiens,2571936,lncRNA,ncRNA_gene,"{'seq_length': 90338345, 'version': 'GRCh38', ...","[{'display_id': 'ENSG00000261140', 'primary_id...",2569043,,,
4,havana,"[{'db': 'INSDC', 'id': 'CM000664.2'}, {'db': '...",2,Havana,1,gene,ENSG00000236211,"[{'seq_region_synonyms': [{'db': 'INSDC', 'id'...",1,MTCO1P7,...,havana_homo_sapiens,130275449,unprocessed_pseudogene,pseudogene,"{'seq_length': 242193529, 'version': 'GRCh38',...","[{'display_id': 'ENSG00000236211', 'primary_id...",130273908,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69335,havana,"[{'db': 'INSDC', 'id': 'CM000664.2'}, {'db': '...",2,Havana,1,gene,ENSG00000228784,"[{'seq_region_synonyms': [{'db': 'INSDC', 'id'...",9,LINC00954,...,havana_homo_sapiens,19885047,lncRNA,ncRNA_gene,"{'seq_length': 242193529, 'version': 'GRCh38',...","[{'display_id': 'ENSG00000228784', 'primary_id...",19868860,,[FLJ12334],
69336,havana,"[{'db': 'INSDC', 'id': 'CM000664.2'}, {'db': '...",2,Havana,-1,gene,ENSG00000231336,"[{'seq_region_synonyms': [{'db': 'INSDC', 'id'...",1,,...,havana_homo_sapiens,46167978,lncRNA,ncRNA_gene,"{'seq_length': 242193529, 'version': 'GRCh38',...","[{'display_id': 'ENSG00000231336', 'primary_id...",46166789,,,
69337,havana_tagene,"[{'db': 'INSDC', 'id': 'CM000669.2'}, {'db': '...",7,Havana TAGENE,-1,gene,ENSG00000287093,"[{'seq_region_synonyms': [{'db': 'INSDC', 'id'...",1,,...,havana_tagene_homo_sapiens,24795496,lncRNA,ncRNA_gene,"{'seq_length': 159345973, 'version': 'GRCh38',...","[{'display_id': 'ENSG00000287093', 'primary_id...",24778444,,,
69338,ensembl_havana,"[{'db': 'INSDC', 'id': 'CM000671.2'}, {'db': '...",9,Ensembl/Havana merge,1,gene,ENSG00000119509,"[{'seq_region_synonyms': [{'db': 'INSDC', 'id'...",13,INVS,...,ensembl_havana_gene_homo_sapiens,100302175,protein_coding,protein_coding_gene,"{'seq_length': 138394717, 'version': 'GRCh38',...","[{'display_id': 'ENSG00000119509', 'primary_id...",100099243,,[NPHP2],


In [17]:
# NOTE(review): in the recorded run this read failed with a ValueError, so
# `probesets` was never created; nothing else in the visible notebook uses it.
probesets = pd.read_json("/Users/tushar/pdx/update-data/homo_sapiens_probesets.json")

ValueError: Unmatched ''"' when when decoding 'string'

In [4]:
# Raw HGNC table (tab-separated). The same file is re-read and trimmed to the
# approved entries in a later cell, which rebinds `hgnc2ensembl`.
hgnc2ensembl = pd.read_csv("/Users/tushar/pdx/update-data/HGNC_2_ENSEMBL.txt", sep='\t')


In [14]:
# Load the NCBI gene_info table and derive ensembl_id / hgnc_id from its dbXrefs column.
# NOTE(review): extract_ensembleid_from_dbxrefs / extract_hgncid_from_dbxrefs are defined in a
# cell further down, so on a fresh kernel run top-to-bottom this cell raises NameError.
NCBI_ref = pd.read_csv("/Users/tushar/Downloads/Homo_sapiens.gene_info",sep='\t')
NCBI_ref['ensembl_id'] = NCBI_ref.dbXrefs.apply(lambda x: extract_ensembleid_from_dbxrefs(x))
NCBI_ref['hgnc_id'] = NCBI_ref.dbXrefs.apply(lambda x: extract_hgncid_from_dbxrefs(x))
NCBI_ref

Unnamed: 0,tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type,ensembl_id,hgnc_id
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20220605,-,ENSG00000121410,HGNC:5
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20220605,-,ENSG00000175899,HGNC:7
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000256069|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20220513,-,ENSG00000256069,HGNC:8
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20220522,-,ENSG00000171428,HGNC:7645
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20220522,-,ENSG00000156006,HGNC:7646


In [18]:
NCBI_ref

Unnamed: 0,tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type,ensembl_id,hgnc_id
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20220605,-,ENSG00000121410,HGNC:5
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20220605,-,ENSG00000175899,HGNC:7
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000256069|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20220513,-,ENSG00000256069,HGNC:8
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20220522,-,ENSG00000171428,HGNC:7645
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20220522,-,ENSG00000156006,HGNC:7646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75486,741158,8923215,trnD,-,-,-,MT,-,tRNA-Asp,tRNA,-,-,-,-,20200909,-,,
75487,741158,8923216,trnP,-,-,-,MT,-,tRNA-Pro,tRNA,-,-,-,-,20200909,-,,
75488,741158,8923217,trnA,-,-,-,MT,-,tRNA-Ala,tRNA,-,-,-,-,20200909,-,,
75489,741158,8923218,COX1,-,-,-,MT,-,cytochrome c oxidase subunit I,protein-coding,-,-,-,cytochrome c oxidase subunit I,20200909,-,,


In [5]:
hgnc2ensembl.head()

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Status,Previous symbols,Alias symbols,Chromosome,Accession numbers,RefSeq IDs,Ensembl ID(supplied by Ensembl),NCBI Gene ID(supplied by NCBI),Ensembl gene ID,Locus type,Locus group,NCBI Gene ID
0,HGNC:2781,DFN8,"deafness, X-linked 8",Entry Withdrawn,,,reserved,,,,,,phenotype only,phenotype,
1,HGNC:48698,ENPP7P6,ectonucleotide pyrophosphatase/phosphodiestera...,Approved,,,8p23.1,,,ENSG00000255549,107133511.0,ENSG00000255549,pseudogene,pseudogene,
2,HGNC:2821,DFNB15,"symbol withdrawn, see [HGNC:18183](/data/gene-...",Symbol Withdrawn,,,3 or 19,,,,,,phenotype only,phenotype,
3,HGNC:12740,WBS2,Williams-Beuren syndrome type 2,Entry Withdrawn,WBS,WS,4q33-q35.1,,,,,,phenotype only,phenotype,
4,HGNC:109,ACHM1,"symbol withdrawn, see [HGNC:2153](/data/gene-s...",Symbol Withdrawn,RMCH,,14,,,,,,phenotype only,phenotype,


In [16]:
Reference[Reference.id == 'ENSG00000279719']['xrefs'].to_list()

[[{'display_id': 'ENSG00000279719',
   'primary_id': 'ENSG00000279719',
   'info_text': '',
   'info_type': 'DIRECT',
   'db_display': 'Expression Atlas',
   'description': None,
   'dbname': 'ArrayExpress'}]]

In [11]:
# HGNC -> Ensembl lookup table: keep four columns, rename three of them to
# hgnc_id / symbol / ensembl_id, then keep only rows whose Status is "Approved".
# Resulting column order: hgnc_id, symbol, ensembl_id, Status.
hgnc2ensembl = pd.read_csv("/Users/tushar/pdx/update-data/HGNC_2_ENSEMBL.txt", sep='\t')
hgnc2ensembl = hgnc2ensembl[['HGNC ID', "Approved symbol","Ensembl gene ID", "Status"]].rename({"HGNC ID":'hgnc_id',"Ensembl gene ID":'ensembl_id',"Approved symbol":'symbol'},axis=1)
hgnc2ensembl = hgnc2ensembl[hgnc2ensembl["Status"]=="Approved"]

In [12]:
def extract_ensembleid_from_dbxrefs(row):
    """Return the Ensembl id from a ``|``-separated dbXrefs string.

    ``row`` is one cell value such as
    ``"MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410"``. The value after the
    ``Ensembl:`` tag of the first such entry is returned; ``""`` is returned
    when there is no Ensembl entry or ``row`` is not a string (e.g. NaN).
    """
    if not isinstance(row, str):
        return ""
    for xref in row.split("|"):
        parts = xref.split(":")
        # Require the tag itself to be "Ensembl" and a value to follow it.
        if parts[0] == "Ensembl" and len(parts) > 1:
            return parts[1]
    return ""
def extract_hgncid_from_dbxrefs(row):
    """Return the HGNC id (formatted ``"HGNC:<number>"``) from a dbXrefs string.

    ``row`` is one ``|``-separated cell value. The first entry carrying an
    ``HGNC`` tag is used, whether it is written ``HGNC:HGNC:5``,
    ``AllianceGenome:HGNC:5`` or the two-part ``HGNC:5``. ``""`` is returned
    when there is no such entry or ``row`` is not a string (e.g. NaN).
    """
    if not isinstance(row, str):
        return ""
    for xref in row.split("|"):
        parts = xref.split(":")
        # The id is the last component; an HGNC tag must precede it.
        if "HGNC" in parts[:-1]:
            return "HGNC:"+str(parts[-1])
    return ""
def find_missing(row, hgnc2ensembl):
    """Fill an empty ``ensembl_id`` of ``row`` from the HGNC lookup table.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``ensembl_id`` and ``hgnc_id``; modified in place.
    hgnc2ensembl : pandas.DataFrame
        Lookup table with ``hgnc_id`` and ``ensembl_id`` columns.

    Returns
    -------
    pandas.Series
        ``row``; ``ensembl_id`` is replaced only when it was ``""`` and the
        table holds a non-null Ensembl id for the row's ``hgnc_id``.
    """
    if row.ensembl_id == "" and row.hgnc_id != "":
        # Address the column by name rather than by position so the lookup
        # does not depend on the column order of the table; skip null ids so
        # "" is never overwritten with NaN.
        candidates = hgnc2ensembl.loc[hgnc2ensembl.hgnc_id == row.hgnc_id, 'ensembl_id'].dropna()
        if len(candidates) > 0:
            row['ensembl_id'] = candidates.iloc[0]
    return row
# Build the NCBI reference table: load gene_info, derive ensembl_id / hgnc_id from
# dbXrefs, then let find_missing fill empty ensembl_id values from hgnc2ensembl.
# The last step applies a Python function row by row (axis=1).
NCBI_ref = pd.read_csv("/Users/tushar/Downloads/Homo_sapiens.gene_info",sep='\t')
NCBI_ref['ensembl_id'] = NCBI_ref.dbXrefs.apply(lambda x: extract_ensembleid_from_dbxrefs(x))
NCBI_ref['hgnc_id'] = NCBI_ref.dbXrefs.apply(lambda x: extract_hgncid_from_dbxrefs(x))
NCBI_ref = NCBI_ref.apply(find_missing, hgnc2ensembl=hgnc2ensembl, axis=1)

In [13]:
NCBI_ref.head()

Unnamed: 0,tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type,ensembl_id,hgnc_id
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20220605,-,ENSG00000121410,HGNC:5
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20220605,-,ENSG00000175899,HGNC:7
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000256069|AllianceGe...,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20220513,-,ENSG00000256069,HGNC:8
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20220522,-,ENSG00000171428,HGNC:7645
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156...,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20220522,-,ENSG00000156006,HGNC:7646


In [10]:
# Keep the coordinate-related gene columns and expand the coord_system dicts into
# their own columns (later cells read a 'seq_name' column from the result).
subset = Reference[["id","name", "seq_region_name", "strand", "start", "end", "coord_system", "synonyms"]]
subset = pd.concat([subset, subset.coord_system.apply(pd.Series)], axis=1).drop("coord_system", axis=1)
# Genes that actually occur in the providers' cna files.
matched_genes = subset.loc[subset["id"].isin(flat_gene_list)]
# NCBI GeneID -> Ensembl coordinates; left join keeps NCBI genes without an Ensembl match.
ncbi_ensemble = NCBI_ref[["GeneID", "Symbol", "ensembl_id", "chromosome","map_location"]].rename({'ensembl_id':'id'}, axis=1).merge(subset, on='id', how='left')
# Caching of matched_genes is disabled here; the next cell loads a previously written copy.
#with open('/Users/tushar/pdx/update-data/outfile_json', 'wb') as fp:
#    pickle.dump(matched_genes, fp)

In [8]:
# Load a previously pickled matched_genes frame; this rebinds the value computed
# from `subset` in the cell above.
# NOTE(review): pickle.load executes arbitrary code from the file - only load files you created yourself.
with open ('/Users/tushar/pdx/update-data/outfile_json', 'rb') as fp:
   matched_genes = pickle.load(fp)

In [9]:
def lift_genomic_coordinates(path, matched_genes):
    """Rewrite every ``.tsv`` below ``path`` with coordinates taken from ``matched_genes``.

    Each file is left-joined to ``matched_genes`` on the version-less
    ``ensembl_gene_id``; ``chromosome``, ``seq_start_position`` and
    ``seq_end_position`` are overwritten from ``seq_name``, ``start`` and
    ``end``, and a ``strand`` column is inserted at position 2. The file is
    overwritten in place with its original columns plus ``strand``.
    """
    for tsv_path in get_files(path, []):
        if not tsv_path.endswith('.tsv'):
            continue
        table = pd.read_csv(tsv_path, sep='\t', na_values="", low_memory=False)
        # Output layout: the file's own columns with 'strand' added as third column.
        output_columns = table.columns.insert(2,'strand')
        table['id'] = table.ensembl_gene_id.str.split('.', expand=True)[0].astype(str)
        lifted = table.merge(matched_genes, on='id', how='left')
        lifted[['chromosome', 'seq_start_position','seq_end_position']] = lifted[['seq_name','start','end']]
        lifted[output_columns].to_csv(tsv_path,sep='\t', index=False)

In [18]:
def find_gene_id(row, ncbi_ref):
    match = ncbi_ref[ncbi_ref.Symbol.str.lower() == row.loc['symbol'].lower()]
    if len(match)>0:
        row.ncbi_gene_id = match.iloc[0,1]
    else:
        pattern = "(^|\|)"+row.loc['symbol'].lower()+"($|\|)"
        synonym_match = ncbi_ref.loc[ncbi_ref['Synonyms'].str.lower().str.contains(pattern)]
        if len(synonym_match) == 1:
            row.ncbi_gene_id = synonym_match.iloc[0,1]
    return row

def find_ensembl_id_from_synonym(row, gene_synonyms):
    match = gene_synonyms[gene_synonyms.iloc[:,1].str.lower() == row.loc['symbol'].lower()]
    if len(match)>0:
        match = match.reset_index(drop=True)
        row.chromosome, row.seq_start_position,row.seq_end_position, row.ensembl_gene_id = match['seq_name'][0],match['start'][0],match['end'][0], match['id'][0]
    else:
        pattern = "(^|,)"+row.loc['symbol'].lower()+"($|,)"
        synonym_match = gene_synonyms.loc[gene_synonyms['synonyms'].str.lower().str.contains(pattern)]
        if len(synonym_match) == 1:
            synonym_match = synonym_match.reset_index(drop=True)
            row.chromosome, row.seq_start_position,row.seq_end_position, row.ensembl_gene_id = synonym_match['seq_name'][0],synonym_match['start'][0],synonym_match['end'][0], synonym_match['id'][0]
    return row

def lift_cna_data(file, ncbi_ensemble, gene_synonyms, log2):
    """Lift one cna ``.tsv`` to genomic coordinates via NCBI gene ids; rewrites ``file``.

    Steps:
      1. Read the file; remember its columns (with ``strand`` inserted at
         position 2 when absent) as the output layout.
      2. For paths containing ``"PIVOT"``, swap ``ensembl_gene_id`` and
         ``ncbi_gene_id``.
      3. Fill missing ``ncbi_gene_id`` values by symbol (``find_gene_id``).
      4. Left-join ``ncbi_ensemble`` on the gene id and copy ``seq_name``,
         ``start``, ``end`` and ``id`` into the output columns.
      5. Try ``find_ensembl_id_from_synonym`` for rows still without coordinates.
      6. Log the symbols that could not be lifted and overwrite the file.

    Parameters
    ----------
    file : str
        Path of the tab-separated file to rewrite in place.
    ncbi_ensemble : pandas.DataFrame
        Needs ``GeneID``, ``seq_name``, ``start``, ``end``, ``id`` and ``strand``.
    gene_synonyms : pandas.DataFrame
        Passed through to ``find_ensembl_id_from_synonym``.
    log2 : logging.Logger
        Receives the "could not lift" summary.

    NOTE(review): step 3 reads the notebook-level ``NCBI_ref`` global rather
    than a parameter, so that frame must exist before this is called.
    """
    metadata = pd.read_csv(file, sep='\t', na_values="", low_memory=False)
    if 'strand' not in metadata.columns:
        cols = metadata.columns.insert(2,'strand')
    else:
        cols = metadata.columns
        # Move the existing strand out of the way so the merged 'strand' from
        # ncbi_ensemble is the one written out.
        metadata = metadata.rename({'strand':'strand_temp'},axis=1)
    if "PIVOT" in file:
        # Swap the two id columns for PIVOT files.
        metadata['temp'] = metadata['ensembl_gene_id']
        metadata['ensembl_gene_id'] = metadata['ncbi_gene_id']
        metadata['ncbi_gene_id'] = metadata['temp']
    if len(metadata[metadata.ncbi_gene_id.isna()])>0:
        metadata[metadata.ncbi_gene_id.isna()] = metadata[metadata.ncbi_gene_id.isna()].apply(find_gene_id, ncbi_ref=NCBI_ref, axis=1)
    metadata['GeneID'] = metadata['ncbi_gene_id']
    merged = metadata.merge(ncbi_ensemble, on='GeneID', how='left')
    merged[['chromosome', 'seq_start_position','seq_end_position', 'ensembl_gene_id']] = merged[['seq_name','start','end', 'id']]
    missed = merged[['chromosome','seq_start_position']].isnull().any(axis=1)
    if len(merged[missed])>0:
        merged[missed] = merged[missed].apply(find_ensembl_id_from_synonym, gene_synonyms=gene_synonyms, axis=1)
    # Re-check the OUTPUT columns: the synonym rescue fills chromosome /
    # seq_start_position, not seq_region_name / start, so checking the latter
    # reported rescued rows as not lifted.
    missed = merged[merged[['chromosome', 'seq_start_position']].isnull().any(axis=1)].symbol
    if len(missed)>0:
        # astype(str) keeps the join from failing on missing (NaN) symbols.
        log2.info("Could not lift "+str(len(missed))+" out of "+str(len(merged))+" data points with gene symbol: [\'"+ "\', \'".join(missed.astype(str))+'\']')
    merged[cols].to_csv(file,sep='\t', index=False)

def lift_genomic_coordinates_ncbi(path, ncbi_ensemble, gene_synonyms, log2):
    """Run ``lift_cna_data`` on every ``.tsv`` below ``path``.

    Files whose path contains ``PIVOT_cna_ALL-02.tsv`` or
    ``PIVOT_cna_ALL-16.tsv`` are skipped. Each processed path is printed and
    logged to ``log2`` before it is lifted.
    """
    skipped_names = ("PIVOT_cna_ALL-02.tsv", "PIVOT_cna_ALL-16.tsv")
    for candidate in get_files(path, []):
        if not candidate.endswith('.tsv'):
            continue
        if any(name in candidate for name in skipped_names):
            continue
        print("Lifting: "+candidate)
        log2.info("Lifting: "+candidate)
        lift_cna_data(candidate, ncbi_ensemble, gene_synonyms, log2)

In [86]:
# Lift the cna files of the selected providers by joining on the Ensembl gene id.
# Providers without a 'cna' directory are skipped.
provider_list = ["CRL"]
for provider in get_dirs(home): ## get_dirs will get the provider dirs in updog
    if provider in provider_list:
        cna_path= join(home, provider, 'cna')
        if exists(cna_path):
            log.info("Working on provider: "+provider)
            lift_genomic_coordinates(cna_path, matched_genes)

In [21]:
# Lift the cna files of the selected providers via NCBI gene ids, with a
# name/synonym fallback for genes that cannot be resolved that way.
provider_list = ["IRCC-GC"]
# Genes with complete coordinate information; synonyms are flattened from a
# list into one ", "-separated string for text matching.
gene_synonyms = subset[['id','name', 'synonyms', 'seq_name', 'strand', 'start', 'end']].dropna(axis=0).reset_index(drop=True)
gene_synonyms['synonyms'] = gene_synonyms.synonyms.apply(lambda x: ', '.join(x))
log_format = logging.Formatter('%(levelname)s:%(asctime)s: %(message)s', datefmt='%d/%m/%Y %I:%M %p')
for provider in get_dirs(home): ## get_dirs will get the provider dirs in updog
    if provider in provider_list:
        # logging.basicConfig() is ignored once the root logger has handlers
        # (it was configured in the first cell), so the per-provider file was
        # never written. Give each provider its own logger + FileHandler.
        log2 = logging.getLogger(provider+'_cna')
        log2.setLevel(logging.INFO)
        if not log2.handlers:
            # Guard against stacking duplicate handlers when the cell is re-run.
            provider_handler = logging.FileHandler(provider+'_cna.log')
            provider_handler.setFormatter(log_format)
            log2.addHandler(provider_handler)
        log2.info("\n\nWorking on provider: "+provider)
        cna_path= join(home, provider, 'cna')
        if exists(cna_path):
            lift_genomic_coordinates_ncbi(cna_path, ncbi_ensemble, gene_synonyms, log2)

Lifting: /Users/tushar/pdx/pdxfinder-data/data/UPDOG/IRCC-GC/cna/IRCC-GC_cna.tsv


  import sys


In [75]:
# Cytoband table (header-less, tab-separated): chromosome, start_pos, end_pos, cytoband, info.
# Rows without a cytoband label are dropped.
cyto2coordinates = pd.read_csv("/Users/tushar/pdx/update-data/cytoBand.txt", sep='\t', names=["chromosome", "start_pos", "end_pos", "cytoband", "info"])
cyto2coordinates = cyto2coordinates[cyto2coordinates.cytoband.isna() == False]

def convert_cytoband2coord(row, cytobands):
    """Translate a row's start/end cytobands into sequence positions.

    The bands of chromosome ``'chr' + str(row.chromosome)`` are searched for
    ``row.chromosome_band_start`` and ``row.chromosome_band_end``. Only when
    each of them matches exactly one band are ``seq_start_position`` (the start
    band's ``start_pos``) and ``seq_end_position`` (the end band's ``end_pos``)
    written to ``row``. The (possibly modified) row is returned.
    """
    chrom_bands = cytobands[cytobands.chromosome == 'chr'+str(row.chromosome)]

    end_hits = chrom_bands[chrom_bands.cytoband == row.chromosome_band_end]
    if len(end_hits) != 1:
        return row
    start_hits = chrom_bands[chrom_bands.cytoband == row.chromosome_band_start]
    if len(start_hits) != 1:
        return row

    first_position = start_hits['start_pos'].iloc[0]
    last_position = end_hits['end_pos'].iloc[0]
    row['seq_start_position'] = first_position
    row['seq_end_position'] = last_position
    return row

def convert_cytobands(file_path, cytobands, log3):
    """Replace the cytoband columns of one ``.tsv`` by sequence coordinates; rewrites the file.

    Parameters
    ----------
    file_path : str
        Tab-separated file with ``chromosome``, ``chromosome_band_start`` and
        ``chromosome_band_end`` columns; overwritten in place.
    cytobands : pandas.DataFrame
        Band table handed to ``convert_cytoband2coord``.
    log3 : logging.Logger
        Receives a summary of rows that could not be converted.

    The output keeps the file's columns minus the two band columns; an empty
    ``strand`` column is inserted at position 2 when the file has none. Files
    that no longer carry the band columns are left untouched.
    """
    metadata = pd.read_csv(file_path, sep='\t', na_values="", low_memory=False)
    band_columns = ["chromosome_band_start", "chromosome_band_end"]
    if any(column not in metadata.columns for column in band_columns):
        # Nothing to convert (e.g. the file was already rewritten by an earlier
        # run); previously this case crashed.
        log3.info("Skipping "+file_path+": cytoband columns not present.")
        return
    if 'strand' not in metadata.columns:
        cols = metadata.columns.insert(2,'strand').drop(band_columns)
        metadata['strand'] = ''
    else:
        # 'cols' used to be left undefined on this path, so the final write
        # raised a NameError for files that already had a strand column.
        cols = metadata.columns.drop(band_columns)
    metadata = metadata.apply(convert_cytoband2coord, cytobands=cytobands, axis=1)
    missed = metadata[metadata[['chromosome','seq_start_position']].isnull().any(axis=1)]
    if len(missed)>0:
        log3.info("Could not lift "+str(len(missed))+" out of "+str(len(metadata))+" data points.")
    metadata[cols].to_csv(file_path,sep='\t', index=False)

def fetch_LIH(path, cytobands):
    """Convert the cytoband columns of every ``.tsv`` below ``path`` (provider LIH).

    Progress is printed and written to ``LIH_cna.log``; each file is handed to
    ``convert_cytobands`` together with ``cytobands``.
    """
    # logging.basicConfig() is ignored once the root logger has handlers, so
    # attach an explicit FileHandler to a dedicated logger instead.
    log3 = logging.getLogger('LIH_cna')
    log3.setLevel(logging.INFO)
    if not log3.handlers:
        # Guard against stacking duplicate handlers when called repeatedly.
        lih_handler = logging.FileHandler('LIH_cna.log')
        lih_handler.setFormatter(logging.Formatter('%(levelname)s:%(asctime)s: %(message)s', datefmt='%d/%m/%Y %I:%M %p'))
        log3.addHandler(lih_handler)
    log3.info("\n\nWorking on provider: LIH")
    for f in get_files(path, []):
        if not f.endswith('.tsv'):
            continue
        print("Lifting: "+f)
        # Was log2 - a global created by an unrelated cell, which raises a
        # NameError on a fresh kernel.
        log3.info("Lifting: "+f)
        convert_cytobands(f, cytobands, log3)

# Convert the LIH cna files; note that they are read from the data_quarantine tree,
# not from `home`.
LIH_path = '/Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/'
fetch_LIH(LIH_path, cyto2coordinates)

Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T361.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T407.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T188.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T158.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T239.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_P13.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T238.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T16.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T101.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_T470.tsv
Lifting: /Users/tushar/pdx/pdxfinder-data/data_quarantine/UPDOG/LIH/cna/LIH_cna_P3.tsv
Lifting: /Users/tushar/pd

Remove the rows that could not be lifted for PIVOT, PMLB, IRCC-GC

In [45]:
def drop_not_lifted_rows(path):
    """Remove rows without a ``chromosome`` from every ``.tsv`` below ``path``.

    Each file is overwritten in place with the rows whose ``chromosome`` is
    set. All removed rows are gathered and written to ``<path>/_dropped.tsv``,
    and the number of rows removed per file is logged.

    NOTE(review): ``_dropped.tsv`` lands inside ``path`` and ends in ``.tsv``,
    so later runs (and the lifting functions) will pick it up as an input file.
    """
    tsv_files = [f for f in get_files(path, []) if f.endswith('.tsv')]
    dropped_frames = []
    if len(tsv_files)>0:
        for f in tsv_files:
            metadata = pd.read_csv(f, sep='\t', na_values="", low_memory=False)
            not_lifted = metadata.chromosome.isnull()
            dropped = metadata[not_lifted]
            log.info("Dropped %s rows from %s" %(str(dropped.shape[0]), f))
            metadata[~not_lifted].to_csv(f,sep='\t', index=False)
            dropped_frames.append(dropped)
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids re-copying the accumulated frame for every file.
        missing = pd.concat(dropped_frames, ignore_index=True)
        missing.to_csv(join(path, '_dropped.tsv'),sep='\t', index=False)

In [46]:
# Drop the rows without coordinates for the selected providers.
# NOTE(review): the heading above this section names PIVOT, PMLB and IRCC-GC, while this
# list contains CRL, PMLB and PIVOT - confirm which set is intended.
provider_list = ["CRL", "PMLB", "PIVOT"]
for provider in get_dirs(home): ## get_dirs will get the provider dirs in updog
    if provider in provider_list:
        log.info("Provider: "+ provider+'\n')
        drop_not_lifted_rows(join(home, provider, 'cna'))