In [2]:
import requests, sys
from os import listdir
from os.path import isfile, join, isdir, exists
import pandas as pd
import logging
import pickle
from tqdm.notebook import tqdm, trange

# Module-level logger; every record at INFO or above is written to a log file
# (relative path, so it lands in the current working directory).
log = logging.getLogger(__name__)
logging.basicConfig(
    filename='CRL_expression_map_symbols_to_probe_id.log',
    level=logging.INFO,
    format='%(levelname)s:%(asctime)s: %(message)s',
    datefmt='%d/%m/%Y %I:%M %p',
)

In [3]:
# Root directory that is searched recursively for expression TSV sheets.
home = "/Users/tushar/pdx/update-data/data-repo/CRL_dropped_rows/CRL/expression/microarray/"

# Expression template sheet: blanks become empty strings and the "platform"
# column is exposed as "platform_id".
template = "/Users/tushar/pdx/pdxfinder-data/template/active_templates/expression/expression_template-sheet.tsv"
template = (
    pd.read_csv(template, sep='\t', na_values="", low_memory=False)
    .fillna("")
    .rename(columns={"platform": "platform_id"})
)

# Probe lookup table used to fill in missing symbols; blanks become empty strings.
lookup_table = "/Users/tushar/pdx/update-data/data-repo/CRL_dropped_rows/Probe_symbol_lookup_table.tsv"
lookup_table = (
    pd.read_csv(lookup_table, sep='\t', na_values="", low_memory=False)
    .fillna("")
)
def get_dirs(path):
    """Return the names (not full paths) of the directories directly inside ``path``."""
    subdirs = []
    for entry in listdir(path):
        if isdir(join(path, entry)):
            subdirs.append(entry)
    return subdirs
def get_files(path, files=None):
    """Recursively collect the paths of all regular files below ``path``.

    Parameters
    ----------
    path : str
        Directory to walk.
    files : list, optional
        Accumulator the found paths are appended to (mutated in place).
        A fresh list is created when omitted.

    Returns
    -------
    list
        The accumulator, holding ``join``-ed paths of every file found.
        Entries are visited in ``listdir`` order, descending into each
        sub-directory as it is encountered.
    """
    if files is None:
        files = []
    for entry in listdir(path):
        full_path = join(path, entry)
        if isfile(full_path):
            files.append(full_path)
        elif isdir(full_path):
            get_files(full_path, files)
        # Anything else (e.g. a dangling symlink) is neither a file nor a
        # directory; recursing into it would make listdir() raise, so skip it.
    return files
def flatten_list(list_of_list):
    """Flatten one level of nesting: concatenate the items of every inner iterable."""
    flattened = []
    for inner in list_of_list:
        flattened.extend(inner)
    return flattened

In [4]:
# Every file found recursively under `home`, restricted to the .tsv sheets.
tsv_files = [
    candidate
    for candidate in get_files(home, [])
    if candidate.endswith('.tsv')
]

In [5]:
def preprocess_file(f, template, platform_id="expression_Affymetrix_HGU133plus2"):
    """Load one expression TSV and reshape it to the template's column layout.

    Parameters
    ----------
    f : str
        Path of the tab-separated file to read.
    template : pandas.DataFrame
        Frame whose columns define which columns are kept and in what order.
    platform_id : str, optional
        Value written to the ``platform_id`` column of every row.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding exactly ``template.columns``, with missing
        values replaced by ``""`` and ``"chr"`` removed from ``chromosome``.

    Raises
    ------
    KeyError
        If a template column is absent from the file after renaming.
    """
    data = (
        pd.read_csv(f, sep='\t', na_values="")
        .rename({"hgnc_symbol": "symbol", "transcript_id": "ensembl_transcript_id"}, axis=1)
        .fillna("")
    )
    data["platform_id"] = platform_id
    # Copy the column selection explicitly: assigning into a bare slice is the
    # chained-assignment pattern that triggers SettingWithCopyWarning.
    data = data[template.columns].copy()
    data["chromosome"] = data["chromosome"].replace({"chr": ""}, regex=True)
    return data
def match_with_lookup_table(row, lookup):
    """Fill in a row's missing symbol from a probe lookup table.

    Parameters
    ----------
    row : pandas.Series
        Record with ``symbol``, ``affy_hgea_probe_id`` and ``ensembl_gene_id``
        entries. It is modified in place when a match is found.
    lookup : pandas.DataFrame
        Table with a ``PROBEID`` column. Values are read positionally from
        columns 2 and 1 of the first matching row.

    Returns
    -------
    pandas.Series
        The same ``row``. It is unchanged when ``symbol`` is already set or no
        lookup row has a matching ``PROBEID``.
    """
    if row["symbol"] == "":
        # Use the table that was passed in rather than a module-level global.
        match = lookup[lookup.PROBEID == row["affy_hgea_probe_id"]]
        if len(match) > 0:
            # NOTE(review): positional access assumes column 2 holds the gene
            # symbol and column 1 the Ensembl gene id - confirm against the
            # lookup table's header.
            row["symbol"] = match.iloc[0, 2]
            row["ensembl_gene_id"] = match.iloc[0, 1]
    return row

In [6]:
# For each TSV: reshape it to the template, fill blank symbols from the lookup
# table, drop rows that still have no symbol, and overwrite the file in place.
log.info("Find missing symbol")
for f in tsv_files:
    log.info("Working on %s", f)
    data = preprocess_file(f, template)
    row_num_before = len(data)

    # Row-wise pass: only rows with an empty symbol are touched by the helper.
    data = data.apply(match_with_lookup_table, axis=1, lookup=lookup_table)
    data = data.loc[data["symbol"] != ""]
    row_num_after = len(data)

    # The source file is replaced with the filtered result.
    data.to_csv(f, sep='\t', index=False)
    log.info("Total rows %s and number of probe ids with gene symbol %s", row_num_before, row_num_after)