In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

In [None]:
#### Reverse negative strand coordinates in gene locations table ####
# Load the gene-body table and rename its coordinate columns to chr/start/end.
df_geneLoc = (
  pd.read_csv('source/refGene_hg19_genebody_fixed.bed', sep='\t')
  .rename(columns={'chromosome':'chr', 'tss':'start', 'tes':'end'})
)

# Swap start and end on every row whose strand is not '+', in one vectorized
# step instead of a row-by-row loop. Both replacement columns are computed from
# the unmodified frame before assign() is called, so neither swap sees the
# other's result.
is_plus_strand = df_geneLoc['strand'] == '+'
df_geneLoc = df_geneLoc.assign(
  start=df_geneLoc['start'].where(is_plus_strand, df_geneLoc['end']),
  end=df_geneLoc['end'].where(is_plus_strand, df_geneLoc['start']),
)

# Persist the result; main() reads this file back by default.
df_geneLoc.to_csv('geneLoc.csv', index=False)

In [None]:
#### main function ####
from numpy import log

def combine_target_gene_info(gene_locations, cell_TPM, id_table):
  """Annotate each gene in ``gene_locations`` with its Ensembl ID and ln(TPM + 1).

  ``gene_locations`` is modified in place; nothing is returned.

  Args:
    gene_locations: DataFrame with a ``gene`` column. Matches are written to
      its ``Ensembl_ID`` and ``lnTPM`` columns; rows without a match keep
      whatever those columns already hold.
    cell_TPM: DataFrame with a string ``gene_id`` column and a numeric ``TPM``
      column. A row matches when its ``gene_id`` contains the Ensembl ID as a
      literal substring; the first matching row supplies the TPM.
    id_table: DataFrame with ``human.External.Gene.Name`` and
      ``human.Ensembl.Gene.ID`` columns. When a gene name occurs more than
      once, the first row is used.
  """
  # Build the gene name -> Ensembl ID lookup once instead of re-filtering the
  # ID table for every gene. setdefault keeps the first row per name, matching
  # a "first match wins" lookup; missing names can never match and are dropped.
  name_to_id = {}
  for name, ensembl_id in zip(id_table['human.External.Gene.Name'],
                              id_table['human.Ensembl.Gene.ID']):
    if pd.notna(name):
      name_to_id.setdefault(name, ensembl_id)

  tpm_gene_ids = cell_TPM['gene_id']
  tpm_values = cell_TPM['TPM']

  # Snapshot the (index, gene name) pairs so the frame can be written to while
  # looping.
  for index, gene_name in list(gene_locations['gene'].items()):
    gene_id = name_to_id.get(gene_name)

    # Skip genes that are absent from the ID table or whose ID is missing
    # (NaN); a non-string ID cannot be searched for in the TPM table.
    if not isinstance(gene_id, str):
      continue

    gene_locations.at[index, 'Ensembl_ID'] = gene_id

    # regex=False: the ID is matched literally, not as a pattern.
    # na=False: TPM rows with a missing gene_id count as non-matches instead
    # of putting NaN into the mask.
    # NOTE(review): this is still one full scan of the TPM table per gene.
    is_match = tpm_gene_ids.str.contains(gene_id, regex=False, na=False)

    # check if geneID is in TPM table
    if not is_match.any():
      continue

    TPM = tpm_values[is_match].iloc[0]
    gene_locations.at[index, 'lnTPM'] = log(TPM+1)

def main(cell_TPM_file, outfile, gene_id_file='geneID.csv', gene_loc_file='geneLoc.csv'):
  """Build the target-gene table for one TPM file and write it as CSV.

  Args:
    cell_TPM_file: Path to a tab-separated TPM table.
    outfile: Path of the CSV file to write (no index column).
    gene_id_file: Path to the gene ID table CSV. Defaults to 'geneID.csv'.
    gene_loc_file: Path to the gene locations CSV. Defaults to 'geneLoc.csv'.
  """
  df_cellTPM = pd.read_csv(cell_TPM_file, sep='\t')

  df_geneID = pd.read_csv(gene_id_file)
  df_geneLoc = pd.read_csv(gene_loc_file)

  # Placeholder values; rows that combine_target_gene_info does not update
  # keep them in the output.
  df_geneLoc['Ensembl_ID'] = 'N/A'
  df_geneLoc['lnTPM'] = float('-inf')

  # Fills Ensembl_ID and lnTPM in place.
  combine_target_gene_info(df_geneLoc, df_cellTPM, df_geneID)

  df_geneLoc.to_csv(outfile, index=False)

In [None]:
# Build one target-gene table per TPM file: (input TPM table, output CSV).
for tpm_path, out_path in (
  ('adrenal_gland_TPM.tsv', 'adrenal_gland_targetGene_all.csv'),
  ('HepG2_TPM.tsv', 'HepG2_targetGene_all.csv'),
  ('K562_TPM.tsv', 'K562_targetGene_all.csv'),
):
  main(tpm_path, out_path)