Single-sample: Convert rsem_genes.results output to normalized unique TPM Hugo.
Uses [sample id].conf file to determine the paths to the sample directory & reference files

Last updated: May 2, 2018

Changelog:
2018-05-02 : Sum the TPM values of combined duplicate genes, rather than taking the mean. See issue #130.
2017-12-05 : Average genes together in TPM space, not log2 TPM space; the latter is incorrect. (later changed to summing genes)

Input : 
 - conf.json
 - rsem_genes.results file for this sample: c["file"]["rsem_genes.results"]
 
Input reference files:
 - c["ref_file"]["rsem_genes.results_header"]
 - c["ref_file"]["ensembl_hugo_mapping_file"]
 - c["ref_file"]["ensembl_id_list"]

Output :
 - Files:
    - rsem.genes.tpm.hugo.tab
    - rsem.genes.tpm.hugo.log2plus1.dedupe.tab

 - JSON:
  - 1.json. Keys:
    - tpm_hugo_norm_uniq = rsem.genes.tpm.hugo.log2plus1.dedupe.tab
   

Process:

- TPM column extracted
- ensembl genes translated to hugo
 -  File written as rsem.genes.tpm.hugo.tab
- hugo genes summed and uniqued in TPM space
- log2(n+1) normalization applied
  - File written as rsem.genes.tpm.hugo.log2plus1.dedupe.tab


In [None]:
import pandas as pd
import numpy as np
import os
import csv
import json
import logging

# Setup: load conf, retrieve sample ID, logging
with open("conf.json", "r") as conf:
    c = json.load(conf)
sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

# Logging options come straight from the configuration file
logging.basicConfig(**c["info"]["logging_config"])
logging.info("\n1: Convert rsem_genes.results output to normalized unique TPM Hugo.")


def and_log(s):
    """Log ``s`` at INFO level and return it unchanged.

    Returning the message lets a caller log and print it in a single
    expression, e.g. ``print(and_log(msg))``.
    """
    logging.info(s)
    return s


# JSON output
j = {}

# Check input file
if not os.path.exists(c["file"]["rsem_genes.results"]):
    # Parenthesized single-argument print behaves the same as the Python 2
    # print statement and is also valid Python 3.
    print(and_log("Error: can't find required input file {}".format(c["file"]["rsem_genes.results"])))
    # NOTE(review): KeyboardInterrupt is presumably used to halt the notebook
    # run at this cell - confirm before changing the exception type.
    raise KeyboardInterrupt

In [None]:

################### Base parameters ###########################

# String constants
rgr_TPM_colname = "TPM"        # column of rsem_genes.results to extract
ensembl_hugo_NA_key = "NA"     # mapping-file value meaning "no Hugo name"

# Reference files

# Header entries from rsem_genes.results, 1 per line, first line is gene_id
rgr_header = pd.read_csv(
    c["ref_file"]["rsem_genes.results_header"],
    index_col=0,
).index

# Tab-separated mapping file. First column: Hugo name (ensembl_hugo_NA_key,
# ie "NA", if none). Second column: Ensembl ID. The dictionary is keyed by
# Ensembl ID.
with open(c["ref_file"]["ensembl_hugo_mapping_file"], mode="r") as infile:
    ensembl_hugo_mapping_dict = {
        fields[1]: fields[0]
        for fields in csv.reader(infile, delimiter="\t")
    }


In [None]:
### Functions ###

def ensembl_to_hugo(exp, mapping_dict, NA_key):
    """Relabel the rows of ``exp`` from Ensembl IDs to Hugo names.

    Args:
        exp: dataframe of TPM columns whose row labels are Ensembl IDs.
        mapping_dict: dictionary mapping Ensembl ID -> Hugo name.
        NA_key: the mapped label that marks rows to discard (genes with
            no Hugo name).

    Returns:
        A new dataframe (``exp`` itself is left untouched) whose row labels
        are Hugo names, without any row labelled ``NA_key``. Duplicate Hugo
        labels are kept as-is.

    Raises:
        KeyError: if a row label of ``exp`` is not a key of ``mapping_dict``.
    """
    new_index_exp = exp.copy()
    new_index_exp.index = exp.index.map(lambda x: mapping_dict[x])
    # Filter with a boolean mask instead of drop(NA_key): drop() raises
    # KeyError when no row carries the NA label, which is a valid input.
    keep_rows = new_index_exp.index != NA_key
    return new_index_exp.loc[keep_rows]

def normalize_log2(exp):
    """Return ``exp`` with every value replaced by log2(value + 1).

    Takes   : dataframe of TPM columns
    Returns : dataframe with the same labels, normalized by log2(tpm + 1)
    """
    def _log2_plus_one(values):
        return np.log2(values + 1)

    return exp.apply(_log2_plus_one)


def uniquify_genes(exp):
    """Collapse rows that share a label into one row holding their sum.

    Takes   : dataframe of TPM columns
    Returns : dataframe with one row per distinct row label; the values of
              rows that had the same label are summed together
    """
    rows_by_label = exp.groupby(by=exp.index)
    summed = rows_by_label.sum()
    return summed

In [None]:
### Setup ###

# Load all the TPM columns into a dataframe.
# Complain if there was any mismatch in row names (resulting in NaN values)

# Ensembl IDs, 1 per line, first line is gene_id
ensembl_ids_df = pd.read_csv(c["ref_file"]["ensembl_id_list"], index_col=0)  # Set up the ensembl IDs to use
tpm_arr = [ensembl_ids_df]

genesresults = pd.read_csv(c["file"]["rsem_genes.results"], delimiter="\t", index_col=0)
# Refuse to continue if the column headers differ from the reference header
if not genesresults.columns.equals(rgr_header):
    raise Exception("{} headers don't match expected".format(sample_id))
# Keep only the TPM column, renamed to the sample ID
tpm_arr.append(genesresults[rgr_TPM_colname].rename(sample_id))


### The initial dataframe of TPM columns ###
# Column-wise concat aligns on row labels; labels missing from one of the
# inputs show up as NaN, which the check below catches.
tpm_df = pd.concat(tpm_arr, axis=1)

if tpm_df.isnull().values.any():
    # Parenthesized single-argument print behaves the same as the Python 2
    # print statement and is also valid Python 3.
    print(and_log("Ensembl IDs didn't match! Please check input files"))
    raise KeyboardInterrupt

In [None]:
### Main ###

# Step 1: relabel rows from ensembl IDs to hugo names, discarding rows
# whose mapped name is the NA key
tpm_hugo_df = ensembl_to_hugo(
    tpm_df,
    ensembl_hugo_mapping_dict,
    ensembl_hugo_NA_key,
)

# Step 2: make gene labels unique by summing together the values from
# duplicate labels (done in TPM space, before the log transform)
tpm_hugo_uniq_df = uniquify_genes(tpm_hugo_df)

# Step 3: normalize by log_2(tpm+1)
tpm_hugo_norm_uniq_df = normalize_log2(tpm_hugo_uniq_df)

In [None]:
# Render every value as a string with up to 12 significant digits so the
# JSON and CSV outputs carry the same text
tpm_hugo_norm_uniq_df_asString = tpm_hugo_norm_uniq_df.applymap(
    lambda value: "%.12g" % value
)

# Serialize column-wise with pandas, parse the result back into plain
# Python objects, and store it in our JSON object
j["tpm_hugo_norm_uniq"] = json.loads(
    tpm_hugo_norm_uniq_df_asString.to_json(orient="columns")
)



In [None]:
### Output ###

# Announce the three destinations before writing anything
print("Writing TPM Hugo: {}".format(c["file"]["tpm_hugo"]))
print("Writing normalized unique TPM Hugo: {}".format(c["file"]["tpm_hugo_norm_uniq"]))
print("Writing JSON: {}".format(c["json"]["1"]))


# Hugo-labelled TPM values (before de-duplication and log normalization),
# tab-separated, floats written with up to 12 significant digits
tpm_hugo_df.to_csv(c["file"]["tpm_hugo"], sep="\t",
                   header=True, index_label="Gene", float_format="%.12g")


# Normalized unique values; already formatted as strings, so no float_format
tpm_hugo_norm_uniq_df_asString.to_csv(c["file"]["tpm_hugo_norm_uniq"], sep="\t",
                                      header=True, index_label="Gene")

# JSON object accumulated in j
with open(c["json"]["1"], "w") as jsonfile:
    json.dump(j, jsonfile, indent=2)

# Parenthesized single-argument print behaves the same as the Python 2
# print statement and is also valid Python 3.
print("Done!")