# Automated Lead Identification and Druggable Genes & Pathways


Original file:
 afa61d0b064d367898c682970de27b30  identify_leads.R
 from: https://github.com/UCSC-Treehouse/analysis-methods/commit/bf6d5c5e4cfa4db4580203656b4120bda473000e
 
 That file can be found in the rscript dir, alongside identify_leads.R.ipynb_version, which is the version of it embedded in this script.

Calculates automated leads.
Also calculates the count of druggable genes and pathways for the Outlier Analysis Findings table.

#### Input

- step 5: gene lists pc_up and pd_up
- step 7: geneset aggregation and geneset details per list

#### Output

- *automatedLeadsIdentified.tsv* -- a document listing potential leads
- json document containing:
  - automated_leads_identified
  - druggable_genes_count
  - druggable_pathways_count



In [None]:
import os
import json
import tempfile
import pandas as pd
from collections import OrderedDict
import logging

# Setup: read the run configuration, pick out the sample ID, then configure logging
with open("conf.json", "r") as conf:
    c = json.load(conf)

sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

# The logging keyword arguments are taken verbatim from the configuration file
logging_kwargs = c["info"]["logging_config"]
logging.basicConfig(**logging_kwargs)
logging.info("\n8: Identify Automated Leads")
def and_log(s):
    """Log ``s`` at INFO level on the root logger and return it unchanged."""
    logging.log(logging.INFO, s)
    return s

    
# Reference file paths named in the configuration
ref_files = c["ref_file"]
curated_pathways_druggable_genes = ref_files["curated_pathways_druggable_genes"]
druggable_genes_by_category = ref_files["druggable_genes_by_category"]

# JSON results of steps 5 and 7, parsed into OrderedDicts so key order is kept
json_paths = c["json"]
with open(json_paths["5"], "r") as jf:
    j5 = json.load(jf, object_pairs_hook=OrderedDict)
with open(json_paths["7"], "r") as jf:
    j7 = json.load(jf, object_pairs_hook=OrderedDict)

# Accumulator for this step's JSON output, and the path of the TSV result file
j = {}
output = c["file"]["automated_leads_identified"]

print("\nUsing reference files:\n    {}\n    {}".format(
    curated_pathways_druggable_genes, druggable_genes_by_category))
print("\nWriting result file {}".format(output))

In [None]:
# Set up the workdir (a fresh temp directory under the configured temp dir);
# the tmp input files for the R script are written into it below.
workdir = tempfile.mkdtemp(dir=c["dir"]["temp"])
# Parenthesised so the line is valid under both Python 2 and Python 3;
# with a single argument the printed text is the same in either.
print(workdir)

# Genelists - only the PC-up and PD-up lists are written out.
# For consistency with the previous implementation:
#  - the order from the json is not preserved (they get saved alphabetically),
#  - values are loaded as float64 so that 0s are written as 0.0 instead of 0
genelists = j5["genelists"]
for genelist in ("pc_up", "pd_up"):
    filepath = os.path.join(workdir, genelist)
    print("writing {}".format(filepath))
    genes_df = pd.DataFrame.from_dict(genelists[genelist], orient="columns", dtype="float64")
    genes_df.to_csv(filepath, sep="\t", index_label="Gene")

# Geneset aggregation and details per list
# The keys here are both the filename and the key within j7;
# the values are what to name the index col
genesets_info = {
    "gene_set_aggregation": "GeneSet",
    "gene_set_details_per_list": "GeneSetName"
}

for geneset in genesets_info:
    filepath = os.path.join(workdir, geneset)
    print("writing {}".format(filepath))
    geneset_df = pd.DataFrame.from_dict(
        j7[geneset],
        orient="columns",
        dtype="str"
    )
    # When writing to file, ensure we preserve the order of the gene sets; from_dict should
    # have done this (since input is an ordered dict) but there is a bug in pandas. Instead,
    # slice it into the order we want, which is the key order of the first column of that set.
    # next(iter(...)) is used instead of .keys()[0] because dict views cannot be indexed
    # on Python 3; on Python 2 it selects the same first column.
    first_column = next(iter(j7[geneset].values()))
    row_order = pd.Index(list(first_column.keys()))
    geneset_df.loc[row_order].to_csv(
        filepath, sep="\t", index_label=genesets_info[geneset])


In [None]:
# Short aliases that keep the R script argument line compact --
# the R code re-reads them straight away under descriptive names
sid, wd, cp, dg, op = (
    sample_id,
    workdir,
    curated_pathways_druggable_genes,
    druggable_genes_by_category,
    output,
)
# Names of the input files inside the workdir
gsa, gsd = "gene_set_aggregation", "gene_set_details_per_list"
pcuf, pduf = "pc_up", "pd_up"

In [None]:
%%script Rscript - "$sid" "$wd" "$cp" "$dg" "$op" "$gsa" "$gsd" "$pcuf" "$pduf"

#### Main ####

# tidyverse is loaded first; read_tsv, write_tsv and the dplyr verbs below come from it
library(tidyverse)

# Positional arguments, in the order they are passed on the Rscript command line
args <- commandArgs(TRUE)

sampleID <- args[1]
workDir <- args[2]
pathwayAnnotationFileName <- args[3]
FDA_DruggableGenesByCategoryFile <- args[4]
outputFile <- args[5]

# Relative paths - files within the workdir
genesetAggregationFile <- args[6]
genesetDetailsPerListFile <- args[7]
genes_pc_up <- args[8]
genes_pd_up <- args[9]

# Relative paths used from here on resolve against the work dir
setwd(workDir)

# Two-column, header-less table: gene set name and its annotation
pathwayAnnotation <- read_tsv(
    pathwayAnnotationFileName,
    col_names = FALSE,
    col_types = "cc")
colnames(pathwayAnnotation) <- c("geneSet", "anno")

FDA_DruggableGenesByCategory <- read_tsv(
    FDA_DruggableGenesByCategoryFile,
    col_types = "cc")


###
### FDA druggable genes
###

# Factor levels follow the order of first appearance in the file
druggableGroups <- FDA_DruggableGenesByCategory$group
FDA_DruggableGenesByCategory$group <- factor(
    druggableGroups,
    levels = unique(druggableGroups))

s <- sampleID

# Read the gene set aggregation table and flag every gene set that is enriched in at
# least one of the lists held in columns 2-4.
# NOTE(review): the lists are selected by column position only; presumably these are the
# enriched_in_comm_up / pc_up / pd_up columns -- confirm against the file layout.
enrichedPathways <- read_tsv(genesetAggregationFile, col_types = "ciiiiccccilcccc")
enrichedPathways$enrichedOutsideTopFivePercent <- rowSums(enrichedPathways[, 2:4]) > 0

# Add k/K (genes in overlap / genes in gene set) as text columns to pathway details
pathwayDetailsRaw <- read_tsv(genesetDetailsPerListFile, col_types = "cicidddidddidddiddd")
geneSetSize <- pathwayDetailsRaw$`N_Genes_in_Gene_Set_(K)`
pathwayDetailsRaw$comm_up_kkText <- paste0(
    pathwayDetailsRaw$`comm_up_N_Genes_in_Overlap_(k)`, "/", geneSetSize)
pathwayDetailsRaw$pc_up_kkText <- paste0(
    pathwayDetailsRaw$`pc_up_N_Genes_in_Overlap_(k)`, "/", geneSetSize)
pathwayDetailsRaw$pd_up_kkText <- paste0(
    pathwayDetailsRaw$`pd_up_N_Genes_in_Overlap_(k)`, "/", geneSetSize)

# Add the k/K text to enriched pathways. The row lookup is identical for all three
# columns, so it is computed once.
detailsRow <- match(enrichedPathways$GeneSet, pathwayDetailsRaw$GeneSetName)
enrichedPathways$comm_up_kkText <- pathwayDetailsRaw$comm_up_kkText[detailsRow]
enrichedPathways$pc_up_kkText <- pathwayDetailsRaw$pc_up_kkText[detailsRow]
enrichedPathways$pd_up_kkText <- pathwayDetailsRaw$pd_up_kkText[detailsRow]


# Add text - eg - "PATHWAYNAME (enriched_in_comm_up (k/K) and pc_up (k/K))"
kkcols <- which(grepl("kkText", colnames(enrichedPathways)))
listCols <- colnames(enrichedPathways)[2:4]
enrichedPathways$textOfEnrichmentList <- NA_character_
# seq_len() rather than 1:nrow(): on an empty table 1:0 would iterate over rows 1 and 0
for (i in seq_len(nrow(enrichedPathways))) {
    # Positions (within columns 2-4) of the lists this gene set is enriched in
    whichSets <- which(enrichedPathways[i, 2:4] == 1)
    t2 <- paste0(listCols[whichSets],
        " (", enrichedPathways[i, kkcols][whichSets], ")")
    # Join the entries; every separator followed by the "enriched_in_" prefix becomes " and "
    enrichedPathways$textOfEnrichmentList[i] <- gsub(
        ", enriched_in_", " and ", paste(t2, collapse = ", "))
}
enrichedPathways$geneSetAndTextOfEnrichmentList <- paste0(
    enrichedPathways$GeneSet, " (", enrichedPathways$textOfEnrichmentList, ")")

# Split the comma-separated gene string into one character vector per gene set.
# NOTE(review): the column is named FDA_druggableGenesInGeneSet but is filled from
# allMemberGenesInThLists -- confirm this is the intended source column.
# NOTE(review): an NA entry splits into a single NA and is therefore counted as 1 below.
enrichedPathways$FDA_druggableGenesInGeneSet <- lapply(
    enrichedPathways$allMemberGenesInThLists, function(x) strsplit(x, ", ")[[1]])
# lengths() returns integer(0) for an empty list, whereas unlist(lapply(x, length))
# returns NULL and would leave the column uncreated.
enrichedPathways$countOfFDA_druggableGenesInGeneSet <- lengths(
    enrichedPathways$FDA_druggableGenesInGeneSet)

# Gene sets with a blank annotation are treated as priority1
unannotated <- is.na(pathwayAnnotation$anno)
pathwayAnnotation$anno[unannotated] <- "geneSets_priority1"

# Carry each gene set's annotation over to the enriched pathways table
annotationRow <- match(enrichedPathways$GeneSet, pathwayAnnotation$geneSet)
enrichedPathways$anno <- pathwayAnnotation$anno[annotationRow]

# Build one frame of up outliers (pc and pd) and mark which genes are druggable
pc_upBySample <- read_tsv(genes_pc_up, col_types = "cdd")
pd_upBySample <- read_tsv(genes_pd_up, col_types = "cdd")
colnames(pc_upBySample)[3] <- "median"
colnames(pd_upBySample)[3] <- "median"

# Outliers might be empty, so the group label column is given matching (zero) length
# in that case instead of a single label value
tagWithGroup <- function(outliers, groupLabel) {
    if (nrow(outliers) == 0) {
        cbind(outliers, outlierComparisonGroup = character())
    } else {
        cbind(outliers, outlierComparisonGroup = groupLabel)
    }
}
pc_withGroup <- tagWithGroup(pc_upBySample, "pc")
pd_withGroup <- tagWithGroup(pd_upBySample, "pd")

upOutliers <- data.frame(rbind(pc_withGroup, pd_withGroup))
colnames(upOutliers) <- c("gene", "log2TPMp1", "groupMedian", "outlierComparisonGroup")

# Look up each outlier gene's druggable category; genes without one are not druggable
druggableRow <- match(upOutliers$gene, FDA_DruggableGenesByCategory$gene)
upOutliers$category <- FDA_DruggableGenesByCategory$group[druggableRow]
upOutliers$hasFdaApprovedTargetedDrug <- !is.na(upOutliers$category)


# Up outliers whose gene appears in the druggable gene table
theseOutliers <- subset(upOutliers, gene %in% FDA_DruggableGenesByCategory$gene)
# Gene sets enriched in at least one list that also have a non-zero gene count.
# (These two statements used to appear twice in a row with identical inputs; the
# duplicate was redundant and has been dropped.)
theseEnrichedPathways <- subset(enrichedPathways,
    enrichedOutsideTopFivePercent & countOfFDA_druggableGenesInGeneSet > 0)

#
# assemble FDA_DruggableGenesPerEnrichedDruggablePathway
#
# One row is added per enriched pathway that contains at least one druggable up outlier.
# If there are no enriched pathways or no druggable outliers at all, a single
# placeholder row is written instead.
# NOTE(review): when both tables are non-empty but no pathway contains a druggable
# outlier, the frame stays NULL and no placeholder row is emitted -- confirm intended.

FDA_DruggableGenesPerEnrichedDruggablePathway <- NULL
outputDF <- NULL

# `&&` because both sides are scalars inside an `if` condition
if (nrow(theseEnrichedPathways) > 0 && nrow(theseOutliers) > 0) {
    # Per gene, the comparison groups it is an outlier in, joined as e.g. "pc and pd"
    theseOutliersByList <- aggregate(outlierComparisonGroup ~ gene, theseOutliers,
        paste, collapse = " and ")
    for (j in seq_len(nrow(theseEnrichedPathways))) {
        p <- theseEnrichedPathways$GeneSet[j]
        thisPathwayInfo <- subset(theseEnrichedPathways, GeneSet == p)
        # All genes listed for this pathway, skipping empty list entries
        pathwayGeneLists <- thisPathwayInfo$FDA_druggableGenesInGeneSet
        genesInPathway <- unlist(pathwayGeneLists[lengths(pathwayGeneLists) > 0])
        theseFDADruggableGenesInPathway <- subset(theseOutliersByList,
            gene %in% genesInPathway)
        # add FDA gene set, e.g. PI3K/AKT/mTOR from theseOutliers

        # "GeneSet (list (k/K) and list (k/K))" with the enriched_in_ prefix stripped
        thisPathwayDescription <- gsub("enriched_in_", "", paste0(
            theseEnrichedPathways$GeneSet[j], " (",
            theseEnrichedPathways$textOfEnrichmentList[j], ")"))

        if (nrow(theseFDADruggableGenesInPathway) > 0) {
            theseFDADruggableGenesInPathway$geneDesc <- paste0(
                theseFDADruggableGenesInPathway$gene,
                " (", theseFDADruggableGenesInPathway$outlierComparisonGroup, ")")
            # Construct druggable genes per pathway frame
            FDA_DruggableGenesPerEnrichedDruggablePathway <- rbind(
                FDA_DruggableGenesPerEnrichedDruggablePathway,
                data.frame(
                    THid = sampleID,
                    assay = thisPathwayInfo$anno,
                    results = thisPathwayDescription,
                    details = paste(
                        "druggable:",
                        paste0(theseFDADruggableGenesInPathway$geneDesc, collapse = ", ")),
                    details2 = paste(
                        "overlap_genes_in_pc_up",
                        paste(
                            thisPathwayInfo$overlap_genes_in_pc_up,
                            "and pd_up",
                            thisPathwayInfo$overlap_genes_in_pd_up))))
        }
    }
} else {
    FDA_DruggableGenesPerEnrichedDruggablePathway <- rbind(
        FDA_DruggableGenesPerEnrichedDruggablePathway,
        data.frame(
            THid = sampleID,
            assay = "geneSets",
            results = "no enriched druggable geneSets",
            details = "", details2 = ""))
}

##########################################
### Note FDA Druggable up outliers
###################################

# One row per druggable outlier gene, listing the comparison groups it was found in;
# a single placeholder row when there are none
if (nrow(theseOutliers) > 0) {
    outlierGroupsPerGene <- summarize(
        group_by(theseOutliers, gene),
        outlierGroup = paste(outlierComparisonGroup, collapse = ", "))
    allDruggableUpOutliers <- data.frame(
        THid = sampleID,
        assay = "druggableUpOutlier",
        outlierGroupsPerGene)
    # Columns 3 and 4 are the gene and its groups from the summary above
    colnames(allDruggableUpOutliers)[3:4] <- c("results", "details")
} else {
    allDruggableUpOutliers <- data.frame(
        THid = sampleID,
        assay = "druggableUpOutlier",
        results = "No druggableUpOutliers",
        details = "")
}
allDruggableUpOutliers$details2 <- ""


# Stack the pathway rows and the outlier rows, order them by assay and write the TSV
outputDF <- rbind(FDA_DruggableGenesPerEnrichedDruggablePathway, allDruggableUpOutliers)

# Preferred assay order; any assay value not listed here sorts after these
assayOrder <- c(
    "druggableUpOutlier",
    "geneSets_priority1",
    "geneSets_broadCancer",
    "geneSets_nonCancer")

# Convert to character before building the level set: if assay is a factor (e.g. when
# data.frame() converts strings to factors), c(assayOrder, outputDF$assay) would append
# the integer codes rather than the labels, and unlisted assay values would become NA.
assayValues <- as.character(outputDF$assay)
outputDF$assay <- factor(assayValues, levels = unique(c(assayOrder, assayValues)))
outputDF <- arrange(outputDF, assay)


write_tsv(outputDF, outputFile)

Fetch the TSV output from the tempdir:

In [None]:
!pwd
src_path = os.path.join(workdir, output)
print("Moving {} to {}".format(src_path, output))
os.rename(src_path, output)

Load it into JSON.

In [None]:
# Read the TSV back with every cell kept as a string (no NA conversion), then round-trip
# it through pandas' column-oriented JSON so the result is stored as ordered dicts
leads_df = pd.read_csv(output, delimiter="\t", dtype="str", na_filter=False)
leads_json_text = leads_df.to_json(orient="columns")
j["automated_leads_identified"] = json.loads(
    leads_json_text, object_pairs_hook=OrderedDict)


Next, calculate the druggable genes and pathways.

Druggable genes is simply the count of the dgidb results for that set.
Druggable pathways:

For pancancer-up, for example, we take the "druggableGenesInThLists" list of genes associated with each
pathway in "gene_set_aggregation". We see if there is any overlap between that list and the pc_up dgidb genes.
If there is, we count that pathway as one; otherwise it does not add to the count.

So the value for pathways - pc_up is the total number of pathways for which at least one of their druggableGenesInThLists genes was in the pc_up dgidb results.

In [None]:
j["druggable_genes_count"] = {}
j["druggable_pathways_count"] = {}

# Maps pathway name -> ", "-separated string of genes (split in found_genes below)
all_pathways = j7["gene_set_aggregation"]["druggableGenesInThLists"]
# Maps geneset name -> dgidb results for that set, keyed by gene
dgidb_results = j5["dgidb_results"]

def found_genes(pathway, geneset):
    """Return the set of genes in ``pathway`` that are also dgidb results for ``geneset``.

    pathway -- a key of ``all_pathways``
    geneset -- a key of ``dgidb_results`` (e.g. "pc_up")
    """
    druggable_genes_in_geneset = set(dgidb_results[geneset])
    genes_in_pathway = all_pathways[pathway].split(", ")
    return druggable_genes_in_geneset.intersection(genes_in_pathway)


for geneset in ["pc_up", "pd_up", "comm_up", "top5"]:
    # Genes - count of dgidb results for that set
    j["druggable_genes_count"][geneset] = len(dgidb_results[geneset])

    # Pathways - count of pathways with a gene that's also in the geneset.
    # Counted with a generator because filter() returns an iterator on Python 3,
    # which has no len(); a non-empty set is truthy, so the count is unchanged.
    j["druggable_pathways_count"][geneset] = sum(
        1 for pathway in all_pathways if found_genes(pathway, geneset))

# Parenthesised print works on both Python 2 and Python 3 with a single argument
print("Druggable genes count: {}".format(j["druggable_genes_count"]))
print("Druggable pathways count: {}".format(j["druggable_pathways_count"]))

Save the JSON results.

In [None]:
# Write this step's accumulated results to the step-8 JSON path from the configuration
step8_json_path = c["json"]["8"]
with open(step8_json_path, "w") as jsonfile:
    json.dump(j, jsonfile, indent=2)

Finally, delete the tempfiles and tempdir.

In [None]:
# Remove exactly the four input files written earlier, then the (now empty) temp dir;
# os.rmdir only succeeds on an empty directory
filenames = ["pc_up", "pd_up","gene_set_aggregation","gene_set_details_per_list"]
temp_paths = [os.path.join(workdir, fname) for fname in filenames]
for fpath in temp_paths:
    print("Deleting temp file {}".format(fpath))
    os.remove(fpath)

print("Deleting temp dir {}".format(workdir))
os.rmdir(workdir)

In [None]:
# Completion message
print("Done!")