In [1]:
library(gelnet)
library(dplyr)
library(biomaRt)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
# Maps ENSEMBL IDs to HUGO
# Use srcType = "ensembl_gene_id" for Ensembl IDs
# Use srcType = "entrezgene" for Entrez IDs
genes2hugo <- function( v, srcType = "ensembl_gene_id" )
{
    ## Retrieve the EMSEMBL -> HUGO mapping
    ensembl <- biomaRt::useMart( "ENSEMBL_MART_ENSEMBL", host="www.ensembl.org", dataset="hsapiens_gene_ensembl" )
    ID <- biomaRt::getBM( attributes=c(srcType, "hgnc_symbol"), filters=srcType, values=v, mart=ensembl )

    ## Make sure there was at least one mapping
    if( nrow(ID) < 1 ) top( "No IDs mapped successfully" )

    ## Drop empty duds
    j <- which( ID[,2] == "" )
    if( length(j) > 0 ) ID <- ID[-j,]
    stopifnot( all( ID[,1] %in% v ) )

    ID
}


In [4]:
## Load the signature
fnSig = "../data/PCBC/pcbc-stemsig.tsv"
w <- read.delim( fnSig, header=FALSE, row.names=1 ) %>% as.matrix() %>% drop()

In [13]:
X <- read.delim("../data/TcgaTargetGtex_RSEM_Hugo_norm_count.txt", header=TRUE, row.names=1)

In [11]:
V <- genes2hugo( rownames(X) )
head(V)

                                                                      

ERROR: Error in top("No IDs mapped successfully"): could not find function "top"


In [None]:
X <- X[V[,1],]
rownames(X) <- V[,2]
X[1:3,1:3]

In [3]:
## Uses the signature stored in fnSig to score PanCan33 data and stores the result to fnOut
main.predict <- function( fnSig = "../data/PCBC/pcbc-stemsig.tsv", fnOut = "mRNA_StemScore.tsv" )
{
  ## Load the signature
  w <- read.delim( fnSig, header=FALSE, row.names=1 ) %>% as.matrix() %>% drop()

  ## Reduces HUGO|POSITION gene IDs to just HUGO
  #f <- function( v ) unlist( lapply( strsplit( v, "\\|" ), "[[", 1 ) )

  s <- synGet( "syn4976369", downloadLocation = "/data/pancan" )
  X <- read.delim( s@filePath, as.is=TRUE, check.names=FALSE ) %>%  ## Read the raw values
    filter( !grepl( "\\?", gene_id ) ) %>%      ## Drop genes with no mapping to HUGO
    mutate( gene_id = f( gene_id ) ) %>%        ## Clip gene ids to HUGO
    filter( gene_id %in% names(w) )         ## Reduce to the signature's gene set

  ## SLC35E2 has multiple entries with the same HUGO id
  ## Keep the first entry only
  j <- grep( "SLC35E2", X[,1] )
  if( length(j) > 1 )
    X <- X[-j[-1],]

  ## Convert to a matrix
  rownames(X) <- NULL
  X <- X %>% tibble::column_to_rownames( "gene_id" ) %>% as.matrix()

  ## Reduce the signature to the common set of genes
  stopifnot( all( rownames(X) %in% names(w) ) )
  w <- w[ rownames(X) ]

  ####### Score via Spearman correlation
  s <- apply( X, 2, function(z) {cor( z, w, method = "sp", use = "complete.obs" )} )

  ## Scale the scores to be between 0 and 1
  s <- s - min(s)
  s <- s / max(s)

  write.table(cbind(s), file = fnOut, sep = "\t", quote = FALSE, col.names = FALSE)
}


In [None]:
tissue_info[(tissue_info['SMTS'] == 'Prostate') & (tissue_info['SMSTYP'] == 'Normal')]