### Detecting human orthologs of mouse physical interactors of Huntingtin protein.
#### Shirasaki et al_2012

In [2]:
path <- "C:/Users/Sonali/Box Sync/Huntington Interactome/MutantHTT_Interactors/Mouse"
setwd(path)

########################################
###Biomart and Inparanoid package installation####
#######################################
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt")
# Install Homology information for Mus musculus from Inparanoid
biocLite("hom.Mm.inp.db")

#####################################################
### Get the required libararies
#####################################################
library("AnnotationDbi")
library("hom.Mm.inp.db")
library("biomaRt")
library ("plyr") 


# #########################################
# ##To view documentation for the version of this package ###
# ############################################################
# browseVignettes("biomaRt")
# 
# ###################################################
# ### code chunk number 2: biomaRt
# ###################################################
# library("biomaRt")
#listMarts(host = 'oct2014.archive.ensembl.org')
# http://oct2014.archive.ensembl.org/index.html
# 
# 
# ###################################################
# ### code chunk number 4: listDatasets
# ###################################################
#ensembl=useMart("ensembl")
ensembl77 = useMart(host='dec2014.archive.ensembl.org', 
                    biomart='ENSEMBL_MART_ENSEMBL')
#listDatasets(ensembl77)

#############################################################
### Read the file containing the gene symbols of the organism
###############################################################

mouseGenes <- read.table(file= "R_Input_Shirasaki_etal2012.txt", sep = "\t", header = TRUE)

# convert your Gene Symbols to ensemble gene id and ensembl protein id for your organism

mouse = useMart(host='dec2014.archive.ensembl.org',biomart = 'ENSEMBL_MART_ENSEMBL' , dataset = "mmusculus_gene_ensembl") ### specify the dataset to use 

###################################################
### filters for the mouse dataset
###################################################
filters = listFilters(mouse)
filters[1:5,]
#listFilters(mouse)

###################################################
### attributes for mouse dataset
###################################################
attributes = listAttributes(mouse)

attributes[1:5,]
#listAttributes(mouse, page="feature_page")


########################################
## Get the attributes for mouse genes 
#######################################


rawShirasakiENSProtIds <- getBM(c('ensembl_gene_id', 'hgnc_symbol','ensembl_peptide_id','hgnc_id','mgi_id'), filters= 'external_gene_name', values= mouseGenes$AssociatedGeneName, mart=mouse) # convert Gene Symbol to ensemble gene id and ensembl protein id for your organism

length (rawShirasakiENSProtIds)
dim (rawShirasakiENSProtIds)
class(rawShirasakiENSProtIds)
#rawShirasakiENSProtIds

##################################################################################
### clean your data - remove any empty strings in the column of Ensembl peptide id
###################################################################################
z <- rawShirasakiENSProtIds[!rawShirasakiENSProtIds$ensembl_peptide_id == "",] # remove rows from data frame where a row matches a string 
dim(z)

#########################################
# remove duplicate values from the ensembl protein id column
#######################################
ShirasakiENSProtIds <- z[!duplicated(z[,3]),]
#ShirasakiENSProtIds
dim(ShirasakiENSProtIds)


##############################################
# Using the Inparanoid function to find human orthologs in form of human ensembl protein IDs#
##################################################

rawHumanProtIds <- mget(unlist(ShirasakiENSProtIds$ensembl_peptide_id) ,hom.Mm.inpHOMSA,ifnotfound=NA)

#rawHumanProtIds
class(rawHumanProtIds)
HumanProtIds <- rawHumanProtIds[!is.na(rawHumanProtIds)] ## remove all the NA values from the list
#HumanProtIds
length(HumanProtIds)

##########################################################################
## Find the ensembl gene id annd hgnc symbols for your human ensembl peptide id 
## using the getBM function and the human ensembl dataset
#######################################################################

human = useMart(host='dec2014.archive.ensembl.org',biomart = 'ENSEMBL_MART_ENSEMBL' , dataset = "hsapiens_gene_ensembl")
#listAttributes(human, page="feature_page")

Humanorthologs <- getBM(c('ensembl_gene_id', 'hgnc_symbol', 'ensembl_peptide_id', 'entrezgene','uniprot_swissprot'), filters='ensembl_peptide_id', values=HumanProtIds, mart=human) # convert Gene Symbol to ensemble gene id and ensembl protein id for your organism
Humanorthologs
dim(Humanorthologs)

# run the line below if you need output as a text file.
# write.table(Humanorthologs, file="ShirasakiHumanOrthologs.txt", sep="\t", col.names = NA) ### export file in text format to the folder.


##############################################################################################
##########END ################################################################################
#############################################################################################

sessionInfo()


Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.1 (2016-06-21).
Installing package(s) 'biomaRt'
"package 'biomaRt' is in use and will not be installed"installation path not writeable, unable to update packages: cluster, codetools,
  foreign, lattice, Matrix, mgcv, nlme, survival
Old packages: 'assertthat', 'curl', 'matrixStats', 'party', 'zoo'
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.1 (2016-06-21).
Installing package(s) 'hom.Mm.inp.db'
installing the source package 'hom.Mm.inp.db'

installation path not writeable, unable to update packages: cluster, codetools,
  foreign, lattice, Matrix, mgcv, nlme, survival
Old packages: 'assertthat', 'curl', 'matrixStats', 'party', 'zoo'


name,description
chromosome_name,Chromosome name
start,Gene Start (bp)
end,Gene End (bp)
band_start,Band Start
band_end,Band End


name,description,page
ensembl_gene_id,Ensembl Gene ID,feature_page
ensembl_transcript_id,Ensembl Transcript ID,feature_page
ensembl_peptide_id,Ensembl Protein ID,feature_page
ensembl_exon_id,Ensembl Exon ID,feature_page
description,Description,feature_page


ensembl_gene_id,hgnc_symbol,ensembl_peptide_id,entrezgene,uniprot_swissprot
ENSG00000006451,RALA,ENSP00000005257,5898,P11233
ENSG00000022355,GABRA1,ENSP00000023897,2554,P14867
ENSG00000033122,LRRC7,ENSP00000035383,57554,Q96NW7
ENSG00000054803,CBLN4,ENSP00000064571,140689,Q9NTU7
ENSG00000013297,CLDN11,ENSP00000064724,5010,O75508
ENSG00000059804,SLC2A3,ENSP00000075120,6515,P11169
ENSG00000059804,SLC2A3,ENSP00000075120,102723606,P11169
ENSG00000088256,GNA11,ENSP00000078429,2767,P29992
ENSG00000063177,RPL18,ENSP00000084795,6141,
ENSG00000013275,PSMC4,ENSP00000157812,5704,P43686


R version 3.3.1 (2016-06-21)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 14393)

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
[1] plyr_1.8.4           biomaRt_2.30.0       hom.Mm.inp.db_3.1.2 
[4] AnnotationDbi_1.36.2 IRanges_2.8.2        S4Vectors_0.12.2    
[7] Biobase_2.34.0       BiocGenerics_0.20.0  BiocInstaller_1.24.0

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.10        magrittr_1.5        uuid_0.1-2         
 [4] R6_2.2.0            stringr_1.2.0       tools_3.3.1        
 [7] DBI_0.6-1           digest_0.6.12       crayon_1.3.2       
[10] IRdisplay_0.4.4     repr_0.12.0         bitops_1.0-6       
[13] RC