Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
434 lines (346 sloc) 21.2 KB
---
title: "Canonical Correspondence Analysis (CCA) and Redundancy Analysis (RDA) for Protein Abundance Data"
author: "Yaamini Venkataraman"
date: "November 27, 2018"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE) #Set up R Markdown File
```
In this script, I'll perform a constrained ordination to understand the protein abundance data given the environmental data.
# Install dependencies
```{r}
#install.packages("vegan")
require(vegan)
#install.packages("cluster") #Install cluster package. This package has the function daisy that is used for the gower dissimilarity matrix.
require(cluster)
#install.packages("dplyr")
require(dplyr)
source("../../biostats.R") #Biostats analysis wrapper
```
# Obtain session information
```{r}
sessionInfo() #Obtain session information
```
# Import and reformat data
## Protein abundance data
```{r}
proteinAbundance <- read.csv("2017-11-05-Averaged-Areas-Pivoted-Corrected.csv", header = TRUE) #Import protein abundance data
row.names(proteinAbundance) <- proteinAbundance[,1] #Assign first column as rownames
proteinAbundance <- proteinAbundance[,-1] #Remove first column
proteinAbundance <- t(proteinAbundance) #Transpose dataframe
head(proteinAbundance) #Confirm changes
```
### Transform protein data
```{r}
proteinAbundanceHT <- data.trans(proteinAbundance, method = 'hellingers', plot = FALSE) #Hellinger (asymmetric) transformation
head(proteinAbundanceHT) #Confirm transformation
```
## Environmental data
For the environmental data, I want my rownames to be oyster samples, and my columns to be Site, Habitat, and the mean and variance of environmental variables at the designated Site and Habitat for the entire outplant.
### Calculate outplant means and variances
#### pH
```{r}
pHData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-pH-Tide-Data-Corrected.csv", header = TRUE) #Import pH data
pHData <- pHData[,-1] #Remove extra first column
head(pHData) #Confirm import
```
```{r}
colnames(pHData)
```
```{r}
pHMeanVariance <- data.frame("Site-Habitat" = colnames(pHData[4:12]),
"Site" = c("WB", "WB", "SK", "SK", "PG", "CI", "CI", "FB", "FB"),
"Habitat" = c("Eelgrass", "Bare", "Eelgrass", "Bare", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare"),
"pHMean" = rep(0, times = 9),
"pHVariance" = rep(0, times = 9)) #Create an empty dataframe to store outplant-wide means and variances
head(pHMeanVariance) #Confirm dataframe creation
```
```{r}
nSiteHabitat <- 12 #pH data is column 4-12
for(i in 4:nSiteHabitat) { #For each column with pH data
pHMeanVariance$pHMean[i-3] <- mean(pHData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row in pHMeanVariance$Mean
}
for(i in 4:nSiteHabitat) { #For each column with pH data
pHMeanVariance$pHVariance[i-3] <- var(pHData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row in pHMeanVariance$Variance
}
pHMeanVariance <- pHMeanVariance[,-1] #Remove Site.Habitat column
head(pHMeanVariance) #Confirm calculations
```
#### Dissolved oxygen
```{r}
DOData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-DO-Tide-Data-Corrected.csv", header = TRUE) #Import DO data
DOData <- DOData[,-1] #Remove extra first column
head(DOData) #Confirm import
```
```{r}
colnames(DOData) #Look at columns to identify where the DO data is
```
```{r}
DOMeanVariance <- data.frame("Site-Habitat" = colnames(DOData[4:13]),
"Site" = c("WB", "WB", "SK", "SK", "PG", "PG", "CI", "CI", "FB", "FB"),
"Habitat" = c("Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare"),
"DOMean" = rep(0, times = 10),
"DOVariance" = rep(0, times = 10)) #Create an empty dataframe to store outplant-wide means and variances
head(DOMeanVariance) #Confirm dataframe creation
```
```{r}
nSiteHabitat <- 13 #DO data is column 4-13
for(i in 4:nSiteHabitat) { #For each column with DO data
DOMeanVariance$DOMean[i-3] <- mean(DOData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row in DOMeanVariance$DOMean
}
for(i in 4:nSiteHabitat) { #For each column with pH data
DOMeanVariance$DOVariance[i-3] <- var(DOData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row in DOMeanVariance$DOVariance
}
DOMeanVariance <- DOMeanVariance[,-1] #Remove Site.Habitat column
head(DOMeanVariance) #Confirm calculations
```
#### Salinity
```{r}
salinityData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-Salinity-Tide-Data-Corrected.csv", header = TRUE) #Import pH data
salinityData <- salinityData[,-1] #Remove extra first column
head(salinityData) #Confirm import
```
```{r}
colnames(salinityData)
```
```{r}
salinityMeanVariance <- data.frame("Site-Habitat" = colnames(salinityData[4:11]),
"Site" = c("CI", "CI", "FB", "FB", "PG", "SK", "SK", "WB"),
"Habitat" = c("Bare", "Eelgrass", "Bare", "Eelgrass", "Eelgrass", "Bare", "Eelgrass", "Bare"),
"salinityMean" = rep(0, times = 8),
"salinityVariance" = rep(0, times = 8)) #Create an empty dataframe to store outplant-wide means and variances
head(salinityMeanVariance) #Confirm dataframe creation
```
```{r}
nSiteHabitat <- 11 #pH data is column 4-11
for(i in 4:nSiteHabitat) { #For each column with pH data
salinityMeanVariance$salinityMean[i-3] <- mean(salinityData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row
}
for(i in 4:nSiteHabitat) { #For each column with pH data
salinityMeanVariance$salinityVariance[i-3] <- var(salinityData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row
}
salinityMeanVariance <- salinityMeanVariance[,-1] #Remove Site.Habitat column
head(salinityMeanVariance) #Confirm calculations
```
#### Temperature
```{r}
temperatureData <- read.csv("../../2017-11-15-Environmental-Data-and-Biomarker-Analyses/2017-12-13-Environmental-Data-Quality-Control/2018-11-18-Temperature-Corrected.csv", header = TRUE) #Import temperature data
temperatureData <- temperatureData[,-1] #Remove extra first column
head(temperatureData) #Confirm import
```
```{r}
colnames(temperatureData) #Figure out where data is
```
```{r}
temperatureMeanVariance <- data.frame("Site-Habitat" = colnames(temperatureData[3:12]),
"Site" = c("WB", "WB", "SK", "SK", "PG", "PG", "CI", "CI", "FB", "FB"),
"Habitat" = c("Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare", "Eelgrass", "Bare"),
"temperatureMean" = rep(0, times = 10),
"temperatureVariance" = rep(0, times = 10)) #Create an empty dataframe to store outplant-wide means and variances
head(temperatureMeanVariance) #Confirm dataframe creation
```
```{r}
nSiteHabitat <- 12 #pH data is column 3-12
for(i in 3:nSiteHabitat) { #For each column with pH data
temperatureMeanVariance$temperatureMean[i-2] <- mean(temperatureData[,i], na.rm = TRUE) #Calculate the mean and put it in the designated row
}
for(i in 3:nSiteHabitat) { #For each column with pH data
temperatureMeanVariance$temperatureVariance[i-2] <- var(temperatureData[,i], na.rm = TRUE) #Calculate the variance and put it in the designated row
}
temperatureMeanVariance <- temperatureMeanVariance[,-1] #Remove Site.Habitat column
head(temperatureMeanVariance) #Confirm calculations
```
### Merge environmental data
```{r}
environmentalData <- left_join(temperatureMeanVariance, pHMeanVariance, by = c("Site", "Habitat")) #Use left_join to merge dataframes and add NAs where the two dataframes do not match. temperatureData is used as the base because it has observations for all 10 Site and Habitat combinations
environmentalData <- left_join(environmentalData, DOMeanVariance, by = c("Site", "Habitat")) #Add salinity data
environmentalData <- left_join(environmentalData, salinityMeanVariance, by = c("Site", "Habitat")) #Add temperature data
head(environmentalData)
```
### Combine oyster samples with environmental data
```{r}
biologicalReplicates <- read.csv("../../2017-09-06-Biological-Replicate-Information.csv", header = TRUE) #Import biological replciate information
head(biologicalReplicates) #Confirm import
```
```{r}
biologicalReplicates$Sample.Number <- as.character(biologicalReplicates$Sample.Number) #Convert sample number to character string
biologicalReplicates$Sample.Number <- substr(biologicalReplicates$Sample.Number, 1, nchar(biologicalReplicates$Sample.Number)-2) #Remove -1 or -2 from end of sample number
biologicalReplicates <- biologicalReplicates[1:49,] #Keep only the first 50 rows, since everything repeats. Also eliminate OBLNK2
colnames(biologicalReplicates)[3] <- "Habitat" #Change Eelgrass.Condition to Habitat
tail(biologicalReplicates) #Confirm changes
```
```{r}
colnames(environmentalData)
```
```{r}
environmentalSampleData <- data.frame("Sample.Number" = biologicalReplicates$Sample.Number,
"Site" = biologicalReplicates$Site,
"Habitat" = biologicalReplicates$Habitat,
"temperatureMean" = rep(0, times = length(biologicalReplicates$Sample.Number)),
"temperatureVariance" = rep(0, times = length(biologicalReplicates$Sample.Number)),
"pHMean" = rep(0, times = length(biologicalReplicates$Sample.Number)),
"pHVariance" = rep(0, times = length(biologicalReplicates$Sample.Number)),
"DOMean" = rep(0, times = length(biologicalReplicates$Sample.Number)),
"DOVariance" = rep(0, times = length(biologicalReplicates$Sample.Number)),
"salinityMean" = rep(0, times = length(biologicalReplicates$Sample.Number)),
"salinityVariance" = rep(0, times = length(biologicalReplicates$Sample.Number))) #Create a new dataframe for sample and environmental data. Column names follow the same ordering as environmentalData
head(environmentalSampleData) #Confirm dataframe creation
```
```{r}
colnames(environmentalSampleData)
```
```{r}
environmentalSampleData[environmentalSampleData$Site == "CI" & environmentalSampleData$Habitat == "Bare", 4:11]
```
```{r}
environmentalData[environmentalData$Site == "CI" & environmentalData$Habitat == "Bare",3:10]
```
```{r}
#Replace 0s in environmentalSampleData using corresponding information from environmentalData
#Case Inlet
environmentalSampleData[environmentalSampleData$Site == "CI" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "CI" & environmentalData$Habitat == "Bare", 3:10]
environmentalSampleData[environmentalSampleData$Site == "CI" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "CI" & environmentalData$Habitat == "Eelgrass", 3:10]
#Fidalgo Bay
environmentalSampleData[environmentalSampleData$Site == "FB" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "FB" & environmentalData$Habitat == "Bare", 3:10]
environmentalSampleData[environmentalSampleData$Site == "FB" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "FB" & environmentalData$Habitat == "Eelgrass", 3:10]
#Port Gamble Bay
environmentalSampleData[environmentalSampleData$Site == "PG" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "PG" & environmentalData$Habitat == "Bare", 3:10]
environmentalSampleData[environmentalSampleData$Site == "PG" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "PG" & environmentalData$Habitat == "Eelgrass", 3:10]
#Skokomish River Delta
environmentalSampleData[environmentalSampleData$Site == "SK" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "SK" & environmentalData$Habitat == "Bare", 3:10]
environmentalSampleData[environmentalSampleData$Site == "SK" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "SK" & environmentalData$Habitat == "Eelgrass", 3:10]
#Willapa Bay
environmentalSampleData[environmentalSampleData$Site == "WB" & environmentalSampleData$Habitat == "Bare", 4:11] <- environmentalData[environmentalData$Site == "WB" & environmentalData$Habitat == "Bare", 3:10]
environmentalSampleData[environmentalSampleData$Site == "WB" & environmentalSampleData$Habitat == "Eelgrass", 4:11] <- environmentalData[environmentalData$Site == "WB" & environmentalData$Habitat == "Eelgrass", 3:10]
head(environmentalSampleData) #Confirm replacements happened
```
### Transform environmental data
```{r}
row.names(environmentalSampleData) <- environmentalSampleData$Sample.Number #Assign Sample.Number as row names
environmentalSampleData <- environmentalSampleData[,-1] #Remove Sample.Number column
head(environmentalSampleData) #Confirm changes
```
```{r}
environmentalSampleData.log <- log(environmentalSampleData[,-c(1:2)] + 1) #Log transform all columns that aren't Site and Habitat
head(environmentalSampleData.log) #Confirm changes
```
```{r}
environmentalSampleData.log <- environmentalSampleData.log[rownames(proteinAbundanceHT),] #Ensure that samples in environmental data are those from the protein abundance data
length(environmentalSampleData.log$temperatureMean) == length(proteinAbundanceHT$`CHOYP_ACAA2.1.1|m.30666 ELGLNNDITNMNGGAIALGHPLAASGTR`) #The column lengths are equal, so we are good to go.
```
```{r}
#write.csv(environmentalSampleData.log, "2018-12-01-Log-Transformed-Daily-Environmental-Values.csv", row.names = TRUE) #Write out dataframe for future analyses.
```
# Determine the appropriate response model
```{r}
decorana(proteinAbundanceHT, ira = 0) #Determine gradient length. Gradient length <2 is best represented by a linear model (RDA), 2-4 could be unimodal (RDA or CCA appropriate), and >4 is greater than unimodal (CCA). The length of DCA1 is 0.372921, so the underlying model is linear. I will use RDA.
```
# Conducting RDA
```{r}
protEnvRDA <- rda(proteinAbundanceHT~., environmentalSampleData.log, na.action = na.exclude)
summary(protEnvRDA) #Look at the summary. Total inertia is the total variation explained by the RDA. Constrained inertia is the variation explained in Y by X. Unconstrained inertia is everything else.
```
## Significance tests
```{r}
anova(protEnvRDA) #Test significance of the RDA in general. F = 1.3064, p = 0.195, so the constrained ordination is not significant.
```
```{r}
anova(protEnvRDA, by = 'axis') #Test significance of each axis. No axis is significant.
```
```{r}
anova(protEnvRDA, by = 'terms') #Test significance of each predictor variable from X. The mean and variance of temperature are marginally significant (p = 0.065 and 0.067 respectively)
```
## Ordination biplot
```{r}
plot(protEnvRDA, choices = c(1,2), type = 'points', display = 'wa', scaling = 2) #Visualize the ordination by plotting the weighted average scores. Scaling method optimizes for descriptors.
```
```{r}
plot(protEnvRDA, choices = c(1,2), type = 'n', display = 'wa', scaling = 2) #Visualize the WA scores
text(protEnvRDA, choices = c(1,2), labels = row.names(proteinAbundanceHT)) #Add sample ID instead of points
```
# Interpreting the constrained ordination
```{r}
round(intrasetcor(protEnvRDA), 5) #Obtain the intra-set correlation (structure) coefficients
```
```{r}
round(intersetcor(protEnvRDA), 5) #Obtain the inter-set correlation (structure) coefficients
```
```{r}
summary(protEnvRDA) #Get biplot scores for explanatory variables
```
```{r}
plot(protEnvRDA, choices = c(1,2), display = c('wa', 'sp', 'bp'), scaling = 2) #Obtain the triplot with weighted average scores (wa), species scores (sp), and environmental variable biplot scores (bp). Scale to enhance descriptors.
```
#Customize RDA
## Create a customization matrix
```{r}
temporaryData <- data.frame("Sample.Number" = biologicalReplicates$Sample.Number,
y = rep(x = 0, times = length(biologicalReplicates$Sample.Number))) #Create a temporary dataframe with sample names
head(temporaryData) #Confirm dataframe creation
```
```{r}
NMDSColorShapeCustomization <- merge(x = temporaryData, y = biologicalReplicates, by = "Sample.Number") #Merge biological information with samples used
NMDSColorShapeCustomization <- NMDSColorShapeCustomization[,-2] #Remove empty column
head(NMDSColorShapeCustomization) #Confirm removal
```
```{r}
#Add region information (Puget Sound vs. Willapa Bay)
attach(NMDSColorShapeCustomization)
NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Site),] #Reorder so sites are sorted alphabetically
head(NMDSColorShapeCustomization) #Confirm sorting
detach(NMDSColorShapeCustomization)
NMDSColorShapeCustomization$Region <- c(rep("PS", times = (length(NMDSColorShapeCustomization$Site)-6)), rep("WB", times = 6)) #Add regional information
NMDSColorShapeCustomization$NMDS.Region.Shapes <- c(rep(20, times = (length(NMDSColorShapeCustomization$Site)-6)), rep(8, times = 6))
head(NMDSColorShapeCustomization) #Confirm changes
tail(NMDSColorShapeCustomization) #Confirm changes
```
```{r}
#Create a color and shape palette
attach(NMDSColorShapeCustomization)
NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Site),] #Reorder so sites are sorted alphabetically
head(NMDSColorShapeCustomization) #Confirm sorting
detach(NMDSColorShapeCustomization)
NMDS.Colors <- c(rep(x = "#00A9BD", times = sum(NMDSColorShapeCustomization$Site == "CI")),
rep(x = "#38001C", times = sum(NMDSColorShapeCustomization$Site == "FB")),
rep(x = "#440D82", times = sum(NMDSColorShapeCustomization$Site == "PG")),
rep(x = "#017A74", times = sum(NMDSColorShapeCustomization$Site == "SK")),
rep(x = "#EB8B0C", times = sum(NMDSColorShapeCustomization$Site == "WB"))) #Create a color vector
NMDSColorShapeCustomization[,6] <- NMDS.Colors #Add the color vector to the dataframe
head(NMDSColorShapeCustomization) #Confirm addition
attach(NMDSColorShapeCustomization)
NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Habitat),] #Reorder so habitat is sorted alphabetically
head(NMDSColorShapeCustomization) #Confirm sorting
detach(NMDSColorShapeCustomization)
NMDS.Shapes <- c(rep(x = 1, times = sum(NMDSColorShapeCustomization$Habitat == "Bare")),
rep(x = 16, times = sum(NMDSColorShapeCustomization$Habitat == "Eelgrass"))) #Make a shape vector
NMDSColorShapeCustomization[,7] <- NMDS.Shapes #Add the shape vector to the dataframe
head(NMDSColorShapeCustomization) #Confirm addition
attach(NMDSColorShapeCustomization)
NMDSColorShapeCustomization <- NMDSColorShapeCustomization[order(Sample.Number),] #Resort by sample number
head(NMDSColorShapeCustomization) #Confirm sorting
detach(NMDSColorShapeCustomization)
colnames(NMDSColorShapeCustomization) <- c("Sample.Number", "Site", "Eelgrass.Condition", "Region", "Region.Shape", "Color", "Shape") #Change column names
head(NMDSColorShapeCustomization) #Confirm changes
tail(NMDSColorShapeCustomization) #Confirm changes
```
### All proteins and predictors
```{r}
plot(protEnvRDA, choices=c(1,2), type = 'none', scaling = 2) #Create an empty plot based on RDA dimensions
points(protEnvRDA, choices = c(1,2), display = 'wa', pch = NMDSColorShapeCustomization$Shape, cex = 1, scaling = 2, col = NMDSColorShapeCustomization$Color) #Plot objects as points
points(protEnvRDA, choices = c(1,2), display = 'sp', pch = 22, col = 'grey20', bg = "grey20", cex = 0.5, scaling = 2) #Plot descriptors as text
text(protEnvRDA, choices = c(1,2), display = 'bp', col = 'grey20', cex = 0.75) #Plot only marginally significant predictors
```
### Only significant proteins and predictors
```{r}
plot(protEnvRDA, choices=c(1,2), type = 'none', scaling = 2, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Create an empty plot based on RDA dimensions
points(protEnvRDA, choices = c(1,2), display = 'wa', pch = NMDSColorShapeCustomization$Shape, cex = 1, scaling = 2, col = NMDSColorShapeCustomization$Color) #Plot objects as points
points(protEnvRDA, choices = c(1,2), display = 'sp', col = 'grey20', pch = 4, cex = 1, scaling = 2, select = c(5, 12, 13, 19, 22, 30, 33)) #Plot significant proteins
text(protEnvRDA, choices = c(1,2), display = 'bp', col = 'grey20', cex = 0.75, select = 1:2) #Plot only marginally significant predictors
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 0.75)
mtext(side = 1, text = "RDA1", line = 2)
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 0.75)
mtext(side = 2, text = "RDA2", line = 2)
box(col = "grey80")
legend("topleft", pch = c(rep(x = 1, times = 6), 16), legend=c('Case Inlet', "Fidalgo Bay", "Willapa Bay", "Skokomish", "Port Gamble", "Bare", "Eelgrass"), col=c('#00A9BD', '#38001C', '#440D82', '#017A74', '#EB8B0C', 'black', 'black'), cex = 0.5, bty = "n")
```