Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
134 lines (114 sloc)
4.48 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
## In this example, the data is in a matrix called | |
## data.matrix | |
## columns are individual samples (i.e. cells) | |
## rows are measurements taken for all the samples (i.e. genes) | |
## Just for the sake of the example, here's some made up data... | |
data.matrix <- matrix(nrow=100, ncol=10) | |
colnames(data.matrix) <- c( | |
paste("wt", 1:5, sep=""), | |
paste("ko", 1:5, sep="")) | |
rownames(data.matrix) <- paste("gene", 1:100, sep="") | |
for (i in 1:100) { | |
wt.values <- rpois(5, lambda=sample(x=10:1000, size=1)) | |
ko.values <- rpois(5, lambda=sample(x=10:1000, size=1)) | |
data.matrix[i,] <- c(wt.values, ko.values) | |
} | |
head(data.matrix) | |
dim(data.matrix) | |
################################################################### | |
## | |
## 1) Just for reference, draw a PCA plot using this data... | |
## | |
################################################################### | |
pca <- prcomp(t(data.matrix), scale=TRUE, center=TRUE) | |
## calculate the percentage of variation that each PC accounts for... | |
pca.var <- pca$sdev^2 | |
pca.var.per <- round(pca.var/sum(pca.var)*100, 1) | |
pca.var.per | |
## now make a fancy looking plot that shows the PCs and the variation: | |
pca.data <- data.frame(Sample=rownames(pca$x), | |
X=pca$x[,1], | |
Y=pca$x[,2]) | |
pca.data | |
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) + | |
geom_text() + | |
xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) + | |
ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) + | |
theme_bw() + | |
ggtitle("PCA Graph") | |
################################################################### | |
## | |
## 2) Now draw an MDS plot using the same data and the Euclidean | |
## distance. This graph should look the same as the PCA plot | |
## | |
################################################################### | |
## first, calculate the distance matrix using the Euclidian distance. | |
## NOTE: We are transposing, scaling and centering the data just like PCA. | |
distance.matrix <- dist(scale(t(data.matrix), center=TRUE, scale=TRUE), | |
method="euclidean") | |
## do the MDS math (this is basically eigen value decomposition) | |
mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE) | |
## calculate the percentage of variation that each MDS axis accounts for... | |
mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1) | |
mds.var.per | |
## now make a fancy looking plot that shows the MDS axes and the variation: | |
mds.values <- mds.stuff$points | |
mds.data <- data.frame(Sample=rownames(mds.values), | |
X=mds.values[,1], | |
Y=mds.values[,2]) | |
mds.data | |
ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) + | |
geom_text() + | |
theme_bw() + | |
xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) + | |
ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) + | |
ggtitle("MDS plot using Euclidean distance") | |
################################################################### | |
## | |
## 3) Now draw an MDS plot using the same data and the average log(fold change) | |
## This graph should look different than the first two | |
## | |
################################################################### | |
## first, take the log2 of all the values in the data.matrix. | |
## This makes it easy to compute log2(Fold Change) between a gene in two | |
## samples since... | |
## | |
## log2(Fold Change) = log2(value for sample 1) - log2(value for sample 2) | |
## | |
log2.data.matrix <- log2(data.matrix) | |
## now create an empty distance matrix | |
log2.distance.matrix <- matrix(0, | |
nrow=ncol(log2.data.matrix), | |
ncol=ncol(log2.data.matrix), | |
dimnames=list(colnames(log2.data.matrix), | |
colnames(log2.data.matrix))) | |
log2.distance.matrix | |
## now compute the distance matrix using avg(absolute value(log2(FC))) | |
for(i in 1:ncol(log2.distance.matrix)) { | |
for(j in 1:i) { | |
log2.distance.matrix[i, j] <- | |
mean(abs(log2.data.matrix[,i] - log2.data.matrix[,j])) | |
} | |
} | |
log2.distance.matrix | |
## do the MDS math (this is basically eigen value decomposition) | |
## cmdscale() is the function for "Classical Multi-Dimensional Scalign" | |
mds.stuff <- cmdscale(as.dist(log2.distance.matrix), | |
eig=TRUE, | |
x.ret=TRUE) | |
## calculate the percentage of variation that each MDS axis accounts for... | |
mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1) | |
mds.var.per | |
## now make a fancy looking plot that shows the MDS axes and the variation: | |
mds.values <- mds.stuff$points | |
mds.data <- data.frame(Sample=rownames(mds.values), | |
X=mds.values[,1], | |
Y=mds.values[,2]) | |
mds.data | |
ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) + | |
geom_text() + | |
theme_bw() + | |
xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) + | |
ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) + | |
ggtitle("MDS plot using avg(logFC) as the distance") |