# Relative expression levels of the A, B, and D genome homoeologs across triads
This analysis focused exclusively on the gene triads which had a 1:1:1 correspondence across the three homoeologous genomes, including 17,400 syntenic and 1,074 non-syntenic triads (total of 18,474 triads or 55,422 genes). Again, starting from the subset of genes considered expressed using the initial 850 filter criterion, we defined a triad as expressed when the sum of the A, B, and D genome homoeologs was > 0.5 TPM. This allowed us to include triads in which, for example, only a single homoeolog was expressed, and which could later be classified as a dominant triad. Using these criteria, we defined a total of 53,259 genes (17,753 triads) which were considered expressed (Table S3).

To standardize the relative expression of each homoeolog across the triad, we normalized the absolute TPM for each gene within the triad as follows:

$$expression(A) = \frac{TPM(A)}{TPM(A)+TPM(B)+TPM(D)}$$
$$expression(B) = \frac{TPM(B)}{TPM(A)+TPM(B)+TPM(D)}$$
$$expression(D) = \frac{TPM(D)}{TPM(A)+TPM(B)+TPM(D)}$$

   Where A, B, and D represent the gene corresponding to the A, B, and D homoeologs in the triad. The normalized expression was calculated for each one of the intermediate tissues and for the average across all expressed tissues (“combined analysis” as described previously). Fig. S6 shows an example of these calculations for the roots and the combined analysis across three triads. The values of the relative contributions of each genome per triad were used to plot the ternary diagrams using the R package ggtern (57).

In [1]:
library(sqldf)
library(ggplot2)
library(reshape2)
library(fields)
library("gridExtra")
library(ggtern)
library(clue)

Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Loading required package: spam
Loading required package: grid
Spam version 1.4-0 (2016-08-29) is loaded.
Type 'help( Spam)' or 'demo( spam)' for a short introduction 
and overview of this package.
Help for individual functions is also obtained by adding the
suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

Attaching package: ‘spam’

The following objects are masked from ‘package:base’:

    backsolve, forwardsolve

Loading required package: maps
--
Consider donating at: http://ggtern.com
Even small amounts (say $10-50) are very much appreciated!
Remember to cite, run citation(package = 'ggtern') for further info.
--

Attaching package: ‘ggtern’

The following objects are masked from ‘package:gridExtra’:

    arrangeGrob, grid.arrange

The following objects are masked from ‘package:ggplot2’:

    %+%, aes, annotate, calc_element, ggplot, ggplot_build,
    ggplot_gtable, ggplot

In [3]:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                metadata<-read.csv("/Users/ramirezr/Dropbox/JIC/expVIPMetadatas/RefSeq1.0/metadatas/Metadata_june11.txt", row.names = 1, sep="\t")
metadata<-read.csv("./TablesForExploration/metadata.csv")
nrow(metadata)
# Read the per-study expression tables and align them with the metadata.
#
# For every entry of `values` a file named "<folder>/<value>_<unit>.tsv" is
# read (spaces in the value become "_" and commas become "."), and the tables
# are bound column-wise in the order given.
#
# Arguments:
#   metadata  data frame with a `Sample.IDs` column; "-" in the IDs is
#             replaced by "." before matching against the table column names.
#   folder    directory holding the .tsv files.
#   unit      suffix used in the file name (default "tpm").
#   values    one or more study names; at least one is required.
#
# Returns an unnamed list: [[1]] the expression data frame restricted to (and
# ordered like) the samples present in the metadata, [[2]] the matching
# metadata rows.
loadValuesFromExperiment<-function(metadata, folder, unit="tpm", values=c("Development")){
    stopifnot(length(values) > 0)
    metadata$Sample.IDs <- gsub("-",".",metadata$Sample.IDs)

    # Single place that turns a study name into a file path and reads it.
    read_study <- function(v){
        v <- gsub(" ","_",v)
        v <- gsub(",",".",v)
        path <- paste0(folder,"/",v,"_",unit,".tsv")
        read.table(path, row.names = 1, header= TRUE)
    }

    tables <- lapply(as.character(values), read_study)
    ret <- do.call(cbind, tables)

    md<-metadata[metadata$Sample.IDs%in%colnames(ret),]
    # drop=FALSE keeps a data frame (with gene row names) even when only one
    # sample matches; otherwise the result collapses to a bare vector.
    ret<-ret[,as.character(md$Sample.IDs), drop=FALSE]
    list(ret,md)
}
# Load the TPM tables for every study listed in the metadata.
# loadValuesFromExperiment returns list(expression table, matching metadata).
folder<-"./expressionValuesPerGene"
tpms  <-loadValuesFromExperiment(metadata, folder, unit="tpm",  values=unique(metadata$study_title))


# Unpack the pair: metadata rows that have expression data, then the table
# itself (this overwrites `tpms`, so the order of these two lines matters).
metadata_used<-tpms[[2]]
tpms<-tpms[[1]]

nrow(metadata_used)


Loading the homologies

In [5]:
# Read the triad table; each row carries a group_id plus the A, B and D gene
# identifiers (see the column listing printed below).
homologies<-read.csv("./TablesForExploration/HCTriads.csv", sep=",")
head(homologies)
nrow(homologies)

group_id,family_name,origin,chrs,cardinality_formal,cardinality_abs,A,B,D,synteny,TE,HC.LC,source
227,OG0021509,ABD,7,1:1:1,1:1:1,TraesCS7A01G243100,TraesCS7B01G148400,TraesCS7D01G241900,segmental homeologs,,HC-only,"OrthoFinder SO,BLAST RBH,DAGchainer,i-ADHoRe,MCScanX"
253,OG0021212,ABD,7,1:1:1,1:1:1,TraesCS7A01G360600,TraesCS7B01G267100,TraesCS7D01G362400,segmental homeologs,,HC-only,"OrthoFinder SO,BLAST RBH,i-ADHoRe,MCScanX"
255,OG0021210,ABD,7,1:1:1,1:1:1,TraesCS7A01G267100,TraesCS7B01G165300,TraesCS7D01G267800,segmental homeologs,,HC-only,"OrthoFinder SO,BLAST RBH,i-ADHoRe"
259,OG0021208,ABD,7,1:1:1,1:1:1,TraesCS7A01G235400,TraesCS7B01G133600,TraesCS7D01G235500,segmental homeologs,,HC-only,"OrthoFinder SO,BLAST RBH,i-ADHoRe,MCScanX"
263,OG0021206,ABD,7,1:1:1,1:1:1,TraesCS7A01G185200,TraesCS7B01G090200,TraesCS7D01G186600,segmental homeologs,,HC-only,"OrthoFinder SO,BLAST RBH,DAGchainer"
269,OG0021202,ABD,7,1:1:1,1:1:1,TraesCS7A01G155800,TraesCS7B01G059900,TraesCS7D01G156200,segmental homeologs,,HC-only,"OrthoFinder SO,BLAST RBH,DAGchainer,MCScanX"


In [6]:
# Read the pre-computed list of expressed genes and keep only the gene ids.
# The three columns are renamed to gene / tpm / count.
expressed_genes<-read.csv("./TablesForExploration/expressed_genes_tpmsOver0.5AtLeast8Samples.csv")
colnames(expressed_genes) <- c("gene","tpm", "count")
genes_to_use <- data.frame(gene=expressed_genes$gene)
head(genes_to_use)

gene
TraesCS1A01G000100
TraesCS1A01G000100LC
TraesCS1A01G000200
TraesCS1A01G000200LC
TraesCS1A01G000300
TraesCS1A01G000300LC


In [7]:
# Drop every gene id that ends in "LC" (presumably the low-confidence models,
# given the hc_ prefix of the result - confirm against the annotation).
nrow(genes_to_use)
hc_genes_to_use<-data.frame(gene=genes_to_use[!grepl("LC$",genes_to_use$gene),])
head(hc_genes_to_use)
nrow(hc_genes_to_use)


gene
TraesCS1A01G000100
TraesCS1A01G000200
TraesCS1A01G000300
TraesCS1A01G000400
TraesCS1A01G000500
TraesCS1A01G000600


In [8]:
# Return the sample ids that belong to one level of a metadata column.
#
# Arguments:
#   metadata  data frame with a `Sample.IDs` column.
#   type      name of the metadata column to filter on, or "all" to return
#             every distinct sample id.
#   factor    value of `type` to select (ignored when type is "all").
#
# Returns a character vector of sample ids.
getSamplesForFactor<- function(metadata, type="High.level.tissue",factor="roots"){
    if(type == "all"){
        return(as.character(unique(metadata$Sample.IDs)))
    }
    # which() discards rows whose `type` value is NA; a plain logical index
    # would turn each of them into an NA sample id.
    rows <- which(metadata[,type] == factor)
    as.character(metadata$Sample.IDs[rows])
}

In [9]:
# Mean expression per gene for the samples of one metadata level, together
# with the cumulative sum of those means taken from the highest mean down.
#
# Arguments:
#   values    expression table, genes in rows and samples in columns.
#   metadata  sample metadata passed on to getSamplesForFactor.
#   type      metadata column to filter on.
#   factor    level of `type` whose samples are averaged.
#
# Returns a data frame with columns gene, mean, total_samples, cumulative,
# seq (1..n after ordering by increasing cumulative) and factor.
getMeansPerFactor<- function(values, metadata,  type="High.level.tissue",factor="roots"){
    samples <- getSamplesForFactor(metadata, type, factor)
    if(length(samples) == 1){
        print("This factor only has one sample!")
        print(factor)
    }
    # drop=FALSE keeps the gene names when only one sample is selected; a
    # dropped vector is unnamed and the genes would be labelled 1..n.
    gene_means <- rowMeans(values[,samples, drop=FALSE])

    gene_means <- sort(gene_means, decreasing=TRUE)
    running_total <- cumsum(gene_means)

    cumulative_df <- data.frame(cumulative=running_total)
    means_df <- data.frame(mean=gene_means)

    means_df$gene <- rownames(means_df)
    cumulative_df$gene <- rownames(cumulative_df)
    means_df$total_samples <- length(samples)

    n <- merge(means_df, cumulative_df, by='gene', all=TRUE)
    n <- n[order(n$cumulative, decreasing = FALSE),]
    n$seq <- seq(from = 1, to = nrow(n))
    n$factor = factor
    n
}

In [10]:
# Run getMeansPerFactor for every distinct level of a metadata column and
# stack the results.
#
# Arguments:
#   values    expression table, genes in rows and samples in columns.
#   metadata  sample metadata.
#   type      metadata column whose levels are iterated.
#
# Returns the row-bound data frames produced by getMeansPerFactor (NULL when
# the column has no values). Row names of the result are not meaningful.
getMeansForAllFactors<-function(values, metadata,  type="High.level.tissue"){
    factors <- unique(metadata[,type])
    # Use the `values` argument (the global `tpms` was read here before) and
    # iterate with seq_along so a single level does not produce a 2:1 loop.
    per_factor <- lapply(seq_along(factors), function(i){
        getMeansPerFactor(values, metadata, type=type, factor=factors[i])
    })
    do.call(rbind, per_factor)
}


In [11]:
# Flag, per gene, whether its mean expression over the samples of one
# metadata level exceeds a threshold.
#
# Arguments:
#   values    expression table, genes in rows and samples in columns.
#   metadata  sample metadata passed on to getSamplesForFactor.
#   type      metadata column to filter on.
#   factor    level of `type` whose samples are averaged.
#   minTPM    a gene counts as expressed when its mean is strictly above this.
#
# Returns a data frame (row names = gene ids) with columns expressed, factor,
# transcript and total_samples.
isExpressedPerFactor <- function(values, metadata,  type="High.level.tissue",factor="roots", minTPM=0.5){
    samples <- getSamplesForFactor(metadata, type, factor)
    # drop=FALSE: with a single sample the subset would otherwise become a
    # vector and rowMeans would stop with a dimension error.
    means <- rowMeans(values[,samples, drop=FALSE])
    m2 <- data.frame(expressed = means > minTPM)
    m2$factor <- factor
    m2$transcript <- rownames(m2)
    m2$total_samples <- length(samples)
    m2
}


In [12]:
# Find genes whose mean expression exceeds `minTPM` in exactly one level of
# the metadata column `type`.
#
# Returns an unnamed list: [[1]] the per-level means (with an added logical
# `expressed` column), [[2]] the genes expressed in a single level only.
getExclusiveExpression<-function(values, metadata, minTPM=0.5, type="High.level.tissue"){
    means <- getMeansForAllFactors(values, metadata,type=type)
    means[["expressed"]] <- means$mean > minTPM

    # The query refers to the local data frame `means` by name.
    exclusive_query <- "SELECT gene, factor, mean, total_samples 
        FROM means 
        WHERE expressed 
        GROUP BY gene HAVING count(factor) = 1 "
    single_level_genes <- sqldf(exclusive_query)

    list(means, single_level_genes)
}

In [13]:


# Convert the wide triad table (one row per triad, columns A, B, D) into a
# long table with one row per gene.
#
# Returns a data frame with columns gene, chr_group ("A", "B" or "D") and
# group_id; all A rows come first, then B, then D.
reshape_triad_groups<-function(triads){
    homoeologs <- list(A=triads$A, B=triads$B, D=triads$D)
    stacked <- Map(
        function(genes, genome){
            data.frame(gene=genes, chr_group=genome, group_id=triads$group_id)
        },
        homoeologs, names(homoeologs))
    # unname() so rbind does not prefix the row names with the genome letter.
    do.call(rbind, unname(stacked))
}




In [14]:
# Build the triad x genome matrix of normalised expression for one factor.
#
# Arguments:
#   normalised_triad  long table with columns factor, group_id, chr_group and
#                     normalised_triad.
#   factor            value of the `factor` column to keep.
#
# Returns a matrix with one row per group_id and one column per chr_group,
# restricted to rows without missing values.
get_triad_matrix<-function(normalised_triad, factor="all"){
    selected <- normalised_triad[factor==normalised_triad$factor,]
    wide <- acast(selected, group_id~chr_group, value.var="normalised_triad")
    # drop=FALSE keeps a matrix even if a single complete triad remains, so
    # callers can still use colMeans()/rownames() on the result.
    wide[complete.cases(wide), , drop=FALSE]
}
# Box plot of the normalised contribution (column `normalised_triad`) for
# each chromosome group (column `chr_group`). Returns the plot object.
plot_normalized_triads<-function(triads){
    ggplot(triads, aes(chr_group, normalised_triad)) +
        geom_boxplot(outlier.alpha = 0.05) +
        ylab("Contribution") +
        xlab("Chromosome group")
}




In [15]:
# Summarise each cluster by the mean of its A, B and D columns.
#
# Arguments:
#   clust_df  data frame with numeric columns A, B, D and a `clust` column
#             holding the cluster id of each row (ids are expected to be
#             numeric).
#
# Returns a data frame with one row per distinct cluster id (sorted), row
# names equal to the ids, and columns A, B, D (means), size (row count),
# clust (the id) and description ("other.<id>").
get_centroids<-function(clust_df){
    clusters <- sort(unique(clust_df$clust))
    members <- lapply(clusters, function(id) clust_df[clust_df$clust==id,])
    column_mean <- function(col){
        vapply(members, function(m) mean(m[[col]]), numeric(1))
    }

    # Rows are filled by position in `clusters`, not by the id value, so ids
    # with gaps (e.g. 1, 3, 7) no longer create extra empty rows.
    dat <- data.frame(
        A=column_mean("A"), B=column_mean("B"), D=column_mean("D"),
        size=as.numeric(vapply(members, nrow, integer(1))),
        clust=as.numeric(clusters),
        description=vapply(clusters, function(id) paste0("other.",id), character(1)),
        stringsAsFactors=FALSE)
    rownames(dat) <- clusters
    dat
}


In [16]:
# Compose the cluster overview figure: a ternary plot of all triads coloured
# by `description`, one A/B/D box plot per description, and a summary table.
#
# Arguments:
#   clust_df  data frame with columns A, B, D, group_id, clust, description.
#   title     text placed at the top of the arranged figure.
#
# Returns the grob built by arrangeGrob (draw it with grid.draw or save it).
plot_clust_dist<-function(clust_df, title="All"){
    # Long format: one row per (triad, genome) with the normalised value.
    tmp_df<-clust_df[,c("A","B","D","group_id","clust","description")]
    tmp_df<-melt(tmp_df,id=c("group_id","clust","description"))
    colnames(tmp_df)<-c("group_id","clust","description","chr_group","normalised_triad")
    
    clusters<-sort(unique(tmp_df$description))
    tern <- ggtern(clust_df,aes(A,B,D,color=description, alpha=0.01)) + 
       geom_point() + theme_legend_position(x = "topleft") + scale_alpha(guide = 'none')
    # gs collects the grobs in layout order: ternary plot first.
    gs<-list(tern)
    dat <- data.frame(
        A=numeric(0),B=numeric(0), D=numeric(0), size=numeric(0),stringsAsFactors=FALSE ) 
    
    
    # NOTE(review): `clusters` is a plain vector, so rownames(clusters) is
    # NULL and this line appears to have no effect - confirm intent.
    rownames(dat)<-rownames(clusters)
    for(c in clusters){
        tmp_df_clust<-tmp_df[tmp_df$description==c,]
        p<-plot_normalized_triads(tmp_df_clust)
        p<- p + ylim(0,1)
        p<- p + ylab("") + xlab("")
        p<- p + ggtitle(c)
        # Rows of `dat` are created on the fly, named by the description;
        # columns 1-3 hold the mean contribution in percent.
        dat[c,1] <- round(100*mean(tmp_df_clust[tmp_df_clust$chr_group=="A","normalised_triad"]),digits=2)
        dat[c,2] <- round(100*mean(tmp_df_clust[tmp_df_clust$chr_group=="B","normalised_triad"]),digits=2)
        dat[c,3] <- round(100*mean(tmp_df_clust[tmp_df_clust$chr_group=="D","normalised_triad"]),digits=2)
        # Counts rows of the melted frame, i.e. one per (triad, genome) pair.
        dat[c,4] <- nrow(tmp_df_clust)
        gs[[length(gs)+1]] <- p
    }
    
    total_size<-sum(dat$size)
    dat$percentage<-round(100*dat$size/total_size,digits=2)
    
    # The table is appended last; the layout below addresses grobs 1..9, so
    # it expects the ternary plot, seven box plots and the table.
    gs[[length(gs)+1]]<-tableGrob(dat)
    lay <- rbind(c( 1, 1, 1, 2, 4, 7),
                 c( 1, 1, 1, 3, 5, 8),
                 c( 9, 9, 9, 6,NA,NA)
                 )

    g2 <- arrangeGrob(grobs = gs, layout_matrix = lay, top = title)
    g2
}


#head(mat_centroid_df)
#head(mat_centroid_df[,c("A","B","D")])
#grid.draw(plot_clust_dist(mat_centroid_df))

In [17]:
# Per-gene mean TPM for all samples ("all"), for each level of the metadata
# column `type`, for the mean of those level means ("all_means") and for the
# mean of the level means that reach `min_mean_tpm` ("all_mean_filter").
#
# NOTE: a second function named get_means_df, with a different signature
# (metadata, tpms, triads, type), is defined later in this file and replaces
# this definition once that cell has run.
#
# Returns a long data frame with columns value, factor, gene, samples.
# The print() calls are debugging output.
get_means_df<-function(metadata, tpms, type="High.level.tissue", min_mean_tpm=0.5){
    samples<-getSamplesForFactor(metadata, type="all",factor="all")
    values<-data.frame(value=numeric(nrow(tpms)),stringsAsFactors=FALSE)
    if(length(samples) > 1){
        print("All samples:")
        #print(samples)
        #print(colnames(tpms))
        values$value<-rowMeans(tpms[,samples])
    }else{
        values$value<-tpms[,samples]
    }
     
    values$factor<-"all"
    values$gene<-rownames(tpms)
    values$samples<-length(samples)
    
    print(unique(metadata[,type]))
    
    # One block of rows per level of `type`, appended to `values`.
    for(f in as.character(unique(metadata[,type]))){
        print(f)
        samples<-getSamplesForFactor(metadata, type=type,factor=f)
        
        tmp<-data.frame(value=numeric(nrow(tpms)),stringsAsFactors=FALSE)
        
        if(length(samples) > 1){
            tmp$value<-rowMeans(tpms[,samples])
        }else{
            tmp$value<-tpms[,samples]
        }
        
        tmp$factor<-f
        tmp$gene<-rownames(tpms)
        tmp$samples<-length(samples)
        print(colnames(values))
        print(colnames(tmp))
        values<-rbind(values,tmp)
    }
    
    # Gene x level matrix of means; the "all" column is removed so that only
    # the per-level means are averaged below.
    casted<-dcast(values, gene~factor, value.var="value")
    casted$all<-NULL
   
    rownames(casted)<-casted$gene
    casted$gene<-NULL
    casted<-as.matrix(casted)
    print(head(casted))
    tmp<-data.frame(value=numeric(nrow(casted)),stringsAsFactors=FALSE)
    
    tmp$value<-rowMeans(casted)
    tmp$factor<-"all_means"
    tmp$gene<-rownames(casted)
    tmp$samples<-ncol(casted)
    print("...")
    print(colnames(values))
    print(colnames(tmp))
    values<-rbind(values,tmp)
    
    print(head(casted))
    print(head(min_mean_tpm))
    # Level means below the threshold are masked before averaging; `samples`
    # then records how many levels contributed for each gene.
    casted<-ifelse(casted < min_mean_tpm, NA, casted)
    print(head(casted))
    
    tmp<-data.frame(value=numeric(nrow(casted)),stringsAsFactors=FALSE)
    tmp$value<-rowMeans(casted, na.rm = TRUE)
   
    tmp$factor<-"all_mean_filter"
    tmp$gene<-rownames(casted)
    tmp$samples<-rowSums(!is.na(casted))
    print("...")
    print(colnames(values))
    print(colnames(tmp))
    
    values<-rbind(values,tmp)
    values
}

In [18]:
# Per-gene mean TPM, annotated with triad membership.
#
# Means are computed over all samples (factor "all"), over the samples of
# each level of the metadata column `type`, and over those level means
# (factor "all_means").
#
# Arguments:
#   metadata  sample metadata (needs `Sample.IDs` and the `type` column).
#   tpms      expression table, genes in rows and samples in columns.
#   triads    wide triad table handed to reshape_triad_groups.
#   type      metadata column whose levels define the groups.
#
# Returns a data frame with columns value, factor, gene, samples, chr_group
# and group_id (the last two come from a left join on the gene id).
get_means_df<-function(metadata, tpms, triads, type="High.level.tissue"){
    # Rows for one group of samples: the mean per gene, or the raw column
    # when the group has at most one sample.
    mean_block <- function(sample_ids, label){
        block <- data.frame(value=numeric(nrow(tpms)),stringsAsFactors=FALSE)
        block$value <- if(length(sample_ids) > 1){
            rowMeans(tpms[,sample_ids])
        }else{
            tpms[,sample_ids]
        }
        block$factor <- label
        block$gene <- rownames(tpms)
        block$samples <- length(sample_ids)
        block
    }

    blocks <- list(
        mean_block(getSamplesForFactor(metadata, type="all",factor="all"), "all"))
    for(f in unique(metadata[,type])){
        level_samples <- getSamplesForFactor(metadata, type=type,factor=f)
        blocks[[length(blocks)+1]] <- mean_block(level_samples, f)
    }
    values <- do.call(rbind, blocks)

    # Gene x level matrix without the "all" column, used for the mean of the
    # level means.
    wide <- dcast(values, gene~factor, value.var="value")
    wide$all <- NULL
    rownames(wide) <- wide$gene
    wide$gene <- NULL
    wide <- as.matrix(wide)

    overall <- data.frame(value=numeric(nrow(wide)),stringsAsFactors=FALSE)
    overall$value <- rowMeans(wide)
    overall$factor <- "all_means"
    overall$gene <- rownames(wide)
    overall$samples <- ncol(wide)
    values <- rbind(values,overall)

    # The query below refers to the locals `values` and `triads_flat` by name.
    triads_flat <- reshape_triad_groups(triads)
    sqldf("SELECT `values`.*, chr_group, group_id FROM `values` 
LEFT JOIN triads_flat on `values`.gene = triads_flat.gene")
}

In [19]:

# Distance of every triad to the global centroid for one factor, plus the
# diagnostic plots built from those distances.
#
# Arguments:
#   normalised_triad  long table with columns factor, group_id, chr_group,
#                     triad_sum and normalised_triad.
#   factor            value of the `factor` column to analyse.
#   min_triad_sum     triads are kept when triad_sum is strictly above this.
#
# Returns a named list: distances, triad, centroid, p_dist, dist_sample (a
# random sample of at most 1500 rows of `distances`), p_rank, p_genome_dist,
# plot, ranked_triads and matrix.
get_triad_details<-function(normalised_triad, factor="all",min_triad_sum=1){
    tmp<-normalised_triad[factor==normalised_triad$factor,]
    tmp<-tmp[tmp$triad_sum>min_triad_sum,]
    norm_triad_mat<-get_triad_matrix(tmp, factor=factor)
    centroid<-t(as.matrix(colMeans(norm_triad_mat)))

    dists<-rdist(norm_triad_mat,centroid)
    rownames(dists)<-rownames(norm_triad_mat)
    colnames(dists)<-c("Distance")

    # Rank of each distance expressed as a fraction of the number of triads.
    ranked_dists<-as.matrix(rank(dists)/length(dists))
    rownames(ranked_dists)<-rownames(dists)
    colnames(ranked_dists)<-c("P rank")
    rank_df<-data.frame(dists,ranked_dists)
    rank_df$group_id<-rownames(rank_df)
    # The query refers to the locals `tmp` and `rank_df` by name.
    ranked_triads<-sqldf("SELECT tmp.*, Distance, `P.rank` FROM tmp NATURAL JOIN rank_df ")

    t_centroid <- tableGrob(round(centroid*100,digits=2))
    p_dist<-ggplot(data=rank_df, aes(Distance)) + geom_histogram(bins=100,aes(y =..density..)) +geom_density(col=2) 
    p_rank<-ggplot(rank_df,aes(Distance,P.rank))+  geom_bin2d(bins = 75)+theme(legend.position="bottom")
    p_genome_dist<-plot_normalized_triads(tmp)

    # Cap the sample size at the number of rows: sampling 1500 without
    # replacement stops with an error when fewer triads pass the filter.
    n_triads<-nrow(rank_df)
    s<-sample(seq_len(n_triads), min(1500, n_triads),  replace=FALSE)
    rank_s <- rank_df[s,]
    title<-paste0("Genome contribution in triads: ", factor)
    p<-arrangeGrob(p_genome_dist, p_dist, t_centroid, p_rank
             , ncol=2, top = title)

    list(distances=rank_df,triad=tmp ,centroid=centroid, p_dist=p_dist, dist_sample=rank_s, 
         p_rank=p_rank, p_genome_dist=p_genome_dist, plot=p, ranked_triads=ranked_triads, matrix=norm_triad_mat) 
}


# Assign every triad of one factor to the nearest of seven reference points
# (Central, A/B/D dominant, A/B/D suppressed), summarise the categories and
# optionally write the plots and tables to disk.
#
# Arguments:
#   normalized_triads  long table as expected by get_triad_details.
#   factor             value of the `factor` column to analyse.
#   output_prefix      prefix for the output files. Files are written whenever
#                      this has length > 0, which includes the default "";
#                      pass NULL to skip writing.
#   min_triad_sum      forwarded to get_triad_details and recorded in the
#                      saved tables.
#   title              plot title; ": <factor>" is appended.
#
# Returns the list from get_triad_details with these changes: `triad` is
# replaced by the per-gene table with category columns, and
# `expectation_distance`, `centroids` and `plot_cluster` are added.
get_clusters_by_distance_and_plots<-function(normalized_triads, 
                                             factor="all", 
                                             output_prefix="",
                                             min_triad_sum=1,
                                             title="Triads"){
    f<-factor
    triad_test<-get_triad_details(normalized_triads,factor=factor,min_triad_sum=min_triad_sum)
    triad<-triad_test$ranked_triads

    test_mat<-triad_test$matrix
    mat_df<-data.frame(test_mat)
    mat_df$group_id<-rownames(test_mat)

    title<-paste0(title, ": " , f)

    # Reference points in (A, B, D) space, one row per category.
    clust_desc<-c("Central",
                  "A.dominant",  "B.dominant",  "D.dominant",
                  "A.suppressed","B.suppressed","D.suppressed")
    centers<-t(matrix(c(0.33,0.33,0.33,1,0,0,0,1,0,0,0,1,0,0.5,0.5,0.5,0,0.5,0.5,0.5,0), nrow=3))
    colnames(centers)<-c("A","B","D")
    rownames(centers)<-clust_desc

    expectation_distance<-rdist(test_mat,centers)
    colnames(expectation_distance)<-clust_desc
    rownames(expectation_distance)<-rownames(test_mat)
    triad_test$expectation_distance<-expectation_distance
    # Index (1..7) of the closest reference point for each triad.
    mins<-apply( expectation_distance, 1, which.min)
    name_mins<-clust_desc[mins]

    general_desc<-c("Central","Dominant",  "Dominant",  "Dominant",
                    "Suppressed","Suppressed","Suppressed")
    general_name_mins<-general_desc[mins]

    mat_df$clust<-mins
    mat_df$description<-name_mins
    mat_df$general_description<-general_name_mins

    mat_df<-cbind(mat_df,expectation_distance)
    triad_test$triad<-mat_df
    centroids<-get_centroids(mat_df)

    # Name each centroid from its own cluster id instead of assuming that all
    # seven categories are present and in order.
    centroids$description<-clust_desc[centroids$clust]

    # Extra summary row (row name "8", clust 8) holding the global centroid.
    total_genes<-sum(centroids$size)
    centroids["8","A"]<-triad_test$centroid[,"A"]
    centroids["8","B"]<-triad_test$centroid[,"B"]
    centroids["8","D"]<-triad_test$centroid[,"D"]
    centroids["8","description"] <- "Global"
    centroids["8","size"]<-total_genes
    centroids["8","clust"]<-8
    centroids$factor<-factor
    centroids$percentage<-centroids$size/total_genes

    p<-plot_clust_dist(mat_df, title=title)

    # The query refers to the locals `mat_df` and `triad` by name.
    table_save<-sqldf("SELECT * FROM mat_df
    NATURAL LEFT JOIN triad   
    ORDER BY triad.group_id, triad.chr_group")
    table_save<-table_save[ , !(names(table_save) %in% c("A","B","D"))]
    table_save$min_triad_sum<-min_triad_sum
    triad_test$triad<-table_save
    triad_test$centroids<-centroids
    triad_test$plot_cluster<-p
    # These two columns are added after `centroids` was stored in the return
    # value, so they only appear in the file written below.
    centroids$samples<-mean(triad_test$triad$samples)
    centroids$min_triad_sum<-min_triad_sum
    if(length(output_prefix) > 0){
        # Make the factor safe for a file name. The second substitution must
        # work on f2, otherwise the newline replacement is thrown away.
        f2<-gsub("\n","-",f)
        f2<-gsub("/","_",f2)
        p_filename<-paste0(output_prefix,"_",f2,"_triad_dist.pdf")
        ggsave(p_filename,plot=triad_test$plot,  width = 30, height = 25, units = c("cm"))

        p_filename<-paste0(output_prefix,"_",f2,"_triad_cluster.pdf")
        ggsave(p_filename,plot=p,  width = 30, height = 25, units = c("cm"))

        t_filename<-paste0(output_prefix,"_",f2,"_triad_cluster.txt")
        write.table(table_save, file=t_filename,
            sep="\t",quote=TRUE,row.names=FALSE, na="" )

        t_filename<-paste0(output_prefix,"_",f2,"_triad_centroids.txt")
        write.table(centroids, file=t_filename,
            sep="\t",quote=TRUE,row.names=FALSE, na="")
    }
    triad_test
}


# One-sided Kolmogorov-Smirnov tests comparing the values of the "Central"
# category against every category, separately for chromosome groups A, B, D.
#
# Arguments:
#   meanTPMS  data frame with columns factor, description, chr_group, value.
#   factor    value of the `factor` column to test.
#
# Returns a data frame with one row per (chr_group, description, alternative);
# the ks.test fields are stored as character, plus numeric fdr_by and
# bonferroni columns and the `factor` label.
ks.central<-function(meanTPMS, factor="all"){
    m<-meanTPMS[meanTPMS$factor==factor,c("description","chr_group","value")]

    rows<-list()
    for(gr in c("A","B","D")){
        in_group<-m$chr_group==gr
        # ks.test stores the argument names in its data.name field, so the
        # two sample vectors keep the names `central` and `compare`.
        central<-m[m$description=="Central" & in_group, "value"]
        for(d in unique(meanTPMS$description)){
            compare<-m[m$description==d & in_group, "value"]
            for(alt in c("less","greater")){
                test<-ks.test(central, compare, alternative=alt)
                test$chr_grp<-gr
                test$compare<-d
                test$alternative<-alt
                row<-data.frame(t(unlist(test)),stringsAsFactors = FALSE)
                # Every row takes the column names of the first one (the
                # statistic is named differently for the two alternatives).
                if(length(rows) > 0){
                    colnames(row)<-colnames(rows[[1]])
                }
                rows[[length(rows)+1]]<-row
            }
        }
    }
    test_df<-do.call(rbind, rows)

    p_values<-as.numeric(test_df$p.value)
    test_df$fdr_by<-p.adjust(p_values,method="BY")
    test_df$bonferroni<-p.adjust(p_values,method="bonferroni")
    test_df$factor<-factor
    test_df
}

# Save a four-panel PDF of TPM box plots per triad category.
#
# Arguments:
#   meansTPM  data frame with columns factor, description,
#             general_description, chr_group and value.
#   title     text placed at the top of the figure.
#   filename  output prefix; "_TPM_by_triad_disribution.pdf" is appended.
#
# Panels 1, 3 and 4 use the rows with factor "all_mean_filter"; panel 2 uses
# every factor except "all", "all_means" and "all_mean_filter", one facet per
# factor. The y axis of every panel is limited to 0-50.
plotMeansTPM<-function(meansTPM, title="Test", filename="test"){
    filtered_mean<-meansTPM[meansTPM$factor=="all_mean_filter",]
    per_factor<-meansTPM
    for(summary_label in c("all","all_means","all_mean_filter")){
        per_factor<-per_factor[per_factor$factor!=summary_label,]
    }

    # Shared start of every panel: box plot with faint outliers, fixed range.
    tpm_boxplot<-function(data, mapping){
        ggplot(data, mapping) +
            geom_boxplot(outlier.alpha = 0.05) +
            ylim(c(0, 50))
    }
    vertical_labels<-element_text(angle = 90, hjust = 1)

    by_category_genome<-tpm_boxplot(filtered_mean, aes(description, value, fill=chr_group)) +
        theme(axis.text.x = vertical_labels,
              legend.position="bottom") +
        ylab("TPM") + xlab("Category")

    by_category_factor<-tpm_boxplot(per_factor, aes(description, value)) +
        theme(axis.text.x = vertical_labels) +
        facet_wrap(~ factor,  drop = TRUE) +
        ylab("TPM") + xlab("Category")

    by_category<-tpm_boxplot(filtered_mean, aes(description, value)) +
        theme(axis.text.x = vertical_labels,
              legend.position="bottom") +
        ylab("TPM") + xlab("")

    by_general_genome<-tpm_boxplot(filtered_mean, aes(general_description, value, fill=chr_group)) +
        theme(axis.text.x = vertical_labels,
              legend.position="bottom") +
        ylab("TPM") + xlab("Category")

    # Grob numbers follow the argument order of arrangeGrob below.
    lay <- rbind(c( 3,2,2,4),
                 c( 1,2,2,NA))
    figure<-arrangeGrob(by_category_genome, by_category_factor,
                        by_category, by_general_genome,
                        layout_matrix = lay, top = title)

    ggsave(paste0(filename,"_TPM_by_triad_disribution.pdf"),
           plot=figure,  width = 30, height = 25, units = c("cm"))
}


# Recompute the normalised triad contributions from per-factor values,
# averaging each gene over the factors for which it has a value.
#
# Arguments:
#   meansTPM  long table with columns gene, factor and value; rows with
#             factor "all" or "all_means" are ignored.
#   triads    wide triad table handed to reshape_triad_groups.
#
# Returns a data frame with columns value, factor ("all_mean_filter"), gene,
# samples (number of non-missing factors), chr_group, group_id, triad_sum and
# normalised_triad (value / triad_sum).
get_normalized_triad_from_clusters<-function(meansTPM, triads){
    per_factor<-meansTPM[meansTPM$factor!="all",]
    per_factor<-per_factor[per_factor$factor!="all_means",]

    # Gene x factor table; combinations without a value stay NA.
    wide<-dcast(per_factor, gene~factor, value.var="value",drop=FALSE, fill=NA)
    rownames(wide)<-wide$gene
    wide$gene<-NULL

    # The SQL below refers to `values`, `triads_flat`, `means_group` and
    # `sums_per_group` by name.
    values<-data.frame(value=numeric(nrow(wide)),stringsAsFactors=FALSE)
    values$value<-rowMeans(wide, na.rm = TRUE)
    values$factor<-"all_mean_filter"
    values$gene<-rownames(wide)
    values$samples<-rowSums(!is.na(wide))

    triads_flat<-reshape_triad_groups(triads)

    means_group<-sqldf("SELECT `values`.*, chr_group, group_id FROM `values` 
LEFT JOIN triads_flat on `values`.gene = triads_flat.gene")

    sums_per_group<-sqldf("SELECT group_id , factor, sum(value) as triad_sum 
FROM means_group GROUP BY group_id , factor")

    sqldf("SELECT means_group.*, triad_sum, value/triad_sum as normalised_triad 
FROM means_group LEFT JOIN sums_per_group 
ON  sums_per_group.group_id = means_group.group_id 
AND sums_per_group.factor = means_group.factor ")
}



# Run the whole triad analysis for one set of samples and write the plots and
# tables below `folder`/`dataset`/`type`/min_tpm_sum_<threshold>/.
#
# Arguments:
#   metadata             sample metadata restricted to the samples to use.
#   tpms                 expression table, genes in rows, samples in columns.
#   homologies           triad table with columns group_id, A, B, D, synteny
#                        and cardinality_abs.
#   genes_to_use         one-column data frame of gene ids; a triad is kept
#                        when at least one of its genes is listed.
#   type                 metadata column whose levels are analysed.
#   dataset              label used in titles and in the output path.
#   folder               root output directory.
#   min_triad_sums       thresholds on the triad TPM sum; one run each.
#   min_mean_triad_sums  not used by this function.
#
# Only triads with synteny 'segmental homeologs' and cardinality '1:1:1' are
# analysed. Nothing useful is returned; the results are the written files.
get_triads_and_plots<-function(metadata,tpms, homologies, genes_to_use, 
                               type="High.level.tissue", 
                               dataset="All",
                               folder="./Figures/GenomeDominance/latest", 
                               min_triad_sums=c(10), 
                               min_mean_triad_sums=c(0.5) ){
    triadas_with_genes<-sqldf("SELECT * from homologies 
WHERE synteny='segmental homeologs' AND cardinality_abs = '1:1:1'
AND 
(A in genes_to_use 
OR B in genes_to_use 
OR D in genes_to_use )
")

    tpms_for_triads<-tpms[c(as.character(triadas_with_genes$A), 
         as.character(triadas_with_genes$B),
         as.character(triadas_with_genes$D)),]
    print(nrow(tpms_for_triads))
    means_group<-get_means_df(metadata, tpms_for_triads,triadas_with_genes, type=type)

    # The SQL below refers to `means_group` and `sums_per_group` by name.
    sums_per_group<-sqldf("SELECT group_id , factor, sum(value) as triad_sum 
FROM means_group GROUP BY group_id , factor")

    normalized_triads<-sqldf("SELECT means_group.*, triad_sum, value/triad_sum as normalised_triad 
FROM means_group LEFT JOIN sums_per_group 
ON  sums_per_group.group_id = means_group.group_id 
AND sums_per_group.factor = means_group.factor ")

    for(min_triad_sum in min_triad_sums){
        ks_ret<-NULL
        centroids<-NULL
        triads<-NULL
        out_dir<-paste0(folder,"/",dataset,"/",type,"/min_tpm_sum_",min_triad_sum,"/")
        out_dir<-gsub(" ","_",out_dir)
        dir.create(out_dir, showWarnings = TRUE, recursive = TRUE, mode = "0777")
        # Build the file prefix from the directory that was actually created;
        # using the unsanitised path pointed at a missing directory whenever
        # dataset or type contained a space.
        path<-paste0(out_dir,"min_tpm_sum_",min_triad_sum)

        for(f in unique(normalized_triads$factor)){
            title<-paste0(dataset," triads:" , f, ".\nMinimum triad TPM sum: ", min_triad_sum)
            clusters<-get_clusters_by_distance_and_plots(normalized_triads, factor=f, title=title, 
                                                         min_triad_sum=min_triad_sum,
                           output_prefix=path)
            suppressWarnings(
                tmp<-ks.central(clusters$triad, factor=f)
            )
            if(is.null(centroids)){
                centroids<-clusters$centroids
                triads<-clusters$triad
                ks_ret<-tmp
            }else{
                centroids<-rbind(centroids,clusters$centroids)
                triads<-rbind(triads,clusters$triad) 
                ks_ret<-rbind(ks_ret, tmp)
            }
        }

        # Second pass on the means of the per-factor values.
        filter_factor<-"all_mean_filter"
        normalized_triads_filter<-get_normalized_triad_from_clusters(triads,triadas_with_genes )
        # Title names the factor being plotted rather than whatever `f` was
        # left at by the loop above.
        title<-paste0(dataset," triads:" , filter_factor, ".\nMinimum triad TPM sum: ", min_triad_sum)

        clusters<-get_clusters_by_distance_and_plots(normalized_triads_filter, factor=filter_factor, 
                                                     title=title, 
                                                     min_triad_sum=min_triad_sum,
                                                     output_prefix=path)

        centroids<-rbind(centroids,clusters$centroids)
        triads<-rbind(triads,clusters$triad) 

        suppressWarnings(
            tmp<-ks.central(clusters$triad, factor=filter_factor)
        )
        ks_ret<-rbind(ks_ret, tmp)

        # Summary outputs; `path` is a file prefix, so these names are glued
        # directly onto it (kept unchanged for existing consumers).
        t_filename<-paste0(path,"min_tpm_sum",min_triad_sum)
        title<-paste0("Category distribution ", dataset, "\n", type, "\n", "Min TPM average sum: ", min_triad_sum)
        plotMeansTPM(triads, filename=t_filename, title=title)

        t_filename<-paste0(path, "min_tpm_sum_", min_triad_sum, "_ks.txt")
        write.table(ks_ret, file=t_filename, sep="\t",quote=TRUE,row.names=FALSE, na="")
        t_filename<-paste0(path,"summary_triad_centroids.txt")
        write.table(centroids, file=t_filename, sep="\t",quote=TRUE,row.names=FALSE, na="")
        t_filename<-paste0(path,"summary_triad_cluster.txt")
        write.table(triads, file=t_filename, sep="\t",quote=TRUE,row.names=FALSE, na="")
    }
}
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               metadata<-read.csv("/Users/ramirezr/Dropbox/JIC/expVIPMetadatas/RefSeq1.0/metadatas/Metadata_june11.txt", row.names = 1, sep="\t")
      


metadata <- metadata_used[metadata_used[,"Variety"] == "Chinese Spring",]
metadata <- metadata[metadata[,"High.level.stress.disease"] == "none",]
      
nrow(metadata)
get_triads_and_plots(metadata,tpms,homologies,hc_genes_to_use, type="High.level.tissue",
                               dataset="HC_CS_no_stress" , min_triad_sum=c(1))



[1] 51108


“Removed 1653 rows containing non-finite values (stat_boxplot).”

In [20]:
# Table that maps each sample id to a subset name and to the metadata column
# to use for it (columns Sample.IDs, subset, type_to_use; see output below).
datasets<-read.csv("./TablesForExploration/samples_to_use.csv")
head(datasets)

Sample.IDs,subset,type_to_use
Sample_10,Development,Intermediate
Sample_18,Development,Intermediate
Sample_3A,Development,Intermediate
Sample_12,Development,Intermediate
Sample_26,Development,Intermediate
Sample_6A,Development,Intermediate


In [21]:
# Distinct subset names found in the samples-to-use table.
subsets<-unique(datasets$subset)

In [22]:

# Normalise homoeolog expression within 1:1:1 segmental triads, cluster the
# triads for every level of the grouping column, and write the resulting plots
# and summary tables to disk.
#
# Arguments:
#   metadata             Sample metadata; forwarded unchanged to get_means_df().
#   tpms                 Expression table indexed by gene name in its rows (it
#                        is subset with the A, B and D gene names of the
#                        selected triads).
#   homologies           Homology table queried through sqldf; the query uses
#                        its columns synteny, cardinality_abs, A, B and D.
#   genes_to_use         Table of genes to keep. It is referenced as a table in
#                        the SQL `IN` clause -- presumably a one-column data
#                        frame; TODO confirm against the caller.
#   type                 Name of the metadata column used for grouping; coerced
#                        to character, so a factor is accepted.
#   dataset              Label used in plot titles and in the output path.
#   folder               Root folder for all outputs.
#   min_triad_sums       One full set of outputs is produced per value; each
#                        value is forwarded to
#                        get_clusters_by_distance_and_plots() as min_triad_sum.
#   min_mean_triad_sums  Currently unused; kept for interface compatibility.
#
# Output files go to <folder>/<dataset>/<type>/min_tpm_sum_<value>/ and all
# share the file-name prefix "min_tpm_sum_<value>".
#
# Returns NULL invisibly; the function is called for its side effects.
get_triads_and_plots <- function(metadata, tpms, homologies, genes_to_use,
                                 type = "High.level.tissue",
                                 dataset = "All",
                                 folder = "./Figures/GenomeDominance/latest",
                                 min_triad_sums = c(10),
                                 min_mean_triad_sums = c(0.5)) {
    # sqldf resolves the table names inside the SQL text against the variables
    # of this function, so `homologies`, `genes_to_use`, `means_group` and
    # `sums_per_group` must keep exactly these names.
    triadas_with_genes <- sqldf("SELECT * from homologies 
WHERE synteny='segmental homeologs' AND cardinality_abs = '1:1:1'
AND 
(A in genes_to_use 
OR B in genes_to_use 
OR D in genes_to_use )
")
    type <- as.character(type)

    triad_genes <- c(as.character(triadas_with_genes$A),
                     as.character(triadas_with_genes$B),
                     as.character(triadas_with_genes$D))
    tpms_for_triads <- tpms[triad_genes, ]

    # The queries below rely on means_group having the columns group_id,
    # factor and value.
    means_group <- get_means_df(metadata, tpms_for_triads, triadas_with_genes, type = type)

    # Total expression of each triad (group_id) within each factor level.
    sums_per_group <- sqldf("SELECT group_id , factor, sum(value) as triad_sum 
FROM means_group GROUP BY group_id , factor")

    # Relative contribution of each homoeolog: value / triad total.
    normalized_triads <- sqldf("SELECT means_group.*, triad_sum, value/triad_sum as normalised_triad 
FROM means_group LEFT JOIN sums_per_group 
ON  sums_per_group.group_id = means_group.group_id 
AND sums_per_group.factor = means_group.factor ")

    factor_levels <- as.character(unique(normalized_triads$factor))
    n_levels <- length(factor_levels)

    for (min_triad_sum in min_triad_sums) {
        # Sanitise spaces once and derive the file prefix from the sanitised
        # directory, so the files are always written into the directory that
        # was actually created.
        out_dir <- gsub(" ", "_",
                        paste0(folder, "/", dataset, "/", type,
                               "/min_tpm_sum_", min_triad_sum, "/"))
        dir.create(out_dir, showWarnings = TRUE, recursive = TRUE, mode = "0777")
        path <- paste0(out_dir, "min_tpm_sum_", min_triad_sum)

        # Collect the per-level results and bind them once after the loop.
        centroid_parts <- vector("list", n_levels)
        triad_parts <- vector("list", n_levels)
        ks_parts <- vector("list", n_levels)

        for (i in seq_len(n_levels)) {
            f <- factor_levels[[i]]
            title <- paste0(dataset, " triads:", f,
                            ".\nMinimum triad TPM sum: ", min_triad_sum)
            clusters <- get_clusters_by_distance_and_plots(normalized_triads, factor = f,
                                                           title = title,
                                                           min_triad_sum = min_triad_sum,
                                                           output_prefix = path)
            ks_level <- suppressWarnings(ks.central(clusters$triad, factor = f))

            # `x[i] <- list(v)` keeps the slot even when v is NULL.
            centroid_parts[i] <- list(clusters$centroids)
            triad_parts[i] <- list(clusters$triad)
            ks_parts[i] <- list(ks_level)
        }

        centroids <- do.call(rbind, centroid_parts)
        triads <- do.call(rbind, triad_parts)
        ks_ret <- do.call(rbind, ks_parts)

        # Second pass over the triads retained by the per-level clustering,
        # labelled "all_mean_filter".
        normalized_triads_filter <- get_normalized_triad_from_clusters(triads, triadas_with_genes)
        # Title names the combined analysis itself rather than whichever
        # factor level happened to be processed last.
        title <- paste0(dataset, " triads:", "all_mean_filter",
                        ".\nMinimum triad TPM sum: ", min_triad_sum)

        clusters <- get_clusters_by_distance_and_plots(normalized_triads_filter,
                                                       factor = "all_mean_filter",
                                                       title = title,
                                                       min_triad_sum = min_triad_sum,
                                                       output_prefix = path)

        centroids <- rbind(centroids, clusters$centroids)
        triads <- rbind(triads, clusters$triad)
        ks_all <- suppressWarnings(ks.central(clusters$triad, factor = "all_mean_filter"))
        ks_ret <- rbind(ks_ret, ks_all)

        # File names are kept exactly as before because other cells read them
        # back (e.g. "<prefix>summary_triad_cluster.txt").
        t_filename <- paste0(path, "min_tpm_sum", min_triad_sum)
        title <- paste0("Category distribution ", dataset, "\n", type, "\n",
                        "Min TPM average sum: ", min_triad_sum)
        plotMeansTPM(triads, filename = t_filename, title = title)

        t_filename <- paste0(path, "min_tpm_sum_", min_triad_sum, "_ks.txt")
        write.table(ks_ret, file = t_filename, sep = "\t", quote = TRUE, row.names = FALSE, na = "")
        t_filename <- paste0(path, "summary_triad_centroids.txt")
        write.table(centroids, file = t_filename, sep = "\t", quote = TRUE, row.names = FALSE, na = "")
        t_filename <- paste0(path, "summary_triad_cluster.txt")
        write.table(triads, file = t_filename, sep = "\t", quote = TRUE, row.names = FALSE, na = "")
    }
    invisible(NULL)
}

# Run the triad analysis once for every sample subset listed in `datasets`.
# For each subset the selected metadata rows are also written next to the
# figures, as <subset folder>/<type_to_use>.txt.
for (s in subsets) {
    print(s)
    samples <- datasets[datasets$subset == s, ]
    # Metadata column this subset is grouped by.
    type_to_use <- unique(samples$type_to_use)
    print(type_to_use)
    metadata <- metadata_used[metadata_used$Sample.IDs %in% samples$Sample.IDs, ]

    ds <- paste0("HC_", s)
    path <- paste0("./Figures/GenomeDominance/latest/", ds)
    # recursive: the parent folders may not exist on a fresh checkout;
    # showWarnings = FALSE: re-running the cell finds the folder already there.
    dir.create(path, showWarnings = FALSE, recursive = TRUE)
    path <- paste0(path, "/", type_to_use, ".txt")
    write.table(metadata, file = path,
                sep = "\t", quote = TRUE, row.names = FALSE, na = "")
    print(nrow(metadata))

    # Argument spelled out in full (min_triad_sums) instead of relying on
    # partial argument matching.
    get_triads_and_plots(metadata, tpms, homologies, hc_genes_to_use,
                         type = type_to_use,
                         dataset = ds, min_triad_sums = c(0.5))
}

[1] "Development"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 209


“Removed 1873 rows containing non-finite values (stat_boxplot).”

[1] "850_samples"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 850


“Removed 1553 rows containing non-finite values (stat_boxplot).”

[1] "CS_no_stress"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control


“'./Figures/GenomeDominance/latest/HC_CS_no_stress' already exists”

[1] 123


“Removed 1549 rows containing non-finite values (stat_boxplot).”

[1] "CS_NB_inc_stress"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 144


“Removed 1539 rows containing non-finite values (stat_boxplot).”

[1] "abiotic"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 50


“Removed 1411 rows containing non-finite values (stat_boxplot).”

[1] "disease"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 163


“Removed 1777 rows containing non-finite values (stat_boxplot).”

[1] "grain"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 119


“Removed 875 rows containing non-finite values (stat_boxplot).”

[1] "leaf"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 245


“Removed 1649 rows containing non-finite values (stat_boxplot).”

[1] "root"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 45


“Removed 1850 rows containing non-finite values (stat_boxplot).”

[1] "spike"
[1] Intermediate
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 128


“Removed 1806 rows containing non-finite values (stat_boxplot).”

[1] "abiotic_merged_control"
[1] Intermediate_Stress_merged_control
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 50


“Removed 1446 rows containing non-finite values (stat_boxplot).”

[1] "disease_merged_control"
[1] Intermediate_Stress_merged_control
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 163


“Removed 1832 rows containing non-finite values (stat_boxplot).”

[1] "stress_control"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 77


“Removed 1562 rows containing non-finite values (stat_boxplot).”

[1] "abiotic_stress_control"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 13


“Removed 1226 rows containing non-finite values (stat_boxplot).”

[1] "disease_stress_control"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 64


“Removed 1692 rows containing non-finite values (stat_boxplot).”

[1] "abiotic_stress"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 34


“Removed 1450 rows containing non-finite values (stat_boxplot).”

[1] "disease_stress"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 106


“Removed 1808 rows containing non-finite values (stat_boxplot).”

[1] "stress"
[1] Intermediate_Stress
3 Levels: Intermediate ... Intermediate_Stress_merged_control
[1] 140


“Removed 1659 rows containing non-finite values (stat_boxplot).”

In [26]:
# Load the per-triad cluster table written for one dataset / type / threshold.
#
# The file is expected at
#   <dir>/<dataset>/<type>/min_tpm_sum_<x>/min_tpm_sum_<x>summary_triad_cluster.txt
# (tab separated, with a header). Rows whose `factor` is "all" or "all_means"
# are dropped; everything else is returned as a data frame.
read_cluster_triads<-function(dir="./Figures/GenomeDominance/latest", min_tpm_sum=1, dataset="HC_CS_no_stress", type="High.level.tissue"){
    threshold_tag <- paste0("min_tpm_sum_", min_tpm_sum)
    cluster_file <- paste0(dir, "/", dataset, "/", type, "/", threshold_tag,
                           "/", threshold_tag,
                           "summary_triad_cluster.txt")
    clusters <- read.delim(cluster_file)
    for (excluded in c("all", "all_means")) {
        clusters <- clusters[clusters$factor != excluded, ]
    }
    clusters
}

In [27]:
# Summarise, for one triad, how its per-factor (A, B, D) positions spread
# around its "all_mean_filter" position and among themselves.
#
# Arguments:
#   triad_mat  Data frame with the columns group_id, factor, description,
#              triad_sum, A, B and D (one row per triad and factor).
#   group_id   Identifier of the triad to summarise.
#
# Returns a named list with:
#   - group_id and factor_count (number of rows other than "all_mean_filter");
#   - central_*: sum / mean / max / sd / max-over-mean of the distances from
#     every per-factor row to the "all_mean_filter" row;
#   - factor_*: the same statistics over rdist() of the per-factor rows among
#     themselves;
#   - category (description of the "all_mean_filter" row), sum_mean_tpm,
#     total_categories, categories;
#   - one count per category name (Central, A.dominant, ...).
# NOTE(review): rdist(x) presumably returns the full symmetric matrix with a
# zero diagonal, so the factor_* sum/mean/sd would include every pair twice
# plus the zeros -- confirm this is intended.
getMovementForTriad<-function(triad_mat, group_id){
    coord_cols <- c("A", "B", "D")
    triad_rows <- triad_mat[triad_mat$group_id == group_id, ]
    is_overall <- triad_rows$factor == "all_mean_filter"

    # Reference row: the triad's position in the combined analysis.
    overall_row <- triad_rows[is_overall, ]
    category <- overall_row$description
    overall_position <- as.matrix(overall_row[, coord_cols])

    # Remaining rows: one position per factor level.
    per_factor <- triad_rows[!is_overall, ]
    positions <- as.matrix(per_factor[, coord_cols])
    rownames(positions) <- per_factor$factor

    # Same five statistics for both distance sets, named with a common prefix.
    summarise_distances <- function(d, prefix) {
        stats <- list(sum(d), mean(d), max(d), sd(d), max(d) / mean(d))
        names(stats) <- paste0(prefix, c("_total_distance", "_mean_distance",
                                         "_max_distance", "_sd_distance",
                                         "_max_over_mean"))
        stats
    }

    descriptions <- per_factor$description
    category_names <- c("Central",
                        "A.dominant", "A.suppressed",
                        "B.dominant", "B.suppressed",
                        "D.dominant", "D.suppressed")
    # How many factor levels fall into each category.
    category_counts <- lapply(category_names, function(k) sum(descriptions == k))
    names(category_counts) <- category_names

    c(list(group_id = group_id,
           factor_count = nrow(per_factor)),
      summarise_distances(rdist(positions, overall_position), "central"),
      summarise_distances(rdist(positions), "factor"),
      list(category = as.character(category),
           sum_mean_tpm = sum(per_factor$triad_sum),
           total_categories = length(unique(descriptions)),
           categories = paste(sort(unique(descriptions)), collapse = ", ")),
      category_counts)
}

In [28]:
# Build two combined tables across all subsets:
#   triads -- the cluster rows read back from disk, tagged with their dataset;
#   res    -- one getMovementForTriad() summary row per triad and dataset.
base_path <- "./Figures/GenomeDominance/latest/"
subset_names <- as.character(subsets)
# Collect per-subset pieces and bind them once after the loop instead of
# growing the data frames on every iteration.
res_parts <- vector("list", length(subset_names))
triads_parts <- vector("list", length(subset_names))
for (i in seq_along(subset_names)) {
    s <- subset_names[[i]]
    print(s)

    samples <- datasets[datasets$subset == s, ]
    type <- unique(samples$type_to_use)
    min <- 0.5
    dataset <- paste0("HC_", s)
    # Argument named in full (min_tpm_sum) rather than via partial matching.
    triads_t <- read_cluster_triads(min_tpm_sum = min, type = type, dataset = dataset)
    triads_t$dataset <- dataset
    # One row per triad and factor, with the normalised values of the
    # chr_group levels spread into columns.
    triads_df <- dcast(triads_t, group_id+factor+description+triad_sum~chr_group, value.var="normalised_triad")
    # sapply() is kept on purpose: the list-matrix it returns is what the
    # transpose / data.frame / unlist steps below expect.
    res_t <- sapply(unique(triads_df$group_id),
                    function(x) {
                        getMovementForTriad(triads_df, x)
                    })
    res_t <- data.frame(t(res_t))
    res_t$dataset <- dataset
    res_t$dataset <- as.factor(res_t$dataset)
    # Every column is still a list at this point; flatten to plain vectors.
    for (col_name in colnames(res_t)) {
        res_t[, col_name] <- unlist(res_t[, col_name])
    }

    #path<-paste0(base_path,dataset,"/",type,"/min_tpm_sum_",min,"/triadMovement.csv")
    #write.csv(res_t, path, row.names=FALSE)
    res_parts[i] <- list(res_t)
    triads_parts[i] <- list(triads_t)
}
res <- do.call(rbind, res_parts)
triads <- do.call(rbind, triads_parts)

[1] "Development"
[1] "850_samples"
[1] "CS_no_stress"
[1] "CS_NB_inc_stress"
[1] "abiotic"
[1] "disease"
[1] "grain"
[1] "leaf"
[1] "root"
[1] "spike"
[1] "abiotic_merged_control"
[1] "disease_merged_control"
[1] "stress_control"
[1] "abiotic_stress_control"
[1] "disease_stress_control"
[1] "abiotic_stress"
[1] "disease_stress"
[1] "stress"


In [29]:
# Persist the combined cluster rows and the per-triad movement summary.
saveRDS(object = triads, file = "./TablesForExploration/Triads.rds")
saveRDS(object = res, file = "./TablesForExploration/TriadMovement.rds")

