In [1]:
library(tidyverse)
library(viridis)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.6     v dplyr   1.0.7
v tidyr   1.1.4     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:viridisLite':

    viridis.map



In [2]:
# Switch into the rootstock-comparison directory and list the CSV files in it.
# The pattern is relative, so `fileNames` holds bare file names (no directory).
setwd(dir = "/data/projects/julia.pratt/CS1_genomeSelection/all_rootstock_comparisons/")
fileNames <- Sys.glob(paths = "*.csv")

In [3]:
# Display the current working directory.
getwd()

In [4]:
# Grid of log2 fold-change cutoffs to scan: 0 to 4 in steps of 0.05.
log2FoldChangeValues <- seq(from = 0, to = 4, by = 0.05)

# Column layout of the summary table.
columns <- c(
    'filename', 'direction', 'tissue', 'year',
    'phenology', 'rootstock', 'log2foldChangeValue', 'numGenes'
)

# Start from an empty (zero-row) data frame that carries those column names.
df <- setNames(
    data.frame(matrix(nrow = 0, ncol = length(columns))),
    columns
)

In [5]:
# Filter a differential-expression table by significance and effect size.
#
# Arguments:
#   data              data frame with numeric columns `padj` and
#                     `log2FoldChange`
#   alpha             significance cutoff; rows are kept when padj < alpha
#   log2FoldChangeVal effect-size cutoff applied to log2FoldChange
#   d                 'Up' keeps rows with log2FoldChange > log2FoldChangeVal,
#                     'Down' keeps rows with log2FoldChange < -log2FoldChangeVal
#
# Returns the rows of `data` that pass both filters (possibly zero rows).
# Rows whose padj or log2FoldChange is NA are dropped.
#
# Stops with an error when `d` is neither 'Up' nor 'Down', or when a required
# column is missing from `data`.
log2foldchange <- function(data, alpha, log2FoldChangeVal, d) {
    # Reject a bad direction up front with a clear message, instead of
    # printing a note and then failing on an undefined result.
    if (length(d) != 1L || !(d %in% c('Up', 'Down'))) {
        stop(paste(d, "not a valid direction"), call. = FALSE)
    }

    missingCols <- setdiff(c('padj', 'log2FoldChange'), names(data))
    if (length(missingCols) > 0L) {
        stop("data is missing column(s): ",
             paste(missingCols, collapse = ", "),
             call. = FALSE)
    }

    # Base subsetting keeps the function independent of which `filter()` is
    # currently attached (dplyr's masks stats::filter only after it is loaded).
    adjustedP <- data[['padj']]
    foldChange <- data[['log2FoldChange']]

    # only keeps significant values
    significant <- adjustedP < alpha

    # separates upregulated/downregulated
    if (d == 'Up') {
        passesCutoff <- foldChange > log2FoldChangeVal
    } else {
        passesCutoff <- foldChange < -log2FoldChangeVal
    }

    # which() discards NA comparisons, so genes lacking a padj or a fold
    # change are excluded rather than producing all-NA rows.
    data.adj <- data[which(significant & passesCutoff), , drop = FALSE]

    # Renumber default (integer) row names; explicitly named rows keep theirs.
    if (!is.character(attr(data.adj, "row.names"))) {
        rownames(data.adj) <- NULL
    }

    data.adj
}

In [6]:
# populate the dataframe to look at number of genes by filter
#
# For every CSV in `fileNames` (except names containing "example"), count the
# rows that pass log2foldchange() at padj < 0.05 for each direction and each
# cutoff in `log2FoldChangeValues`, and append one summary row per combination
# to `df`.
#
# Rows are collected in a list and bound to `df` once at the end; calling
# rbind() on `df` inside the innermost loop re-copies the whole table on every
# iteration.
newRows <- list()
for (fileName in fileNames) {
    # no data for this one - don't use it
    if (grepl("example", fileName, fixed = TRUE)) {
        next
    }

    # read data
    sample <- read.csv(fileName)

    # separate metadata based on file name, which is split as
    # <tissue>_<year>_<phenology>_<rootstock1>-<rootstock2>.csv
    metasplit <- str_split(fileName, '_')[[1]]
    tissue <- metasplit[1]
    year <- metasplit[2]
    pheno <- metasplit[3]
    rsPair <- str_split(metasplit[4], '-')[[1]]
    rs1 <- rsPair[1]
    # Strip only a literal trailing ".csv"; an unescaped '.' is a regex
    # wildcard and would match any character followed by "csv".
    rs2 <- str_remove(rsPair[2], '\\.csv$')
    rs <- paste(rs1, '_', rs2, sep = "")

    #apply filters to data
    for (direction in c('Up', 'Down')) {
        for (logVal in log2FoldChangeValues) {
            filtered <- log2foldchange(sample, 0.05, logVal, direction)
            # Count surviving rows directly so the tally does not depend on
            # the table having a column named "X".
            nG <- nrow(filtered)
            # populate data frame
            newrow <- data.frame(filename = fileName,
                                 direction = direction,
                                 tissue = tissue,
                                 year = year,
                                 phenology = pheno,
                                 rootstock = rs,
                                 log2foldChangeValue = logVal,
                                 numGenes = nG)
            newRows[[length(newRows) + 1L]] <- newrow
        }
    }
}

# Bind everything in one step; leave `df` untouched when no file qualified.
if (length(newRows) > 0L) {
    df <- rbind(df, do.call(rbind, newRows))
}

In [7]:
# Turn phenology and rootstock into factors with an explicit level order.
# Values not listed in `levels` become NA.
df[["phenology"]] <- factor(
    df[["phenology"]],
    levels = c("Anthesis", "Veraison", "Harvest")
)
df[["rootstock"]] <- factor(
    df[["rootstock"]],
    levels = c(
        'Ungrafted_1103P', 'Ungrafted_3309C', 'Ungrafted_SO4',
        '1103P_3309C', '1103P_SO4', '3309C_SO4'
    )
)

In [11]:
# Save the full summary table (all files, directions and cutoffs) to disk.
write.csv(
    x = df,
    file = "/data/projects/julia.pratt/CS1_genomeSelection/effectsize.csv"
)

In [10]:
# Files written with relative paths from here on land in the figs/ directory.
setwd(dir = "/data/projects/julia.pratt/CS1_genomeSelection/figs/")

In [None]:
# Write a multi-page PDF: one page per tissue/year combination, plotting the
# number of genes against the log2 fold-change cutoff, coloured by rootstock
# and facetted by direction and phenology.
#
# The plotting loop runs inside tryCatch(..., finally = dev.off()) so the PDF
# device is closed even when a page fails; otherwise the device stays open
# and later plots would keep going into this file.
pdf('log2foldchange_plots_facetdirection.pdf')
invisible(tryCatch({
    for (t in c('Leaf', 'Reproductive')) {
        for (y in c('2017', '2018', '2019')) {
            dfsub <- df %>% filter(tissue == t, year == y)

            # A tissue/year with no rows has nothing to draw; skip its page.
            if (nrow(dfsub) == 0) {
                message("No data for ", t, " ", y, "; skipping page")
                next
            }
            title <- paste(t, y)

            p <- ggplot(dfsub, aes(x=log2foldChangeValue, y=numGenes, color=rootstock)) +
                geom_point() +
                scale_color_viridis(discrete=TRUE) +
                theme_bw() +
                labs(x='log2foldChange Value', y='Number of Genes') +
                ggtitle(title, subtitle="Number of Genes at Each log2foldChange Value") +
                facet_wrap(~ direction + phenology)
            # ggplot objects are not drawn automatically inside a loop
            print(p)
        }
    }
}, finally = dev.off()))


In [None]:
# Write a multi-page PDF: one page per tissue/year combination, plotting the
# number of genes against the log2 fold-change cutoff. Rootstock is mapped to
# colour, direction to point shape (shapes 0 and 1), with one facet per
# phenology stage.
#
# The plotting loop runs inside tryCatch(..., finally = dev.off()) so the PDF
# device is closed even when a page fails; otherwise the device stays open
# and later plots would keep going into this file.
pdf('log2foldchange_plots.pdf')
invisible(tryCatch({
    for (t in c('Leaf', 'Reproductive')) {
        for (y in c('2017', '2018', '2019')) {
            dfsub <- df %>% filter(tissue == t, year == y)

            # A tissue/year with no rows has nothing to draw; skip its page.
            if (nrow(dfsub) == 0) {
                message("No data for ", t, " ", y, "; skipping page")
                next
            }
            title <- paste(t, y)

            p <- ggplot(dfsub, aes(x=log2foldChangeValue, y=numGenes, color=rootstock, shape=direction)) +
                geom_point() +
                scale_color_viridis(discrete=TRUE) +
                scale_shape_manual(values=c(0,1)) +
                theme_bw() +
                labs(x='log2foldChange Value', y='Number of Genes') +
                ggtitle(title, subtitle="Number of Genes at Each log2foldChange Value") +
                facet_wrap(~ phenology)
            # ggplot objects are not drawn automatically inside a loop
            print(p)
        }
    }
}, finally = dev.off()))

In [None]:
# Re-apply the explicit level order to rootstock and phenology so tables and
# plots list them in this sequence. Values not listed in `levels` become NA.
df <- df %>%
    mutate(
        rootstock = factor(
            rootstock,
            levels = c('Ungrafted_1103P', 'Ungrafted_3309C', 'Ungrafted_SO4',
                       '1103P_3309C', '1103P_SO4', '3309C_SO4')
        ),
        phenology = factor(
            phenology,
            levels = c('Anthesis', 'Veraison', 'Harvest')
        )
    )

In [None]:
# For each cutoff from 0 to 4 in steps of 0.5 and each direction, write a wide
# CSV named "<cutoff>_<direction>.csv" into the working directory. Each file
# has one row per year/phenology and one numGenes column per tissue.rootstock.
for (i in seq(0, 4, by = 0.5)) {
    for (dir in c('Up', 'Down')) {
        name <- paste0(i, "_", dir, '.csv')
        x <- df %>%
            # The stored cutoffs come from a 0.05-step floating-point grid, so
            # match them with a tolerance rather than exact `==`.
            filter(near(log2foldChangeValue, i), direction == dir) %>%
            select(tissue, year, phenology, rootstock, numGenes) %>%
            # phenology is the primary sort key, rootstock breaks ties
            arrange(phenology, rootstock) %>%
            pivot_wider(names_from=c('tissue', 'rootstock'), names_sep=".", values_from=numGenes)
        write.csv(x, file=name)
    }
}

In [None]:
# Draw the faceted plot for a single tissue/year (Leaf, 2017) inline:
# number of genes against the log2 fold-change cutoff, coloured by rootstock,
# facetted by direction and phenology.
# The year is compared as a string, matching how the plotting loops filter it.
dfsub <- df %>% filter(tissue == 'Leaf', year == '2017')
# Title previously read 'Leaef' (typo).
title <- paste('Leaf', '2017')

p <- ggplot(dfsub, aes(x=log2foldChangeValue, y=numGenes, color=rootstock)) +
    geom_point() +
    scale_color_viridis(discrete=TRUE) +
    theme_bw() +
    labs(x='log2foldChange Value', y='Number of Genes') +
    ggtitle(title, subtitle="Number of Genes at Each log2foldChange Value") +
    facet_wrap(~ direction + phenology)
print(p)