-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add code for figures, LICENSE and improve readme
- Loading branch information
1 parent
e848d65
commit 40ac153
Showing
29 changed files
with
6,666 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# Biosynthetic potential of the global ocean microbiome ================================== | ||
# Code used to produce the figures and analysis of the paper | ||
# Extended Data Fig. 1A - Depth distribution ============================================= | ||
|
||
# Libraries ------------------------------------------------------------------------------ | ||
|
||
rm(list = ls()) | ||
library(tidyverse) | ||
library(ggimage) | ||
library(facetscales) | ||
for (src_file in list.files('code/analysis/R')){ | ||
print(paste('Sourcing', src_file, '...')) | ||
source(paste('code/analysis/R', src_file, sep = '/'))} | ||
source('code/analysis/lib/Cell_Press_guidelines.R') | ||
source('../../Exploratorium/sushilab-colors/palettes-paoli.R') | ||
|
||
# Prepare data --------------------------------------------------------------------------- | ||
|
||
# Samples data | ||
# Per-sample table used by the depth plots below. Starts from | ||
# load_general_metadata(), drops the `Internal Sample Name` column and adds: | ||
#   size        - "Time-Series" if the dataset name contains "Time-Series", else "Survey" | ||
#   image       - icon file path looked up by dataset name | ||
#   depth_layer - factor with levels in the order EPI, MES, BAT, ABY | ||
figure_1B_table = | ||
load_general_metadata() %>% | ||
select(-`Internal Sample Name`) %>% | ||
# Remove GORG for now | ||
# rbind(gorg_metadata_from_local() %>% | ||
# mutate(dataset = "GORG SAGs", | ||
# Sample = paste("GORG", Sample), | ||
# station = paste(station, gsub("HOT[0-9]*|BATS[0-9]*", "", cruise)), | ||
# depth_num = as.numeric(depth), | ||
# depth_layer = add_depth_layers(as.character(depth)), | ||
# `size_fraction` = "N/A", | ||
# `temperature [°C]` = "N/A", | ||
# `oxygen [µmol/kg]` = "N/A") %>% | ||
# select(-cruise, -plate, -links)) %>% | ||
mutate( | ||
size = ifelse(grepl("Time-Series", dataset), "Time-Series", "Survey"), | ||
# Named lookup vector indexed by `dataset`; the "GORG SAGs" entry is kept even | ||
# though the GORG rows are currently commented out above. | ||
image = c( | ||
"Biogeotraces" = "code/analysis/lib/icon_circle_biogeotraces.png", | ||
"Malaspina" = "code/analysis/lib/icon_circle_malaspina.png", | ||
"Tara Oceans" = "code/analysis/lib/icon_circle_tara.png", | ||
"Hawaiian Ocean Time-Series" = "code/analysis/lib/icon_clock_hots.png", | ||
"Bermuda-Atlantic Time-Series" = "code/analysis/lib/icon_clock_bats.png", | ||
"GORG SAGs" = "code/analysis/lib/icon_cross_gorg.png" | ||
)[dataset]) %>% | ||
# Fix the level order so the depth layers are arranged from shallow to deep. | ||
mutate(depth_layer = factor(depth_layer, levels=c("EPI", "MES", "BAT", "ABY"))) | ||
|
||
# Plot samples through depths using transformed scale ------------------------------------ | ||
|
||
# Forward / inverse pair for the custom y-axis transformation handed to | ||
# scales::trans_new() below: a value x is mapped to -sqrt(x), and squaring the | ||
# transformed value recovers x (for x >= 0). | ||
trans <- function(x) -sqrt(x) | ||
inv <- function(x) x**2 | ||
|
||
# Single-panel version of the plot: one icon per sample, placed by longitude (x) | ||
# and depth_num (y), with the y axis on the custom "revsqrt_trans" scale. | ||
# NOTE(review): `figure_1B` is reassigned by the faceted version further down and | ||
# only that later object is passed to ggsave(); this version is only printed. | ||
figure_1B = ggplot(figure_1B_table) + | ||
geom_image(aes(x = longitude, y = depth_num, image = image, size = size), asp = 1.85) + | ||
# Two icon sizes, one per value of `size` | ||
# (presumably Survey, then Time-Series, in alphabetical order - TODO confirm). | ||
scale_size_manual(values = c(0.01, 0.0166)) + | ||
scale_y_continuous(trans = scales::trans_new("revsqrt_trans", trans, inv, minor_breaks = scales::regular_minor_breaks(reverse = TRUE)), | ||
limits = c(6000, 0), | ||
breaks = c(0, 200, 1000, 2000, 4000, 6000), | ||
expand=c(0,0)) + | ||
scale_x_continuous(limits = c(-180, 180), expand = c(0,0)) + | ||
ylab('Depth (m)') + | ||
xlab('Longitude') + | ||
# clip = "off" lets icons that sit on the panel border be drawn in full. | ||
coord_cartesian(clip = "off") + | ||
theme_bw() + | ||
theme_cell + | ||
theme(rect = element_blank(), | ||
text = element_text(size = unit(6, "pt")), | ||
plot.margin = margin(1, 1, 1, 1, "mm"), | ||
legend.position = "none", | ||
axis.text.y = element_text(size = unit(6, "pt")), | ||
axis.text.x = element_text(size = unit(6, "pt"), hjust = 1), | ||
axis.title.y = element_text(size = unit(6, "pt")), | ||
axis.title.x = element_text(size = unit(6, "pt")), | ||
axis.ticks.length = unit(0.5, "mm")) | ||
|
||
figure_1B | ||
|
||
# Plot samples through depths using facets ------------------------------------ | ||
|
||
# Per-panel y scales for the faceted plot, named after the depth_layer levels. | ||
# Each scale is reversed and limited to one depth range: | ||
# EPI 0-200, MES 200-1000, BAT 1000-4000, ABY 4000-6000. | ||
scales_y <- list( | ||
`EPI` = scale_y_continuous(trans = "reverse", limits = c(200, 0), breaks = c(0, 100, 200), expand = c(0,0)), | ||
`MES` = scale_y_continuous(trans = "reverse", limits = c(1000, 200), breaks = c(500, 1000), minor_breaks = c(750), expand = c(0,0)), | ||
`BAT` = scale_y_continuous(trans = "reverse", limits = c(4000, 1000), breaks = c(2000, 4000), expand = c(0,0)), | ||
`ABY` = scale_y_continuous(trans = "reverse", limits = c(6000, 4000), breaks = c(5000, 6000), expand = c(0,0)) | ||
) | ||
|
||
# Faceted version of the plot (this reassigns `figure_1B`): one row per | ||
# depth_layer, each row using its own y scale from `scales_y`. | ||
# This is the object written to disk by the ggsave() call below. | ||
figure_1B = ggplot(figure_1B_table) + | ||
geom_image(aes(x = longitude, y = depth_num, image = image, size = size), asp = 12.3) + | ||
# facet_grid_sc() takes the list of per-panel scales built above. | ||
facet_grid_sc(depth_layer~., scales = list(y = scales_y)) + | ||
scale_size_manual(values = c(0.01, 0.0166)) + | ||
scale_x_continuous(limits = c(-180, 180), expand = c(0,0)) + | ||
ylab('Depth (m)') + | ||
xlab('Longitude (°)') + | ||
coord_cartesian(clip = "off") + | ||
theme_bw() + | ||
theme_cell + | ||
theme(rect = element_blank(), | ||
text = element_text(size = unit(6, "pt")), | ||
plot.margin = margin(1, 0, 1, 0, "mm"), | ||
legend.position = "none", | ||
axis.text.y = element_text(size = unit(6, "pt"), hjust = 1), | ||
axis.text.x = element_text(size = unit(6, "pt")), | ||
axis.title.y = element_text(size = unit(6, "pt")), | ||
axis.title.x = element_text(size = unit(6, "pt")), | ||
axis.ticks.length = unit(0.5, "mm"), | ||
strip.background = element_rect(colour = NA, fill = "lightgrey"), | ||
# No vertical gap between panels, so the depth layers read as one continuous axis. | ||
panel.spacing.y = unit(0, 'mm'), | ||
strip.text = element_text(size = 6, colour = "white", face = "bold", margin = margin(0.5, 0.5, 0.5, 0.5, "mm"))) | ||
|
||
figure_1B | ||
|
||
# Save figure ------- | ||
|
||
ggsave(paste0(figures_path_proj, "Figure-1/Figure-1B.pdf"), figure_1B, width = 113, height = 41.5, units = col_unit) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# Biosynthetic potential of the global ocean microbiome ================================== | ||
# Code used to produce the figures and analysis of the paper | ||
# Extended Data Fig. 2A - Abundance correlation binning ================================== | ||
|
||
# Libraries ------------------------------------------------------------------------------ | ||
|
||
rm(list = ls()) | ||
library(tidyverse) | ||
for (src_file in list.files('code/analysis/R')){ | ||
print(paste('Sourcing', src_file, '...')) | ||
source(paste('code/analysis/R', src_file, sep = '/'))} | ||
source('code/analysis/lib/Cell_Press_guidelines.R') | ||
source('/Users/paolil/polybox/PhD/Exploratorium/sushilab-colors/palettes-paoli.R') | ||
|
||
# Load data ------------------------------------------------------------------------------ | ||
|
||
general_metadata = load_general_metadata() | ||
|
||
# Stack the four sample-list files (one sample name per row), tagging each with | ||
# a `backmapping` label ("n=...") and a `description`. | ||
samples_list = rbind(read_tsv("data/raw/go_microbiomics/samples/GEOTRACES.rand10-5-5.samples", col_names = "Internal Sample Name") %>% mutate(backmapping = "n=610", description = "Biogeotraces-HOTS-BATS"), | ||
read_tsv("data/raw/go_microbiomics/samples/TARA_OCEANS_prok.rand10.samples", col_names = "Internal Sample Name") %>% mutate(backmapping = "n=180", description = "Tara Oceans\n(Prok)"), | ||
read_tsv("data/raw/go_microbiomics/samples/TARA_OCEANS_vir.rand10.samples", col_names = "Internal Sample Name") %>% mutate(backmapping = "n=190", description = "Tara Oceans\n(Viral)"), | ||
read_tsv("data/raw/go_microbiomics/samples/MALASPINA.rand5.samples", col_names = "Internal Sample Name") %>% mutate(backmapping = "n=58", description = "Malaspina")) %>% | ||
# Strip the "_METAG" tag so the names match those used in the metadata table. | ||
mutate(`Internal Sample Name` = gsub("_METAG", "", `Internal Sample Name`)) %>% | ||
# No `by` given: the join uses the shared column, `Internal Sample Name`. | ||
left_join(general_metadata %>% select(`Internal Sample Name`, size_fraction)) | ||
|
||
summary_table = load_raw_summary() | ||
|
||
# Bin evaluation summary read from the file below (used later as the | ||
# `Differential Coverage` = FALSE arm), with three columns derived from the bin | ||
# id and folder path: | ||
#   Sample               - `Bin Id` with its last "_"-separated token removed | ||
#   Internal Sample Name - parent directory name of `Folder Location`, minus "_METAG" | ||
#   Method               - basename of `Folder Location` with the "_a<digits>." part removed | ||
without_diffcov = read_tsv("data/raw/go_microbiomics/summaries/diffcov_go_micro_metabat2.c2k.e500_cpl50_ctn10-evaluate_summary.tsv") %>% | ||
mutate(Sample = gsub("_[^_]*$", "", `Bin Id`), | ||
`Internal Sample Name` = gsub('_METAG', '', basename(dirname(`Folder Location`))), | ||
Method = gsub('_a[0-9]+\\.', '', basename(`Folder Location`))) %>% | ||
# Project helper defined elsewhere; presumably adds the Quality / HighQ / GoodQ / | ||
# MediumQ / LowQ columns selected further down - TODO confirm. | ||
add_genomes_quality(weight = 5) | ||
|
||
# Prepare data ------------------------------------------------------------------------------ | ||
|
||
sum(samples_list$`Internal Sample Name` %in% summary_table$`Internal Sample Name`) | ||
samples_list$`Internal Sample Name`[!(samples_list$`Internal Sample Name` %in% summary_table$`Internal Sample Name`)] | ||
|
||
col_subset = c("Internal Sample Name", "Folder Location", "Bin Id", "Mean Completeness", "Mean Contamination", "CheckM Completeness", "CheckM Contamination", "CheckM Strain heterogeneity", "CheckM N50 (scaffolds)", "Anvio Domain", "Anvio Domain Confidence", "Anvio Completion", "Anvio Redundancy", "Anvio # Scaffolds", "Anvio Length", "Folder Location", "Method", "Quality", "HighQ", "GoodQ", "MediumQ", "LowQ") | ||
|
||
# Per-sample comparison of binning results with vs. without differential | ||
# coverage. Three steps: | ||
#   1. stack the two bin tables, flagged by `Differential Coverage`; | ||
#   2. summarise bin counts and quality scores per sample and flag; | ||
#   3. per Sample, take the ratio (with differential coverage) / (without). | ||
figure_2B_table = | ||
rbind( | ||
left_join(samples_list, without_diffcov %>% select(all_of(col_subset))) %>% mutate(`Differential Coverage` = FALSE), | ||
left_join(samples_list, summary_table %>% select(all_of(col_subset))) %>% mutate(`Differential Coverage` = TRUE) | ||
) %>% | ||
group_by(`Internal Sample Name`, `Differential Coverage`) %>% | ||
# NOTE(review): `# MAGs` counts rows with n(), so a sample with no bins still has | ||
# its single all-NA row from the left_join and is counted as 1, not 0. | ||
summarize(backmapping = unique(backmapping), | ||
description = unique(description), | ||
`# MAGs` = n(), | ||
`# HighQ` = sum(HighQ), | ||
`# GoodQ` = sum(GoodQ), | ||
`# MediumQ` = sum(MediumQ), | ||
`# LowQ` = sum(LowQ), | ||
`Cumulative Q-score` = sum(Quality), | ||
`Cumulative Q'-score` = sum(`Mean Completeness` - 5*`Mean Contamination` + (`Mean Contamination`*(`CheckM Strain heterogeneity`/100)) + 0.5*(log10(`CheckM N50 (scaffolds)`))), | ||
`Cumulative Q'-checkm-score` = sum(`CheckM Completeness` - 5*`CheckM Contamination` + (`CheckM Contamination`*(`CheckM Strain heterogeneity`/100)) + 0.5*(log10(`CheckM N50 (scaffolds)`))), | ||
`Average Q-score` = mean(Quality), | ||
`Average Q'-score` = mean(`Mean Completeness` - 5*`Mean Contamination` + (`Mean Contamination`*(`CheckM Strain heterogeneity`/100)) + 0.5*(log10(`CheckM N50 (scaffolds)`))), | ||
`Average Q'-checkm-score` = mean(`CheckM Completeness` - 5*`CheckM Contamination` + (`CheckM Contamination`*(`CheckM Strain heterogeneity`/100)) + 0.5*(log10(`CheckM N50 (scaffolds)`))) | ||
) %>% | ||
# Sums / means over the all-NA rows of unmatched samples are NA; set them to 0. | ||
replace(is.na(.), 0) %>% | ||
# No `by` given: joins on whatever columns the two tables share. | ||
left_join(load_general_metadata()) %>% | ||
group_by(Sample) %>% | ||
# Logical indexing picks the TRUE (with) and FALSE (without) row of each Sample. | ||
summarize(dataset = unique(dataset), | ||
backmapping = unique(backmapping), | ||
description = unique(description), | ||
size_fraction = unique(size_fraction), | ||
`# MAGs ratio` = `# MAGs`[`Differential Coverage`]/`# MAGs`[!`Differential Coverage`], | ||
`Cumulative Q-score ratio` = `Cumulative Q-score`[`Differential Coverage`]/`Cumulative Q-score`[!`Differential Coverage`], | ||
`Cumulative Q'-score ratio` = `Cumulative Q'-score`[`Differential Coverage`]/`Cumulative Q'-score`[!`Differential Coverage`], | ||
`Cumulative Q'-checkm-score ratio` = `Cumulative Q'-checkm-score`[`Differential Coverage`]/`Cumulative Q'-checkm-score`[!`Differential Coverage`], | ||
`Average Q-score ratio` = `Average Q-score`[`Differential Coverage`]/`Average Q-score`[!`Differential Coverage`], | ||
`Average Q'-score ratio` = `Average Q'-score`[`Differential Coverage`]/`Average Q'-score`[!`Differential Coverage`], | ||
`Average Q'-checkm-score ratio` = `Average Q'-checkm-score`[`Differential Coverage`]/`Average Q'-checkm-score`[!`Differential Coverage`]) %>% | ||
# 0/0 gives NaN, which is.na() also catches: those ratios become 1. | ||
# A non-zero value divided by 0 stays Inf and is handled at plotting time. | ||
replace(is.na(.), 1) | ||
|
||
# Plot figure ---------------------------------------------------------------------------- | ||
|
||
# Lookup from a `size_fraction` value to the suffix appended to the facet label | ||
# when building the plot below. Several size fractions share a suffix. | ||
fraction_dict = c("<-0.22" = " (Viral)", | ||
"0.1-0.22" = " (Viral)", | ||
"0.22-0.45" = " (F.L.1)", | ||
"0.45-0.8" = " (F.L.1)", | ||
"0.22-1.6" = " (F.L.2)", | ||
"0.22-3" = " (F.L.2)", | ||
"0.2-0.8" = " (F.L.1)", | ||
"0.8-20" = " (P.A.)") | ||
|
||
plot_order = c("n=190, Tara Oceans (Viral)", "n=190, Tara Oceans (F.L.1)", "n=180, Tara Oceans (F.L.2)", "n=58, Malaspina (F.L.1)", "n=58, Malaspina (P.A.)", "n=610, Biogeotraces", "n=610, Hawaiian Ocean Time-Series", "n=610, Bermuda-Atlantic Time-Series") | ||
|
||
figure_2B_table %>% | ||
#filter(`Cumulative Q'-score ratio` != Inf) %>% | ||
summary() | ||
figure_2B_table %>% | ||
filter(`Cumulative Q'-score ratio` != Inf) %>% | ||
summary() | ||
|
||
# Box plot with jittered points of the `Cumulative Q'-score ratio`, one facet | ||
# per group, on a log10 y axis with a dashed reference line at ratio 1. | ||
figure_2B = figure_2B_table %>% | ||
# The size-fraction suffix is added only for datasets matching "Tara|Mala". | ||
mutate(fraction = ifelse(grepl("Tara|Mala", dataset), fraction_dict[size_fraction], "")) %>% | ||
# Facet label "<backmapping>, <dataset><fraction>"; labels absent from | ||
# `plot_order` become NA when converted to a factor. | ||
mutate(facet = factor(paste0(backmapping, ", ", dataset, fraction), levels = plot_order)) %>% | ||
# NOTE(review): `dot_type` is computed here but no layer below refers to it. | ||
mutate(dot_type = ifelse(`Cumulative Q'-score ratio` %in% c(Inf, 1), "Special Case", "Normal")) %>% | ||
ggplot() + | ||
geom_boxplot(aes(x = facet, y = `Cumulative Q'-score ratio`, color = dataset), size = 0.3, outlier.shape = NA, fill = NA, position = position_dodge(width = 1)) + | ||
geom_point(aes(x = facet, y = `Cumulative Q'-score ratio`, color = dataset, fill = dataset), size = .5, shape = 21, position = position_jitterdodge(jitter.width = 0.1, jitter.height = 0)) + | ||
#geom_point(aes(x = facet, y = mean(`# MAGs ratio`)), shape = 3, size = 3) + | ||
geom_hline(yintercept = 1, linetype = "dashed", size = 0.2) + | ||
scale_color_manual(values = dataset_colors[unique(figure_2B_table$dataset)]) + | ||
scale_fill_manual(values = alpha(dataset_colors[unique(figure_2B_table$dataset)], alpha = 0.6)) + | ||
facet_grid(.~facet, scales = "free_x", space = "free_x") + | ||
coord_cartesian(clip = "off") + | ||
scale_y_log10() + | ||
theme_bw() + | ||
theme_cell + | ||
theme(plot.margin = margin(0,0,0,0, 'mm'), | ||
legend.position = "none", | ||
rect = element_rect(size = NA), | ||
# The x axis is hidden; the facet strips carry the group labels. | ||
axis.title.x = element_blank(), | ||
axis.text.x = element_blank(), | ||
axis.ticks.x = element_blank(), | ||
axis.title.y = element_text(size = 6), | ||
axis.text.y = element_text(size = 6), | ||
panel.spacing.x = unit(0, 'mm'), | ||
strip.background = element_rect(color = "lightgrey", fill = "lightgrey"), | ||
strip.text = element_text(size = 6, colour = "white", face = "bold", margin = margin(0.5, 1, 0.5, 1, "mm"))) | ||
|
||
figure_2B | ||
|
||
ggsave(paste0(figures_path_proj, "Figure-2/Figure-2B.pdf"), figure_2B, width = 48, height = 25, units = "mm") | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Biosynthetic potential of the global ocean microbiome ================================== | ||
# Code used to produce the figures and analysis of the paper | ||
# Extended Data Fig. 2B - Binning MGEs =================================================== | ||
|
||
# Libraries ------------------------------------------------------------------------------ | ||
|
||
rm(list = ls()) | ||
library(viridis) | ||
library(tidyverse) | ||
library(patchwork) | ||
library(googlesheets4) | ||
for (src_file in list.files('code/analysis/R')){ | ||
print(paste('Sourcing', src_file, '...')) | ||
source(paste('code/analysis/R', src_file, sep = '/'))} | ||
source('code/analysis/lib/Cell_Press_guidelines.R') | ||
source('/Users/paolil/polybox/PhD/Exploratorium/sushilab-colors/palettes-paoli.R') | ||
|
||
# Load data ============================================================================== | ||
|
||
# Read the table from the sheet with the id below and fix the level order of the | ||
# three categorical columns. `Binning Result` levels are reversed, giving | ||
# Unbinned, Bin, MAG. | ||
data_tbl = read_sheet('1YqvL7CDaW4fkQl9R9ZAsBKCuFtBCrJ0XV1rjLHA_x4s') %>% | ||
mutate(`Scaffold Length` = factor(`Scaffold Length`, levels = c("≥10Kb", "≥2Kb", "≥1Kb")), | ||
`Binning Result` = factor(`Binning Result`, levels = rev(c("MAG", "Bin", "Unbinned"))), | ||
`Genetic Element` = factor(`Genetic Element`, levels = c("All", "Chromosome", "Plasmid", "Virus", "Unannotated"))) | ||
|
||
# Fill palette: one viridis colour per distinct `Binning Result` value (starting | ||
# at 0.2 on the viridis range). The last colour is replaced by its greyscale | ||
# equivalent. | ||
vir_init = viridis(length(unique(data_tbl$`Binning Result`)), begin = .2) | ||
vir_cols = vir_init[1:(length(vir_init) - 1)] | ||
vir_grey = c(vir_cols, DescTools::ColToGrey(vir_init[length(vir_init)])) # Last color as greyscale | ||
|
||
|
||
# Top panel: bars of `Number of scaffolds` per `Scaffold Length`, one facet per | ||
# `Genetic Element`, each with a free y axis. X-axis title, text and ticks are | ||
# hidden. | ||
p1 = data_tbl %>% | ||
ggplot() + | ||
geom_bar(aes(x = `Scaffold Length`, y = `Number of scaffolds`), stat = 'identity') + | ||
facet_wrap(~`Genetic Element`, nrow = 1, scales = "free_y") + | ||
theme_bw() + | ||
theme_cell + | ||
theme(rect = element_blank(), | ||
plot.margin = margin(1, 1, 1, 1, "mm"), | ||
axis.title.x = element_blank(), | ||
axis.text.x = element_blank(), | ||
axis.ticks.x = element_blank(), | ||
strip.background = element_rect(color = "lightgrey", fill = "lightgrey"), | ||
strip.text = element_text(size = 6, colour = "white", face = "bold", margin = margin(1, 1, 1, 1, "mm"))) | ||
|
||
# Bottom panel: bars of `Perc. of scaffolds` per `Scaffold Length`, filled by | ||
# `Binning Result`, one facet per `Genetic Element`. | ||
p2 = data_tbl %>% | ||
ggplot() + | ||
geom_bar(aes(x = `Scaffold Length`, y = `Perc. of scaffolds`, fill = `Binning Result`), stat = 'identity') + | ||
facet_wrap(~`Genetic Element`, nrow = 1, scales = "free_y") + | ||
# The palette is reversed here; the `Binning Result` factor levels were also | ||
# reversed when the data were loaded. | ||
scale_fill_manual(values = rev(vir_grey)) + | ||
theme_bw() + | ||
theme_cell + | ||
theme(rect = element_blank(), | ||
plot.margin = margin(1, 1, 1, 1, "mm"), | ||
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), | ||
strip.background = element_rect(color = "lightgrey", fill = "lightgrey"), | ||
strip.text = element_text(size = 6, colour = "white", face = "bold", margin = margin(1, 1, 1, 1, "mm"))) | ||
|
||
|
||
p1 / p2 | ||
|
||
ggsave(paste0(figures_path_proj, "Figure-SX/Figure-SX-binning_MGEs.pdf"), width = two_col, height = 100, units = col_unit, device = cairo_pdf) |
Oops, something went wrong.