# Load libraries

In [None]:
library(tidyverse)
library(phyloseq)
library(decontam)
library(gridExtra)
library(ggrepel) 
library(cowplot)
library(vegan)

In [None]:
# Set the repr.plot.width / repr.plot.height options for plots rendered below
options(repr.plot.width=20, repr.plot.height=15)

# Functions

In [None]:
# Recode the `timepoint` and `treatment` sample variables as factors.
#
# phyloseq: phyloseq object whose sample data has `timepoint` and `treatment`
#           columns.
#
# Returns the same object with `timepoint` as a factor with levels
# 0, 28, 90, 180 and `treatment` as a factor whose reference (first) level is
# "healthy_control".
relevel_metadata <- function(phyloseq) {
    meta_df <- data.frame(sample_data(phyloseq))

    # Values outside the listed levels become NA in the resulting factor
    meta_df$timepoint <- factor(meta_df$timepoint, levels = c(0, 28, 90, 180))
    meta_df$treatment <- relevel(factor(meta_df$treatment), ref = "healthy_control")

    sample_data(phyloseq) <- meta_df
    phyloseq
}

In [None]:
# Remove taxa that decontam's prevalence method flags as contaminants.
#
# phyloseq:          phyloseq object whose sample data has a `treatment` column.
# negative_controls: treatment label(s) that identify negative-control samples.
#
# Returns the phyloseq object without the flagged taxa. A logical `is_control`
# column is added to the sample data along the way. Stops with an error when no
# sample matches `negative_controls`.
remove_contaminants <- function(phyloseq, negative_controls) {
    # %in% instead of ==: never yields NA for samples with a missing treatment
    # and works when more than one control label is supplied.
    is_control <- sample_data(phyloseq)$treatment %in% negative_controls
    if (!any(is_control)) {
        stop(
            "No samples found with treatment: ",
            paste(negative_controls, collapse = ", "),
            call. = FALSE
        )
    }
    sample_data(phyloseq)$is_control <- is_control

    contamdf <- isContaminant(phyloseq, method = "prevalence", neg = "is_control")
    phyloseq_clean <- prune_taxa(!contamdf$contaminant, phyloseq)

    phyloseq_clean
}

In [None]:
# Summarise the genus-level composition of the positive-control samples.
#
# phyloseq_obj:   phyloseq object whose sample data has a `treatment` column.
# positive_group: treatment label(s) that identify positive-control samples.
#
# Returns a tibble with one row per positive-control sample (`Sample`) and one
# column per top genus holding that genus' abundance in the sample. "Top" means
# the 10 genera with the highest abundance summed over all positive controls
# (ties at the 10th place are kept).
validate_positive_controls <- function(phyloseq_obj, positive_group) {
  # Subset positive control samples; %in% treats a missing treatment as
  # "not a positive control" instead of producing NA in the selection.
  is_positive <- sample_data(phyloseq_obj)$treatment %in% positive_group
  positive_controls <- prune_samples(is_positive, phyloseq_obj)

  # Aggregate taxa at Genus level and convert to long format
  taxa_with_abundance <- tax_glom(positive_controls, taxrank = "Genus")
  taxa_abundances <- psmelt(taxa_with_abundance)

  # Top 10 genera by total abundance, with the ranking variable named explicitly
  top_taxa <- taxa_abundances %>%
    group_by(Genus) %>%
    summarise(TotalAbundance = sum(Abundance), .groups = "drop") %>%
    slice_max(TotalAbundance, n = 10) %>%
    pull(Genus)

  # Keep the top genera and reshape to one row per sample. values_fn = sum
  # collapses rows that share a Sample/Genus pair, so the output columns stay
  # numeric rather than becoming list-columns.
  abundances_per_sample <- taxa_abundances %>%
    filter(Genus %in% top_taxa) %>%
    select(Sample, Genus, Abundance) %>%
    pivot_wider(
      names_from = Genus,
      values_from = Abundance,
      values_fill = 0,
      values_fn = sum
    )

  abundances_per_sample
}


In [None]:
# Drop the named samples, then drop taxa that no longer have any counts.
#
# phyloseq: phyloseq object.
# outliers: character vector of sample names to remove.
#
# Returns the filtered phyloseq object.
remove_outliers <- function(phyloseq, outliers) {
    # Flag the samples listed as outliers and keep everything else
    is_outlier <- sample_names(phyloseq) %in% outliers
    without_outliers <- prune_samples(!is_outlier, phyloseq)

    # Taxa whose total count across the remaining samples is zero are removed
    has_counts <- taxa_sums(without_outliers) > 0
    prune_taxa(has_counts, without_outliers)
}


In [None]:
# Keep only the study samples and drop the decontam helper column.
#
# phyloseq: phyloseq object whose sample data has a `treatment` column.
#
# Returns the phyloseq object restricted to samples whose treatment is
# "Dupilumab_treatment", "CRS_control" or "healthy_control", with the
# `is_control` column removed from the sample data if it is present.
clean_phyloseq <- function(phyloseq) {
    # Removes technical controls (any sample outside the three study groups)
    phyloseq <- subset_samples(phyloseq, treatment %in% c("Dupilumab_treatment", "CRS_control", "healthy_control"))

    # Converts sample data to a data frame, removes unnecessary columns, and updates the phyloseq object.
    # any_of() makes the removal a no-op when `is_control` does not exist
    # (e.g. remove_contaminants() was not run on this object).
    sample_data_df <- as(sample_data(phyloseq), "data.frame")
    sample_data_df <- sample_data_df %>%
        select(-any_of("is_control"))
    sample_data(phyloseq) <- sample_data(sample_data_df)

    phyloseq
}

In [None]:
# Draw rarefaction curves for every sample of a phyloseq object.
#
# phylsoeq_object: phyloseq object (parameter name kept as-is for callers).
# step:            step size passed to vegan::rarecurve().
# xlim:            x-axis limits of the plot.
#
# Returns whatever rarecurve() returns; the plot is produced as a side effect.
plot_rarecurve <- function(phylsoeq_object, step = 50, xlim = c(0, 500000)) {
    otu_mat <- as(otu_table(phylsoeq_object), "matrix")

    # rarecurve() treats rows as samples, so orient the matrix accordingly
    if (taxa_are_rows(phylsoeq_object)) {
        otu_mat <- t(otu_mat)
    }

    # Ensure numeric storage without discarding the sample/taxon names
    storage.mode(otu_mat) <- "numeric"

    # Remove samples without any reads; drop = FALSE keeps a matrix even when
    # a single sample remains
    otu_mat <- otu_mat[rowSums(otu_mat) > 0, , drop = FALSE]

    # Create rarefaction curves
    rarecurve(otu_mat, step = step, cex = 0.5, xlim = xlim)
}

In [None]:
# Rarefy a phyloseq object repeatedly and average the resulting count tables.
#
# physeq_obj:   phyloseq object to rarefy.
# n_iterations: number of independent rarefactions to average (>= 1).
# sample_size:  rarefaction depth; defaults to the smallest sample sum.
# rngseed:      seed used to derive one distinct seed per iteration.
# replace:      passed to rarefy_even_depth().
# trimOTUs:     if TRUE, taxa whose averaged total count is zero are removed.
# verbose:      passed to rarefy_even_depth().
#
# Returns a phyloseq object whose OTU table is the rounded mean of the
# rarefied tables, combined with the sample data, taxonomy table and (when
# present) reference sequences of `physeq_obj`. Note that the global RNG state
# is modified via set.seed().
rarefy_phyloseq_multiple <- function(physeq_obj, n_iterations = 100, sample_size = NULL, rngseed = 42, replace = FALSE, trimOTUs = TRUE, verbose = FALSE) {
  stopifnot(
    is.numeric(n_iterations),
    length(n_iterations) == 1,
    n_iterations >= 1
  )

  if (is.null(sample_size)) {
    sample_size <- min(sample_sums(physeq_obj))
  }

  set.seed(rngseed)  # Set initial seed
  seeds <- sample.int(1e6, n_iterations)  # One distinct seed per iteration

  # trimOTUs = FALSE here so every rarefied table keeps the same taxa and the
  # tables can be summed element-wise; trimming happens once at the end.
  rarefied_list <- lapply(seeds, function(seed) {
    rarefy_even_depth(physeq = physeq_obj,
                     sample.size = sample_size,
                     rngseed = seed,
                     replace = replace,
                     trimOTUs = FALSE,
                     verbose = verbose)
  })

  # Sum plain count matrices, then average and round
  count_matrices <- lapply(rarefied_list, function(x) as(otu_table(x), "matrix"))
  avg_otu <- round(Reduce(`+`, count_matrices) / n_iterations)

  # Orientation is read from the rarefied result the matrix came from rather
  # than assumed to match the input object.
  rarefied_taxa_are_rows <- taxa_are_rows(rarefied_list[[1]])

  # Assemble the averaged object; components missing from the input (e.g. no
  # reference sequences) come back as NULL and are skipped instead of raising
  # an error.
  components <- list(
    otu_table(avg_otu, taxa_are_rows = rarefied_taxa_are_rows),
    sample_data(physeq_obj, errorIfNULL = FALSE),
    tax_table(physeq_obj, errorIfNULL = FALSE),
    refseq(physeq_obj, errorIfNULL = FALSE)
  )
  avg_physeq <- do.call(phyloseq, Filter(Negate(is.null), components))

  # Optionally trim OTUs
  if (trimOTUs) {
    avg_physeq <- prune_taxa(taxa_sums(avg_physeq) > 0, avg_physeq)
  }

  avg_physeq
}

In [None]:
# Drop samples whose total read count is below `min_depth`.
#
# physeq:    phyloseq object.
# min_depth: minimum sample sum a sample needs in order to be kept.
#
# Prints the sample counts before/after filtering and returns the filtered
# phyloseq object.
remove_low_depth_samples <- function(physeq, min_depth) {
  # Total reads per sample, named by sample
  depths <- sample_sums(physeq)

  # Names of the samples that reach the required depth
  deep_enough <- names(depths)[depths >= min_depth]
  physeq_filtered <- prune_samples(deep_enough, physeq)

  # Report how many samples were affected
  n_before <- nsamples(physeq)
  n_after <- nsamples(physeq_filtered)
  cat("Original number of samples:", n_before, "\n")
  cat("Number of samples after filtering:", n_after, "\n")
  cat("Number of samples removed:", n_before - n_after, "\n")

  physeq_filtered
}

# Nasal samples

## Data cleanup

In [None]:
# Load the nasal phyloseq object from the Dada2/04_phyloseq results directory and display it
nasal_samples <- readRDS("../../results/Bacteria_BK628-01/Dada2/04_phyloseq/Bacteria_BK628-01_phyloseq.rds")
nasal_samples

In [None]:
# Number of samples per treatment x timepoint combination, before any filtering
nasal_samples %>%
	sample_data() %>%
	data.frame() %>%
	group_by(treatment, timepoint) %>%
	count()

In [None]:
# Convert timepoint/treatment to factors (healthy_control becomes the reference level)
nasal_samples <- relevel_metadata(nasal_samples)
nasal_samples

In [None]:
# Remove taxa flagged as contaminants, using the "Negative_control" samples as negative controls
nasal_samples <- remove_contaminants(nasal_samples, "Negative_control")
nasal_samples

In [None]:
# Show per-sample abundances of the top genera in the "Positive_control" samples
validate_positive_controls(nasal_samples, "Positive_control")

In [None]:
# Sample names to exclude as outliers (the selection criteria are not recorded in this notebook)
nasal_outliers <- c(
	"Bacteria_BK628-01M0138",
	"Bacteria_BK628-01M0133",
	"Bacteria_BK628-01M0103",
	"Bacteria_BK628-01M0032"
)

In [None]:
# Drop the outlier samples and any taxa left with zero counts
nasal_samples <- remove_outliers(nasal_samples, nasal_outliers)
nasal_samples

In [None]:
# Keep only the three study groups and drop the is_control helper column
nasal_samples_clean <- clean_phyloseq(nasal_samples)
nasal_samples_clean

## Transform data

In [None]:
# Compare sequencing depth between treatment groups in the cleaned nasal data
sample_data_df <- data.frame(sample_data(nasal_samples_clean))

# Calculate sequencing depth (total reads per sample)
sample_data_df$SequencingDepth <- sample_sums(nasal_samples_clean)

# Perform Kruskal-Wallis test
kruskal_test <- kruskal.test(SequencingDepth ~ treatment, data = sample_data_df)
p_value <- kruskal_test$p.value

# Create the plot
ggplot(sample_data_df, aes(x = treatment, y = SequencingDepth, fill = treatment)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.5) +
  theme_bw() +
  labs(x = "Treatment Group", y = "Sequencing Depth") +
  ggtitle("Sequencing Depth by Treatment Group")

# Always report the omnibus test so a non-significant result is visible too
print(kruskal_test)

# If the Kruskal-Wallis test is significant, perform post-hoc pairwise comparisons.
# The is.na() guard keeps the condition valid when the test yields no p-value.
if (!is.na(p_value) && p_value < 0.05) {
  pairwise_test <- pairwise.wilcox.test(sample_data_df$SequencingDepth,
                                        sample_data_df$treatment,
                                        p.adjust.method = "bonferroni")
  print(pairwise_test)
}

In [None]:
# Rarefaction curves for the cleaned nasal samples
plot_rarecurve(nasal_samples_clean)

In [None]:
# Minimum total read count a nasal sample needs in order to be kept
cutoff_nasal <- 100000

In [None]:
# Drop samples below the cutoff; the function prints how many were removed
nasal_phylo <- remove_low_depth_samples(nasal_samples_clean, cutoff_nasal)

In [None]:
# Write the depth-filtered object to disk; the external rarefaction script
# reads it via --input and writes its result to the --output path.
saveRDS(nasal_phylo, "../../results/Microbiome_analysis/nasal_samples_clean_raw.rds")
nasal_rarefy_status <- system("Rscript ../scripts/parallel_rarefy_script.R --input ../../results/Microbiome_analysis/nasal_samples_clean_raw.rds --output ../../results/Microbiome_analysis/nasal_samples_clean_rarefied.rds --iterations 100 --sample_size 100000 --cores 12 --verbose")

# Fail loudly if the script did not succeed, so later cells cannot silently
# pick up a stale or missing rarefied file.
if (nasal_rarefy_status != 0) {
  stop(
    "parallel_rarefy_script.R failed for nasal samples (exit status ",
    nasal_rarefy_status, ")",
    call. = FALSE
  )
}

In [None]:
# Load the rarefied object from the path passed as --output to the rarefaction script
nasal_phylo_rarefied <- readRDS("../../results/Microbiome_analysis/nasal_samples_clean_rarefied.rds")

In [None]:
# CLR and compositional transforms are computed from the depth-filtered,
# non-rarefied object (nasal_phylo)
nasal_phylo_clr <- microbiome::transform(nasal_phylo, "clr")
nasal_phylo_comp <- microbiome::transform(nasal_phylo, "compositional")

In [None]:
# Number of samples per treatment x timepoint combination after depth filtering
nasal_phylo %>%
	sample_data() %>%
	data.frame() %>%
	group_by(treatment, timepoint) %>%
	count()

In [None]:
# Save all four versions of the nasal data.
# NOTE(review): nasal_phylo was already saved to this path before rarefaction,
# and nasal_phylo_rarefied was read from the very path it is written back to,
# so the first two calls look redundant - confirm before removing.
saveRDS(nasal_phylo, "../../results/Microbiome_analysis/nasal_samples_clean_raw.rds")
saveRDS(nasal_phylo_rarefied, "../../results/Microbiome_analysis/nasal_samples_clean_rarefied.rds")
saveRDS(nasal_phylo_clr, "../../results/Microbiome_analysis/nasal_samples_clean_clr.rds")
saveRDS(nasal_phylo_comp, "../../results/Microbiome_analysis/nasal_samples_clean_comp.rds")

# Gut samples

## Data cleanup

In [None]:
# Load the gut phyloseq object from the Dada2/04_phyloseq results directory and display it
gut_samples <- readRDS("../../results/Bacteria_BK629-01/Dada2/04_phyloseq/Bacteria_BK629-01_phyloseq.rds")
gut_samples

In [None]:
# Number of samples per treatment x timepoint combination, before any filtering
gut_samples %>%
	sample_data() %>%
	data.frame() %>%
	group_by(treatment, timepoint) %>%
	count()

In [None]:
# Convert timepoint/treatment to factors (healthy_control becomes the reference level)
gut_samples <- relevel_metadata(gut_samples)
gut_samples

In [None]:
# Remove taxa flagged as contaminants, using the "Negative_control" samples as negative controls
gut_samples <- remove_contaminants(gut_samples, "Negative_control")
gut_samples

In [None]:
# Show per-sample abundances of the top genera in the "Positive_control" samples
validate_positive_controls(gut_samples, "Positive_control")

In [None]:
# Sample names to exclude as outliers (the selection criteria are not recorded in this notebook)
gut_outliers <- c(
	"Bacteria_BK629-01M0116",
	"Bacteria_BK629-01M0119",
	"Bacteria_BK629-01M0126",
	"Bacteria_BK629-01M0037",
	"Bacteria_BK629-01M0039",
	"Bacteria_BK629-01M0040"
)

In [None]:
# Drop the outlier samples and any taxa left with zero counts
gut_samples <- remove_outliers(gut_samples, gut_outliers)
gut_samples

In [None]:
# Keep only the three study groups and drop the is_control helper column
gut_samples_clean <- clean_phyloseq(gut_samples)
gut_samples_clean

## Transform data

In [None]:
# Compare sequencing depth between treatment groups in the cleaned gut data
sample_data_df <- data.frame(sample_data(gut_samples_clean))

# Calculate sequencing depth (total reads per sample)
sample_data_df$SequencingDepth <- sample_sums(gut_samples_clean)

# Perform Kruskal-Wallis test
kruskal_test <- kruskal.test(SequencingDepth ~ treatment, data = sample_data_df)
p_value <- kruskal_test$p.value

# Create the plot
ggplot(sample_data_df, aes(x = treatment, y = SequencingDepth, fill = treatment)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.5) +
  theme_bw() +
  labs(x = "Treatment Group", y = "Sequencing Depth") +
  ggtitle("Sequencing Depth by Treatment Group") 

# Post-hoc pairwise Wilcoxon comparisons (Bonferroni-adjusted).
# NOTE(review): unlike the nasal section, these are run regardless of whether
# the Kruskal-Wallis test is significant; p_value is computed above but not
# used in this cell.
pairwise_test <- pairwise.wilcox.test(
	sample_data_df$SequencingDepth, 
	sample_data_df$treatment, 
	p.adjust.method = "bonferroni"
)

# Report both the omnibus test and the pairwise comparisons
print(kruskal_test)
print(pairwise_test)

In [None]:
# Rarefaction curves for the cleaned gut samples
plot_rarecurve(gut_samples_clean)

In [None]:
# Minimum total read count a gut sample needs in order to be kept
cutoff_gut <- 250000

In [None]:
# Drop samples below the cutoff; the function prints how many were removed
gut_phylo <- remove_low_depth_samples(gut_samples_clean, cutoff_gut)

In [None]:
# Write the depth-filtered object to disk; the external rarefaction script
# reads it via --input and writes its result to the --output path.
saveRDS(gut_phylo, "../../results/Microbiome_analysis/gut_samples_clean_raw.rds")
# NOTE(review): --sample_size is 100000 although cutoff_gut is 250000; confirm
# whether the gut data should be rarefied to the nasal depth or to its own cutoff.
gut_rarefy_status <- system("Rscript ../scripts/parallel_rarefy_script.R --input ../../results/Microbiome_analysis/gut_samples_clean_raw.rds --output ../../results/Microbiome_analysis/gut_samples_clean_rarefied.rds --iterations 100 --sample_size 100000 --cores 12 --verbose")

# Fail loudly if the script did not succeed, so later cells cannot silently
# pick up a stale or missing rarefied file.
if (gut_rarefy_status != 0) {
  stop(
    "parallel_rarefy_script.R failed for gut samples (exit status ",
    gut_rarefy_status, ")",
    call. = FALSE
  )
}

In [None]:
# Load the rarefied object from the path passed as --output to the rarefaction script
gut_phylo_rarefied <- readRDS("../../results/Microbiome_analysis/gut_samples_clean_rarefied.rds")

In [None]:
# CLR and compositional transforms are computed from the depth-filtered,
# non-rarefied object (gut_phylo)
gut_phylo_clr <- microbiome::transform(gut_phylo, "clr")
gut_phylo_comp <- microbiome::transform(gut_phylo, "compositional")

In [None]:
# Number of samples per treatment x timepoint combination after depth filtering
gut_phylo %>%
	sample_data() %>%
	data.frame() %>%
	group_by(treatment, timepoint) %>%
	count()

In [None]:
# Save all four versions of the gut data.
# NOTE(review): gut_phylo was already saved to this path before rarefaction,
# and gut_phylo_rarefied was read from the very path it is written back to,
# so the first two calls look redundant - confirm before removing.
saveRDS(gut_phylo, "../../results/Microbiome_analysis/gut_samples_clean_raw.rds")
saveRDS(gut_phylo_rarefied, "../../results/Microbiome_analysis/gut_samples_clean_rarefied.rds")
saveRDS(gut_phylo_clr, "../../results/Microbiome_analysis/gut_samples_clean_clr.rds")
saveRDS(gut_phylo_comp, "../../results/Microbiome_analysis/gut_samples_clean_comp.rds")

# Checking the presence of the new Lactobacillus genera

In [None]:
# Reload the depth-filtered, non-rarefied nasal object saved earlier
nasal_phylo <- readRDS("../../results/Microbiome_analysis/nasal_samples_clean_raw.rds")
nasal_phylo

In [None]:
# Reload the depth-filtered, non-rarefied gut object saved earlier
gut_phylo <- readRDS("../../results/Microbiome_analysis/gut_samples_clean_raw.rds")
gut_phylo

In [None]:
# Genus names to look for in the taxonomy tables.
# NOTE(review): presumably the genera from the reclassification of
# Lactobacillus - confirm the source of this list.
new_lactobacillus_genera <- c(
	"Lactobacillus",
	"Paralactobacillus",
	"Holzapfelia",
	"Amylolactobacillus",
	"Bombilactobacillus",
	"Companilactobacillus",
	"Lapidilactobacillus",
	"Agrilactobacillus",
	"Schleiferilactobacillus",
	"Loigolactobacillus",
	"Lacticaseibacillus",
	"Latilactobacillus",
	"Dellaglioa",
	"Liquorilactobacillus",
	"Ligilactobacillus",
	"Lactiplantibacillus",
	"Furfurilactobacillus",
	"Paucilactobacillus",
	"Limosilactobacillus",
	"Fructilactobacillus",
	"Acetilactobacillus",
	"Apilactobacillus",
	"Levilactobacillus",
	"Secundilactobacillus",
	"Lentilactobacillus"
)

In [None]:
# Count taxonomy-table rows per listed genus in the nasal data, most frequent first.
# Genera with no matching row do not appear in the result.
tax_table_nasal <- tax_table(nasal_phylo) %>%
  as.data.frame() %>%
  filter(Genus %in% new_lactobacillus_genera) %>%
  count(Genus, name = "n_ASVs_nasal") %>%
  arrange(desc(n_ASVs_nasal))

In [None]:
# Same count for the gut data
tax_table_gut <- tax_table(gut_phylo) %>%
  as.data.frame() %>%
  filter(Genus %in% new_lactobacillus_genera) %>%
  count(Genus, name = "n_ASVs_gut") %>%
  arrange(desc(n_ASVs_gut))

In [None]:
# Combine the per-genus counts of both sites and write them to CSV.
# A genus found at only one site has no row in the other table, so the join
# leaves NA there; those are recorded as 0.
full_join(tax_table_nasal, tax_table_gut, by = "Genus") %>%
	mutate(across(c(n_ASVs_nasal, n_ASVs_gut), ~ replace_na(.x, 0L))) %>%
	write.csv("./new_lactobacillius_genera_counts.csv", row.names = FALSE)