# Automatically annotate clusters

In [None]:
library(tidyverse)
library(glue)
library(magrittr)
library(Seurat)
library(glue)

# Notebook parameters (read from `params`)
subproject <- params$subproject
patient <- params$patient

# Set the root path depending on whether we run on the cluster or locally:
# the cluster root is used when the parent of the working directory contains
# "/scratch/". The condition is a scalar, so a plain `if` is used instead of
# the vectorised `ifelse()`.
root_dir <- if (grepl("/scratch/", dirname(getwd()), fixed = TRUE)) {
  "/scratch/devel/pnieto" # cluster
} else {
  "S:" # local
}

# name of the project directory
proj_dir <- glue::glue("{root_dir}/projects/CSF")
# name of output folder; created together with any missing parent folder and
# without a warning when it already exists (e.g. when the notebook is re-run)
out_dir <- glue("{proj_dir}/data/{subproject}/output/04_high_res_annotation")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# load the cell type marker lists
# NOTE(review): presumably this script defines `marker_genes`, which the
# signature-scoring step reads - confirm
source(glue("{root_dir}/scripts/r_utils/marker_genes.R"))

In [None]:
# Read this patient's annotated object from the 03_inferCNV output folder
annotated_rds <- glue(
  "{proj_dir}/data/{subproject}/output/03_inferCNV/{patient}/{patient}_annotated_inferCNV.rds"
)
data <- readRDS(annotated_rds)

In [None]:
# Re-run the Seurat workflow to obtain a finer clustering: normalise, select
# 3000 variable features, scale, PCA, then UMAP / neighbour graph on the
# first 20 PCs and cluster at resolution 2
data <- NormalizeData(data)
data <- FindVariableFeatures(data, nfeatures = 3000)
data <- ScaleData(data)
data <- RunPCA(data)
data <- RunUMAP(data, dims = 1:20)
data <- FindNeighbors(data, dims = 1:20)
data <- FindClusters(data, resolution = 2)

# keep the new cluster ids as the high-resolution annotation column
data$annot_HR <- data$seurat_clusters

In [None]:
# Compute one signature score per marker list with UCell, using 4 cores.
# `name = ""` is passed so that no suffix is requested for the score columns;
# the averaging step reads them back as make.names(names(marker_genes)).
data <- UCell::AddModuleScore_UCell(
  data,
  features = marker_genes,
  ncores = 4,
  name = ""
)

In [None]:
# Get the average of each signature score per "annot_HR" cluster
avg <- data@meta.data[, c("annot_HR", make.names(names(marker_genes)))]
rownames(avg) <- NULL
averages <- aggregate(. ~ annot_HR, data = avg, FUN = mean)

# Signature columns (everything except the first column, the cluster id)
columns_to_divide <- names(averages)[-1]
score_values <- averages[columns_to_divide]

# Set all averages below 0.01 to 0. Only the score columns are thresholded:
# comparing the `annot_HR` id column against a number is meaningless and
# raises a warning when that column is a factor.
score_values[score_values < 0.01] <- 0

# Row sums, i.e. the total score of each cluster across all signatures
row_sums <- rowSums(score_values)
# A cluster whose scores were all zeroed has a row sum of 0; dividing by it
# would yield NaN (0 / 0), so divide by 1 instead and keep the row at 0.
row_sums[row_sums == 0] <- 1

# Divide each score by its cluster's row sum so every non-empty row sums to 1
averages[columns_to_divide] <- score_values / row_sums

In [None]:
# Make long so that it can be used with ggplot: one row per cluster/signature
# pair, with the signature in `name` and its score in `value`
df_avg <- pivot_longer(averages, cols = -annot_HR)

# Plot the averages as a heatmap (clusters on x, signatures on y)
avg_heatmap <- ggplot(df_avg, aes(x = annot_HR, y = name, fill = value)) +
  geom_tile(color = "white") +
  # colour scale from white (low) to red (high)
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal() +
  theme(
    # rotate x-axis labels for better readability
    axis.text.x = element_text(angle = 45, hjust = 1),
    text = element_text(size = 16)
  ) +
  labs(
    title = "Average signature score",
    fill = "Score",
    caption = glue("{subproject} {patient}")
  )
print(avg_heatmap)

# Save exactly the plot built above (explicit `plot`, full argument names)
# instead of relying on last_plot() and partial matching of `file`
ggsave(
  filename = glue("{out_dir}/{subproject}_{patient}_heatmap.png"),
  plot = avg_heatmap,
  height = 10,
  width = 10
)

In [None]:
# Move the cluster ids from the `annot_HR` column into the row names so that
# `averages` only holds the signature columns
rownames(averages) <- averages$annot_HR
averages$annot_HR <- NULL

# Assign to each cluster the signature with the highest value in its row of
# `averages`. A cluster is labelled "unknown" when
#   * its row contains missing values (e.g. NaN from a 0 / 0 normalisation),
#   * every value in its row is below 0.1, or
#   * several signatures tie for the highest value.
# Dots in the winning column name are replaced by spaces (column names went
# through make.names()).
highest_column_list <- vapply(
  seq_len(nrow(averages)),
  function(i) {
    # named numeric vector: one value per signature for this cluster
    row_values <- unlist(averages[i, , drop = FALSE])

    # anyNA() guards the comparison below: all(NaN < 0.1) is NA, which would
    # make `if` fail with "missing value where TRUE/FALSE needed"
    if (anyNA(row_values) || all(row_values < 0.1)) {
      return("unknown")
    }

    # signature(s) holding the highest value in the row
    highest_columns <- names(row_values)[row_values == max(row_values)]

    # a tie cannot be resolved automatically: report it and give up
    if (length(highest_columns) > 1) {
      message(
        "Tie in cluster ", rownames(averages)[i], ": ",
        paste(highest_columns, collapse = ", ")
      )
      return("unknown")
    }

    str_replace_all(highest_columns, pattern = "\\.", replacement = " ")
  },
  character(1)
)
names(highest_column_list) <- rownames(averages)

In [None]:
# Store the automatic annotation of each cell in the `auto_annot` metadata
# column by looking up the label of the cell's "annot_HR" cluster.
data$barcode <- rownames(data@meta.data)

# `highest_column_list` is named by cluster id, so it can be indexed directly
# with each cell's cluster. Clusters without a label yield NA. The lookup is
# always a character vector, so the overrides below can never hit the
# "invalid factor level" NA that a data.frame()/merge() round trip produces
# when strings are converted to factors.
cell_clusters <- as.character(data@meta.data$annot_HR)
auto_annot <- unname(highest_column_list[cell_clusters])
# name by barcode so AddMetaData() matches the values to the right cells
names(auto_annot) <- rownames(data@meta.data)
data <- AddMetaData(data, metadata = auto_annot, col.name = "auto_annot")

# Fix tumor / other clusters if any: cells whose existing `annot` label
# mentions "Tumor"/"tumor" or "Other"/"other" keep that broad label.
# (`annot` is not created in this notebook; it is expected to already be in
# the object's metadata.)
data$auto_annot[grepl("Tumor|tumor", data$annot)] <- "Tumor cells"
data$auto_annot[grepl("Other|other", data$annot)] <- "Other cells"

In [None]:
# Write the object carrying the automatic annotation to the output folder
annotated_out <- glue("{out_dir}/{subproject}_{patient}_HR_automatic_annotated.rds")
saveRDS(data, file = annotated_out)

In [None]:
# UMAP coloured by the high-resolution clusters
p1 <- DimPlot(
  data,
  group.by = "annot_HR",
  cols = as.vector(pals::polychrome())
) +
  labs(title = "Clustering")

# UMAP coloured by the automatic annotation
p2 <- DimPlot(data, group.by = "auto_annot", cols = pals::glasbey()) +
  labs(
    title = "High Resolution Annotation",
    caption = glue("{subproject} {patient}")
  )

# Show both panels side by side
umap_panels <- p1 + p2
print(umap_panels)

# Save exactly the combined figure (explicit `plot`, full argument names)
# instead of relying on last_plot() and partial matching of `file`
ggsave(
  filename = glue("{out_dir}/{subproject}_{patient}_umaps.png"),
  plot = umap_panels,
  width = 12
)

# CellTypist

In [None]:
# Create the CellTypist folder (with any missing parent, and without a
# warning when it already exists)
ct_dir <- glue("{proj_dir}/data/{subproject}/output/05_CellTypist")
dir.create(ct_dir, showWarnings = FALSE, recursive = TRUE)

# Save the raw counts as a dense, comma-separated table for CellTypist
counts_csv <- glue("{ct_dir}/{patient}_raw_counts.csv")
write.table(
  as.matrix(GetAssayData(object = data, slot = "counts")),
  file = counts_csv,
  sep = ",", row.names = TRUE, col.names = TRUE, quote = FALSE
)

# Run the CellTypist script with: counts file, output folder, file prefix.
# Arguments are passed through shQuote() so paths containing spaces or shell
# metacharacters reach the script intact.
ct_args <- as.character(c(
  glue("{proj_dir}/code/run_celltypist.py"),
  counts_csv,
  ct_dir,
  glue("{patient}_")
))
ct_status <- system2("python", args = shQuote(ct_args))

# Stop here if the script failed, so that later steps cannot silently read
# predictions left over from a previous run
if (ct_status != 0) {
  stop(
    "run_celltypist.py failed with exit status ", ct_status,
    call. = FALSE
  )
}

In [None]:
# Read the labels predicted by CellTypist for this patient
predictions_csv <- glue("{ct_dir}/{patient}_predicted_labels.csv")
predictions <- read.csv(predictions_csv)

# Use the `X` column as row names and drop it from the table
prediction_ids <- predictions$X
predictions$X <- NULL
rownames(predictions) <- prediction_ids

# Prefix every remaining column with "CT_" to mark its origin
names(predictions) <- paste0("CT_", names(predictions))

# Attach the predictions to the object metadata and save the result
data <- AddMetaData(data, metadata = predictions)
saveRDS(data, glue("{ct_dir}/{patient}_HR_automatic_annotated_CT.rds"))

In [None]:
# Colour palette: polychrome colours followed by alphabet colours
palette <- c(as.vector(pals::polychrome()), as.vector(pals::alphabet()))

# Predicted labels that occur in at least 5 cells
label_counts <- as.data.frame(table(data$CT_predicted_labels))
t <- label_counts$Var1[label_counts$Freq >= 5]

# UMAP of the cells carrying one of those labels, coloured by predicted label
frequent_cells <- subset(data, CT_predicted_labels %in% t)
ct_umap <- DimPlot(
  frequent_cells,
  group.by = "CT_predicted_labels",
  ncol = 2,
  cols = palette,
  shuffle = TRUE
) &
  theme(text = element_text(size = 14)) &
  guides(colour = guide_legend(ncol = 2, override.aes = list(size = 3)))
print(ct_umap)

ggsave(glue("{ct_dir}/{patient}_CT_predictions.png"), plot = ct_umap, width = 16)

<a href="#top">Back to top</a>

****