In [None]:
library(tidyverse)
library(cowplot)
library(Matrix.utils)
library(edgeR)
library(Matrix)
library(reshape2)
library(S4Vectors)
library(SingleCellExperiment)
library(pheatmap)
library(apeglm)
library(png)
library(DESeq2)
library(RColorBrewer)
library(data.table)
library(Seurat)
library(AUCell)
library(ggplot2)
library(tidyr)
library(harmony)
library(Scillus)
library(ggpubr)
library(patchwork)
set.seed(123)

In [None]:
# Print the installed Seurat version.
packageVersion("Seurat")

# Goals:
## 1. Create a Seurat object and perform basic QC
## 2. Normalize the data, find PCs, correct batch effects and cluster -> Annotate cell types -> Remove contaminant or DC2 cells if necessary

# <span style="color:green"> Part 1: Create Seurat object and QC </span>

## Read in files

In [None]:
# Locate the per-sample 10X matrix directories. Every entry of
# "doublet_free_matrix" is treated as one sample, and its name is used as the
# sample ID.
files_fullpath <- list.files("doublet_free_matrix", full.names = TRUE)
# Derive the short names from the full paths so both vectors are guaranteed
# to be in the same order.
files <- basename(files_fullpath)
SubjectID <- files

# Fail early with a clear message instead of a confusing error further down.
if (length(files_fullpath) == 0) {
  stop("No sample directories found in 'doublet_free_matrix'", call. = FALSE)
}

files_fullpath
files
SubjectID

In [None]:
# Read each sample's 10X matrix and tag every barcode with its sample ID
# ("<barcode>.<SubjectID>") so column names stay unique once the samples are
# merged. seq_along() keeps this a no-op on empty input, where 1:length()
# would iterate over c(1, 0).
l_raw <- lapply(seq_along(files_fullpath), function(idx) {
  counts <- Read10X(files_fullpath[idx])
  colnames(counts) <- paste(colnames(counts), SubjectID[idx], sep = ".")
  counts
})

names(l_raw) <- SubjectID

str(l_raw)
names(l_raw)

In [None]:
# Column-bind the per-sample matrices into one matrix; the columns are the
# sample-tagged barcodes created when the samples were read in.
fullmat <- do.call(cbind, l_raw)
dim(fullmat)
# Peek at the top-left corner of the merged matrix.
corner(fullmat)

## prep the metadata

In [None]:
# Check the "<barcode>.<SubjectID>" column-name format before parsing it.
head(colnames(fullmat))

In [None]:
# Build the per-cell metadata from the merged column names, which have the
# form "<barcode>.<SubjectID>".
names <- colnames(fullmat)

# Drop everything up to and including the first "." so that sample IDs which
# themselves contain dots are kept intact (splitting on "." and taking the
# second field would truncate them).
Sample <- sub("^[^.]*[.]", "", names)
# A name without any "." has no sample part; keep that visible as NA.
Sample[!grepl(".", names, fixed = TRUE)] <- NA_character_
corner(Sample)

# One row per cell, keyed by the full cell ID.
meta <- data.frame(Sample = Sample, cell_id = names, row.names = names)
corner(meta)
dim(meta)

In [None]:
# Map each sample directory (in the order of names(l_raw)) to a readable
# sample name.
sample_labels <- c("KO1", "KO2", "WT1", "WT2")

# data.frame() silently recycles a shorter column, which would mislabel
# samples if the number of loaded samples ever differs from the labels.
stopifnot(length(names(l_raw)) == length(sample_labels))

df <- data.frame(
  Sample = names(l_raw),
  SampleName = sample_labels
)
head(df)

In [None]:
# Attach the readable sample name to every cell, then re-set the row names
# to the cell IDs so the table is keyed by cell again.
meta2 <- meta %>%
  left_join(df, by = "Sample")
rownames(meta2) <- meta2[["cell_id"]]
head(meta2)

## create Seurat Object

In [None]:
# Build the Seurat object from the merged counts and the per-cell metadata,
# applying the min.cells / min.features thresholds below.
seur <- CreateSeuratObject(
  counts = fullmat,
  meta.data = meta2,
  min.cells = 3,
  min.features = 200
)

In [None]:
# Store the percentage of counts from features matching "^mt-" as the
# "percent.mt" metadata column.
seur <- PercentageFeatureSet(seur, pattern = "^mt-", col.name = "percent.mt")

## Inspect QC metrics

In [None]:
# Use the sample name as the active identity and work on a copy of the
# per-cell metadata with log10-transformed feature and count totals added.
Idents(seur) <- "SampleName"
meta <- seur@meta.data
meta$log10_nFeature_RNA <- log(meta$nFeature_RNA, base = 10)
meta$log10_nCount_RNA <- log(meta$nCount_RNA, base = 10)
head(meta)

# Violin plus narrow white boxplot of one metadata column, split and filled
# by sample. The three QC panels below share this layout.
qc_violin <- function(data, y_col) {
  ggplot(data, aes(x = SampleName, y = !!sym(y_col), fill = SampleName)) +
    geom_violin(trim = FALSE, scale = "width") +
    geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
    theme_classic() +
    theme(
      text = element_text(size = 15),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )
}

nfeat <- qc_violin(meta, "log10_nFeature_RNA")

ncount <- qc_violin(meta, "log10_nCount_RNA")

pct.mt <- qc_violin(meta, "percent.mt")

In [None]:
# Set the notebook plot size, then display the per-sample log10 feature-count plot.
options(repr.plot.width=10, repr.plot.height=5)
nfeat

In [None]:
# Display the per-sample log10 total-count plot.
ncount

In [None]:
# Display the per-sample percent.mt plot.
pct.mt

## Calculate the median mitochondrial percentage for each sample and filter accordingly

In [None]:
# first inspect the median mitochondrial percentage in each sample

In [None]:
samples <- c("KO1", "KO2", "WT1", "WT2")
metadata <- seur[[]]

# Median percent.mt per sample. vapply() replaces the loop that grew the
# vector one element at a time and guarantees one numeric value per sample.
# The result is named by sample so the printed medians are self-describing.
med_mt <- vapply(
  samples,
  function(sample_name) {
    median(metadata$percent.mt[metadata$SampleName == sample_name])
  },
  numeric(1)
)
    

In [None]:
# Show the per-sample median percent.mt values (order follows `samples`).
med_mt

#### filter out cells with percent.mt >= 6.5 (keep percent.mt < 6.5)

In [None]:
# Keep only the cells whose percent.mt is below 6.5, then show how many
# cells remain in each sample.
low_mt_cells <- WhichCells(seur, expression = percent.mt < 6.5)
seur <- subset(seur, cells = low_mt_cells)
table(seur$SampleName)

In [None]:
# Collapse the samples into two groups: KO1/KO2 become "KO_group",
# every other sample becomes "WT_group".
is_ko <- seur$SampleName == "KO1" | seur$SampleName == "KO2"
seur$experimental_groups <- ifelse(is_ko, "KO_group", "WT_group")

In [None]:
# Count cells per experimental group.
table(seur$experimental_groups)

In [None]:
# Checkpoint: save the object after the percent.mt filter and group labelling.
saveRDS(seur, "GEX_Allsamples_qc_filtered.RDS")

In [None]:
# Dimensions of the filtered object.
dim(seur)

# <span style="color:green"> Part 2-a: Normalize, scale, dimension reduction, batch correction, and cluster </span>

In [None]:
# Normalize, select variable features (vst), scale and run PCA, one step at
# a time.
seur <- NormalizeData(seur)
seur <- FindVariableFeatures(seur, selection.method = "vst")
seur <- ScaleData(seur)
seur <- RunPCA(seur, verbose = FALSE)

### Correct batch effect with Harmony

In [None]:
# Harmony batch correction, treating each sample (SampleName) as a batch.
seur <- RunHarmony(
  seur,
  group.by.vars = "SampleName",
  verbose = FALSE
)

### Find clusters and run UMAP

In [None]:
# Neighbour graph, clustering and UMAP all use the same Harmony dimensions.
harmony_dims <- 1:50
seur <- FindNeighbors(seur, reduction = "harmony", dims = harmony_dims)
seur <- FindClusters(seur, resolution = 0.2)
seur <- RunUMAP(seur, reduction = "harmony", dims = harmony_dims)


In [None]:
options(repr.plot.width = 12, repr.plot.height = 12)

# Overview panels: cells by sample, by experimental group, by active
# identity (labelled), and Carm1 expression.
one <- DimPlot(seur, group.by = "SampleName")
two <- DimPlot(
  seur,
  group.by = "experimental_groups",
  cols = c("darkred", "grey")
)
three <- DimPlot(seur, label = TRUE)
four <- FeaturePlot(seur, features = "Carm1")
ggarrange(plotlist = list(one, two, three, four))

# <span style="color:green"> Part 2-b: Cell type annotation </span>


In [None]:
# Marker genes shown in the annotation dot plot (plotted in this order).
signatures <- c(
  "Tcf4", "Bst2", "Ifitm2", "Ifitm3", "Ifitm1", "Ptprc",
  "Mki67", "Top2a", "Xcr1", "Irf8", "Clec9a", "Itgae",
  "Batf3", "Itgam", "Sirpa", "Cmss1", "Cd209a", "Fcgr3",
  "Lyz2", "Cd7", "Zbtb46", "Ccr7", "Irf4", "Cd3d",
  "Trbc2", "Gzmb"
)


In [None]:
options(repr.plot.width = 12, repr.plot.height = 8)

# Dot plot of the signature genes per identity class, with grey outlines
# drawn around the dots, a blue-white-red colour scale and tilted gene labels.
DotPlot(
  seur,
  features = signatures,
  assay = "RNA",
  dot.scale = 18,
  cols = c("white", "red")
) +
  theme_classic(base_size = 20) +
  geom_point(aes(size = pct.exp), shape = 21, color = "grey", stroke = 0.5) +
  scale_colour_gradient2(low = "#4575b4", mid = "white", high = "#d73027") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))


In [None]:
# Number of cells in each cluster.
table(seur$seurat_clusters)

In [None]:
# 0, cDC1
# 1, cycling DC 
# 2, INF_DC?
# 3, activated cDC
# 4, cDC2  
# 5, T cells contaminant


In [None]:
# Checkpoint: save the clustered object (all samples, before contaminant removal).
saveRDS(seur, "GEX_Allsample_clustered.RDS")


## Find all markers with a two-sided Wilcoxon rank-sum test

In [None]:
# Positive markers only, for every identity class, using the thresholds below.
All_markers_wlcx <- FindAllMarkers(
  seur,
  assay = "RNA",
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

In [None]:
# faster implementation for the wilcoxon rank test
# devtools::install_github('immunogenomics/presto')

In [None]:
# Inspect the top 10 markers per cluster by average log2 fold change.
# slice_max() replaces the superseded top_n(); like top_n() it keeps ties,
# and it returns the rows sorted by avg_log2FC. ungroup() stops the grouping
# from leaking into later operations on the result.
All_markers_wlcx %>%
  group_by(cluster) %>%
  slice_max(avg_log2FC, n = 10) %>%
  ungroup()

In [None]:
# Number of cells per sample.
table(seur$SampleName)

# <span style="color:green"> Part 2-c: Remove T cell contaminants (cluster 5) </span>

In [None]:
# Number of cells per active identity class.
table(Idents(seur))

In [None]:
# Cell IDs (metadata row names) of every cell assigned to cluster 5.
metadata <- seur[[]]
in_cluster_5 <- metadata$seurat_clusters == 5
contamID <- rownames(metadata)[in_cluster_5]

In [None]:
# Preview the contaminant cell IDs and count them.
head(contamID)
length(contamID)

In [None]:
# invert = TRUE keeps every cell EXCEPT those listed in contamID.
seur <- subset(seur, cells = contamID, invert = TRUE) 

In [None]:
# Dimensions after removing the cluster-5 cells.
dim(seur)

In [None]:
# Checkpoint: save the cleaned object. NOTE(review): the file name has no
# ".RDS" extension, unlike the other checkpoints; the readRDS() call in the
# re-QC section reads this exact name, so rename both together if at all.
saveRDS(seur, "GEX_CLEANEDsamples_qc_filtered")

# <span style="color:green"> Part 2-c (cont): Re-QC, normalize and cluster again because some cells were removed </span>

## Inspect QC metrics

In [None]:
# Reload the cleaned object and rebuild the three per-sample QC plots.
seur <- readRDS("GEX_CLEANEDsamples_qc_filtered")

Idents(seur) <- "SampleName"
meta <- seur@meta.data
meta$log10_nFeature_RNA <- log(meta$nFeature_RNA, base = 10)
meta$log10_nCount_RNA <- log(meta$nCount_RNA, base = 10)
head(meta)

# Violin plus narrow white boxplot of one metadata column, split and filled
# by sample.
qc_violin <- function(data, y_col) {
  ggplot(data, aes(x = SampleName, y = !!sym(y_col), fill = SampleName)) +
    geom_violin(trim = FALSE, scale = "width") +
    geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
    theme_classic() +
    theme(
      text = element_text(size = 15),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )
}

nfeat <- qc_violin(meta, "log10_nFeature_RNA")

ncount <- qc_violin(meta, "log10_nCount_RNA")

pct.mt <- qc_violin(meta, "percent.mt")

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

# Show the three QC plots together in one figure.
one <- nfeat
two <- ncount
three <- pct.mt
ggarrange(plotlist = list(one, two, three))

In [None]:
# Smallest number of detected features in any remaining cell.
min(seur$nFeature_RNA)

In [None]:
# Largest percent.mt among the remaining cells.
max(seur$percent.mt)

In [None]:
# Median percent.mt across all remaining cells.
median(seur$percent.mt)

In [None]:
# Number of remaining cells per sample.
table(seur@meta.data$SampleName)

## Normalize with LogNorm, scale and run PCA

In [None]:
# Redo normalization, variable-feature selection (vst), scaling and PCA on
# the cleaned object.
seur <- NormalizeData(seur)
seur <- FindVariableFeatures(seur, selection.method = "vst")
seur <- ScaleData(seur)
seur <- RunPCA(seur, verbose = FALSE)


## Correct batch effect with Harmony and run UMAP

In [None]:
# Harmony batch correction by sample, then UMAP on Harmony dimensions 1-40.
seur <- RunHarmony(seur, group.by.vars = "SampleName", verbose = FALSE)
seur <- RunUMAP(seur, reduction = "harmony", dims = 1:40)


## Find clusters

In [None]:
# Neighbour graph on Harmony dimensions 1-40, then clustering at
# resolution 0.1.
seur <- FindNeighbors(seur, reduction = "harmony", dims = 1:40)
seur <- FindClusters(seur, resolution = 0.1)


In [None]:
options(repr.plot.width = 12, repr.plot.height = 12)

# Overview panels for the cleaned object: by sample, by experimental group,
# by active identity (labelled), and Carm1 expression.
one <- DimPlot(seur, group.by = "SampleName")
two <- DimPlot(
  seur,
  group.by = "experimental_groups",
  cols = c("darkred", "grey")
)
three <- DimPlot(seur, label = TRUE)
four <- FeaturePlot(seur, features = "Carm1")
ggarrange(plotlist = list(one, two, three, four))

In [None]:
# Number of cells per experimental group after cleaning.
table(seur$experimental_groups)

# <span style="color:green"> Part 2-c (cont): Cell type annotation 2 </span>

In [None]:
# Marker genes shown in the second annotation dot plot (plotted in this order).
signatures <- c(
  "Tcf4", "Bst2", "Ifitm2", "Ifitm3", "Ifitm1", "Ptprc",
  "Mki67", "Top2a", "Xcr1", "Irf8", "Clec9a", "Itgae",
  "Batf3", "Itgam", "Sirpa", "Cmss1", "Cd209a", "Fcgr3",
  "Lyz2", "Cd7", "Zbtb46", "Ccr7"
)


In [None]:
options(repr.plot.width = 12, repr.plot.height = 8)

# Dot plot of the signature genes per identity class for the cleaned object.
marker_dots <- DotPlot(
  seur,
  features = signatures,
  assay = "RNA",
  dot.scale = 18,
  cols = c("white", "red")
)

marker_dots +
  theme_classic(base_size = 20) +
  geom_point(aes(size = pct.exp), shape = 21, color = "grey", stroke = 0.5) +
  scale_colour_gradient2(low = "#4575b4", mid = "white", high = "#d73027") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))


In [None]:
# Number of cells in each cluster after re-clustering.
table(seur$seurat_clusters)

In [None]:
# 0, cDC1
# 1, cycling DC1 
# 2, Ribosomal-high
# 3, cDC2
# 4, activated cDC1 


## find all markers with two-sided Wilcoxon rank-sum test 

In [None]:
# Positive markers only, for every identity class of the cleaned object.
All_markers_wlcx <- FindAllMarkers(
  seur,
  assay = "RNA",
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

In [None]:
# Inspect the top 10 markers per cluster by average log2 fold change.
# slice_max() replaces the superseded top_n(); like top_n() it keeps ties,
# and it returns the rows sorted by avg_log2FC. ungroup() stops the grouping
# from leaking into later operations on the result.
All_markers_wlcx %>%
  group_by(cluster) %>%
  slice_max(avg_log2FC, n = 10) %>%
  ungroup()

## Annotate cell types

In [None]:
# Cell-type labels, assigned positionally: the i-th label goes to the i-th
# identity level of the object.
celltype <- c("DC1", "cycling_DC1", "rRNA_hi_DC", "DC2", "activated_DC1")

# Stop if clustering produced a different number of identity levels than
# there are labels, since the positional mapping would then be wrong.
stopifnot(length(celltype) == length(levels(seur)))
names(celltype) <- levels(seur)

seur <- RenameIdents(seur, celltype)

In [None]:
# Number of cells per annotated cell type.
table(Idents(seur))

In [None]:
# Store the active identities in the metadata so the annotation is kept even
# if the active identity is changed later.
seur$celltype_annotation <- Idents(seur)

In [None]:
# Checkpoint: save the cleaned, clustered and annotated object.
saveRDS(seur, "GEX_CLEANEDsamples_clustered.RDS")

# <span style="color:green"> Subset the Seurat object to DC1 because that's the focus of the study </span>

In [None]:
# Reload the annotated checkpoint saved after cell-type annotation.
seur <- readRDS("GEX_CLEANEDsamples_clustered.RDS")

In [None]:
# Number of cells per active identity before subsetting.
table(Idents(seur))

In [None]:
# Keep only the three DC1-related identity classes.
dc1_idents <- c("DC1", "cycling_DC1", "activated_DC1")
seur <- subset(seur, idents = dc1_idents)


In [None]:
# Label every remaining cell as cDC1 and add a short KO/WT group label for
# later splitting. Both columns are written to the Seurat object itself:
# editing only a detached copy of the metadata (`meta <- seur[[]]`) never
# reaches `seur`, so the labels would be missing from the saved object.
seur$celltype <- "cDC1"

# Read the existing column by its exact name, "experimental_groups".
# `meta$experimental_group` only resolved through `$` partial matching on the
# data frame. The long-form "experimental_groups" column is left untouched
# because the plots below still group by it.
seur$experimental_group <- ifelse(
  seur$experimental_groups == "KO_group", "KO", "WT"
)

# Keep `meta` as an up-to-date copy of the per-cell metadata.
meta <- seur[[]]

In [None]:
# Number of cells per identity class in the DC1-only object.
table(Idents(seur))

## Re-normalize, correct batch effect, and find clusters in the DC1 Seurat object

In [None]:
# Redo normalization, variable-feature selection (vst), scaling and PCA on
# the DC1-only object.
seur <- NormalizeData(seur)
seur <- FindVariableFeatures(seur, selection.method = "vst")
seur <- ScaleData(seur)
seur <- RunPCA(seur, verbose = FALSE)


In [None]:
# Harmony batch correction by sample, then UMAP on Harmony dimensions 1-40.
seur <- RunHarmony(seur, group.by.vars = "SampleName", verbose = FALSE)
seur <- RunUMAP(seur, reduction = "harmony", dims = 1:40)


In [None]:
# Neighbour graph on Harmony dimensions 1-40, then clustering at
# resolution 0.1.
seur <- FindNeighbors(seur, reduction = "harmony", dims = 1:40)
seur <- FindClusters(seur, resolution = 0.1)


In [None]:
options(repr.plot.width = 12, repr.plot.height = 12)

# Overview panels for the DC1 object: by sample, by experimental group, by
# new cluster (labelled), and by the earlier cell-type annotation.
one <- DimPlot(seur, group.by = "SampleName")
two <- DimPlot(
  seur,
  group.by = "experimental_groups",
  cols = c("darkred", "grey")
)
three <- DimPlot(seur, label = TRUE)
four <- DimPlot(seur, group.by = "celltype_annotation")
ggarrange(plotlist = list(one, two, three, four))

## Cell type annotation in DC1 object

In [None]:
# Marker genes shown in the DC1 dot plot (plotted in this order).
signatures <- c(
  "Mki67", "Irf4", "Irf8", "Xcr1",
  "Clec9a", "Relb", "Ccr7", "Sirpa"
)



In [None]:
options(repr.plot.width = 12, repr.plot.height = 8)

# Dot plot of the DC1 signature genes per identity class.
DotPlot(
  seur,
  features = signatures,
  assay = "RNA",
  dot.scale = 12,
  cols = c("white", "red")
) +
  theme_classic(base_size = 12) +
  geom_point(aes(size = pct.exp), shape = 21, color = "grey", stroke = 0.5) +
  scale_colour_gradient2(low = "#4575b4", mid = "white", high = "#d73027") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))


In [None]:
# Labels for the DC1 sub-clusters, assigned positionally: the i-th label
# goes to the i-th identity level of the object.
clusterID <- c("DC1", "cycling_DC1", "activated_DC1")

# Stop if re-clustering produced a different number of identity levels than
# there are labels, since the positional mapping would then be wrong.
stopifnot(length(clusterID) == length(levels(seur)))
names(clusterID) <- levels(seur)

seur <- RenameIdents(seur, clusterID)
# Overwrite the earlier annotation with the labels from this re-clustering.
seur$celltype_annotation <- Idents(seur)

In [None]:
# Number of cells per final DC1 sub-type label.
table(Idents(seur))

In [None]:
options(repr.plot.width = 12, repr.plot.height = 12)

# Final panels: by sample, by experimental group, and by the active
# (renamed) identity.
one <- DimPlot(seur, group.by = "SampleName")
two <- DimPlot(
  seur,
  group.by = "experimental_groups",
  cols = c("darkred", "grey")
)
three <- DimPlot(seur)
ggarrange(plotlist = list(one, two, three))

In [None]:
# Final checkpoint: save the DC1-only, re-clustered and re-annotated object.
saveRDS(seur, "GEX_cDC1_clustered.RDS")