In [None]:
library(Signac)
library(Seurat)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork)
library(readr)
library(stringr)
library(dplyr)
set.seed(1234)

#### Send to channel code
library(parallel)
library(ggpubr)
library(car)
library(qvalue)

suppressMessages(library(chromVAR))
suppressMessages(library(motifmatchr))
suppressMessages(library(SummarizedExperiment))

library(pheatmap)
library(RColorBrewer)
library(beeswarm)

In [None]:
# Directory that holds the exported motif-analysis tables
in_dir <- '/nfs/lab/parulk/HPAP_scATAC/motif_analysis/'

# Tab-separated tables: motif annotations, per-cell info and motif variability
motifdata <- read.table(paste0(in_dir, 'motifdata.txt'), sep = "\t")
info <- read.table(paste0(in_dir, 'info_table.txt'), sep = "\t")
variability <- read.table(paste0(in_dir, 'variability.txt'), sep = "\t")

# TFClass reference tables (CSV)
TFClass_Lookup <- read_csv(
  "/nfs/lab/welison/References/220907_WE_Chromvar_to_Gene_By_Subfam_Complete(JAPRAR2022_TFClass).csv"
)
TFClass_Full <- read_csv(
  "/nfs/lab/welison/References/220907_WE_Chromvar_to_Gene_Jaspar2022.csv"
)

# Quick look at what was loaded
head(motifdata)
head(info)
dim(TFClass_Lookup)
head(TFClass_Lookup)

head(variability)
dim(variability)

In [None]:
# Devscores are slow to load, so this is a faster but awkward workaround:
# read the body with vroom (skipping the header line) and restore the column
# names from the first line of the file separately.
dev_file <- paste0(in_dir, 'devscores.txt')

devscores <- vroom::vroom(file = dev_file, skip = 1, col_names = FALSE)
# The first column (X1) holds the row identifiers; move it into the row names
devscores <- tibble::column_to_rownames(devscores, var = "X1")

# Pass the path itself rather than file(dev_file): readLines() then opens and
# closes its own connection instead of leaving an unclosed one behind.
header_names <- str_split(readLines(dev_file, n = 1), " ")[[1]]
# A shorter header would otherwise silently produce NA column names
stopifnot(length(header_names) == ncol(devscores))
colnames(devscores) <- header_names

head(devscores)

In [None]:
# Original deviation score object; may not be needed now that variability is
# loaded directly from file
dev <- readRDS(
  file = "/nfs/lab/parulk/HPAP_scATAC/motif_analysis/ChromVAR_Object.RDS"
)

# JASPAR 2022 motif object
jaspar.motifs <- readRDS(
  file = '/nfs/lab/welison/multiome/chromvar/jaspar_2022_object.Rdata'
)

dev

In [None]:
# Filter by variability: inspect the distribution and candidate cutoffs

# Histogram of motif variability with the two candidate cutoffs marked
ggplot(variability, aes(x = variability)) +
  geom_histogram() +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = 1.6)

# Number of motifs above the 1.2 cutoff ...
sum(variability$variability > 1.2)
# ... out of the total number of motifs. The original
# length(variability$variability > 1.2) also returned the total, since the
# length of a logical vector ignores its values; spelled out to avoid confusion.
length(variability$variability)

# Motifs passing the raw and adjusted p-value thresholds
sum(variability$p_value < 0.05)
sum(variability$p_value_adj < 0.05)

# Motifs passing both the adjusted p-value and the variability cutoff
sum(variability$p_value_adj < 0.05 & variability$variability > 1.2)

In [None]:
# Subset deviation scores by variability: keep motifs with variability > 1.2
variable_to_keep <- rownames(filter(variability, variability > 1.2))

var_subset_devscores <- devscores[variable_to_keep, ]
dim(var_subset_devscores)
head(var_subset_devscores)

# Mean deviation score of every kept motif within each cluster
# (rows = motifs, one column per entry of unique(info$cluster))
mat_type <- sapply(
  unique(info$cluster),
  function(i) rowMeans(var_subset_devscores[info$cells[info$cluster == i]], na.rm = TRUE)
)
head(mat_type)
dim(mat_type)

mat_df <- as.data.frame(mat_type)

# Motifs whose mean deviation exceeds 5 in absolute value in at least one
# cluster; rowSums(...) > 0 is the vectorised form of any(..., na.rm = TRUE)
mat_df_high_dev <- mat_df[rowSums(abs(mat_df) > 5, na.rm = TRUE) > 0, ]

dim(mat_df_high_dev)
head(mat_df_high_dev, n = 10)

# Some cell types don't have enough cells for this to be meaningful
cluster_sizes <- table(info$cluster)
cluster_sizes

# Remove cell types with fewer than 50 cells.
# The original select(mat_df) selected no columns at all and returned an empty
# data frame. Columns of mat_df follow the order of unique(info$cluster), so
# the keep-mask is built positionally from that same vector.
min_cells <- 50
large_clusters <- names(cluster_sizes)[cluster_sizes >= min_cells]
mat_df_filt <- mat_df[, as.character(unique(info$cluster)) %in% large_clusters, drop = FALSE]

colnames(motifdata) <- c('Motif', 'Name', 'jaspar_family')

# Attach the TFClass family to each motif; Name is the only shared column, so
# it is given explicitly as the join key
motifdata_fixed <- left_join(
  motifdata,
  select(TFClass_Full, Name = jaspar_name_1, TFClass_family = lowest_level_fam_1),
  by = "Name"
)
# Drop fully duplicated rows introduced by the join
motifdata_fixed <- motifdata_fixed[!duplicated(motifdata_fixed), ]
head(motifdata_fixed)
dim(motifdata_fixed)

head(mat_df)

# Plots

In [None]:
# out_dir is first assigned further down the notebook, so define it here to
# let this preview cell run in a top-to-bottom execution. The CSV itself is
# produced by the export cell later in the notebook.
out_dir <- "/nfs/lab/parulk/HPAP_scATAC/motif_analysis/"
head(read.table(paste0(out_dir,"variable_devscores.csv"), header=TRUE, row.names=1, sep=","))

In [None]:
# out_dir is first assigned further down the notebook, so define it here to
# let this preview cell run in a top-to-bottom execution. The CSV itself is
# produced by the export cell later in the notebook.
out_dir <- "/nfs/lab/parulk/HPAP_scATAC/motif_analysis/"
head(read.table(paste0(out_dir,"celltype_average_deviations.csv"), header=TRUE, row.names=1, sep=","))

In [None]:
# Relabel the "Control" group as "ND"
info$groups <- sub("Control", "ND", info$groups)

# Sample-level metadata (disease status, age, gender, BMI)
wd_meta <- '/nfs/lab/parulk/HPAP_scATAC/'
meta.data <- read.csv(paste0(wd_meta, 'HPAP-scATAC-metadata-qc.csv'))
disease <- meta.data[, c("Sample_Name", "Disease", "Age", "Gender", "BMI")]

# One row per sample id from info, mapping its `def` name to the sample id
sample_lookup <- select(
  info[!duplicated(info$samples), ],
  Sample_Name = def, sample = samples
)

# Keep only metadata rows for samples present in info. The join key is given
# explicitly so an extra shared column can never change the match silently.
disease_remove <- left_join(disease, sample_lookup, by = "Sample_Name") %>%
  filter(!is.na(sample))
disease_remove$sample <- as.character(disease_remove$sample)

head(disease_remove)

In [None]:
# Display the complete disease_remove table (only its head was shown before)
disease_remove

In [None]:
# Combined "<cluster>:<sample>" label for every cell
info$cluster_samples <- paste(info$cluster, info$samples, sep = ":")

In [None]:
# Preview the per-cell info table
head(info)

In [None]:
# Restrict the deviation scores to motifs with variability above 1.2
variable_to_keep <- variability %>%
  filter(variability > 1.2) %>%
  rownames()

var_subset_devscores <- devscores[variable_to_keep, ]

# Size and first rows of the subset
dim(var_subset_devscores)
head(var_subset_devscores)

In [None]:
# Average the variable-motif deviation scores over the cells of every
# cluster:sample combination (one column per combination)
persamp_pertype <- sapply(
  unique(info$cluster_samples),
  function(group_id) {
    group_cells <- info$cells[info$cluster_samples == group_id]
    rowMeans(as.data.frame(var_subset_devscores[group_cells]), na.rm = TRUE)
  }
)

head(persamp_pertype)

# Transpose so rows are cluster:sample combinations and columns are motifs
persamp_pertype_transform <- as.data.frame(t(persamp_pertype))

# Row names are the "<cluster>:<sample>" labels; split them once and reuse
id_parts <- str_split(rownames(persamp_pertype_transform), ":", simplify = TRUE)
persamp_pertype_transform$Cell_Type <- id_parts[, 1]
persamp_pertype_transform$sample <- id_parts[, 2]

dim(persamp_pertype_transform)
head(persamp_pertype_transform)

# Attach the sample-level metadata to every cluster:sample row
motif_matrix <- dplyr::inner_join(
  persamp_pertype_transform,
  disease_remove,
  by = 'sample'
)
dim(motif_matrix)
head(motif_matrix)

In [None]:
# Save the per-sample, per-cell-type motif matrix as a tab-separated file
write.table(
  motif_matrix,
  file = paste0(in_dir, 'motif_matrix_boxplot.txt'),
  sep = '\t',
  quote = FALSE
)

In [None]:
# Boxplot of the MA0081.2_SPIB column of motif_matrix, grouped by cell type
ggplot(motif_matrix, aes(x = Cell_Type, y = MA0081.2_SPIB)) +
  geom_boxplot()

In [None]:
# Destination directory for the exported CSV tables
out_dir <- "/nfs/lab/parulk/HPAP_scATAC/motif_analysis/"

#write.csv(devscores, paste0(out_dir,"devscores.csv"))
# Export the variability-filtered scores and the per-cell-type averages
write.csv(
  var_subset_devscores,
  file = paste0(out_dir, "variable_devscores.csv")
)
write.csv(
  mat_df,
  file = paste0(out_dir, "celltype_average_deviations.csv")
)

In [None]:
# Directory the CSV exports were written to
out_dir <- "/nfs/lab/parulk/HPAP_scATAC/motif_analysis/"

# Read the exported file back (header = TRUE is read.csv's default) and preview it
head(read.csv(paste0(out_dir, "variable_devscores.csv")))

In [None]:
# Show the output directory currently in use
out_dir

In [None]:
# Re-read the exported variable deviation scores, using the first column as
# row names, and preview the first rows
paste0(out_dir, "variable_devscores.csv") %>%
  read.table(header = TRUE, row.names = 1, sep = ",") %>%
  head()

In [None]:
# Re-read the exported per-cell-type average deviations, using the first
# column as row names, and preview the first rows
paste0(out_dir, "celltype_average_deviations.csv") %>%
  read.table(header = TRUE, row.names = 1, sep = ",") %>%
  head()

# Plot only the most variable deviation scores, trying several variability thresholds and top-N cutoffs.

In [None]:
# Motif names passing increasingly strict variability cutoffs
var_1.2 <- variability %>% filter(variability > 1.2) %>% rownames()
var_1.3 <- variability %>% filter(variability > 1.3) %>% rownames()
var_1.4 <- variability %>% filter(variability > 1.4) %>% rownames()
var_1.5 <- variability %>% filter(variability > 1.5) %>% rownames()

In [None]:
# Rank the motifs once, most variable first, instead of re-sorting per cutoff
motifs_by_variability <- rownames(arrange(variability, -variability))

# head() stops at the number of motifs available, whereas indexing with
# [1:N, ] pads the result with NA rows when fewer than N motifs exist
top_50 <- head(motifs_by_variability, 50)
top_100 <- head(motifs_by_variability, 100)
top_150 <- head(motifs_by_variability, 150)
top_200 <- head(motifs_by_variability, 200)
top_250 <- head(motifs_by_variability, 250)

In [None]:
# Per-cluster mean deviation score for every motif (no variability filter).
# Removal of cell types with fewer than 50 cells is currently disabled; see
# the commented-out select() below.
mat_full <- data.frame(
  sapply(unique(info$cluster), function(cluster_id) {
    cluster_cells <- info$cells[info$cluster == cluster_id]
    rowMeans(devscores[cluster_cells], na.rm = TRUE)
  })
)

#mat_full <- select(mat_full, -Schwann, -MUC5b_Ductal, - Bcells, -Mast, -LymphEndo)

head(mat_full)
dim(mat_full)

In [None]:
# NOTE(review): `mat` is not assigned anywhere in the visible part of this
# notebook; presumably this was meant to inspect `mat_full`, or an object
# created in a cell not shown here. TODO confirm before running.
head(mat)
dim(mat)