In [2]:
library(tidyverse)
library(readr)
library(GenomicFeatures)
library(DESeq2)
library(org.Mm.eg.db)
library(rjson)
library(tximport)
library(DBI)
library(rje)
library(plyr)

codedir <- getwd()

In [3]:
######################################## Convert ENSEMBL ID to gene symbols ########################################
# Download convert table from:http://useast.ensembl.org/biomart/martview/8c1957c27101a044a318d51140a289e1

cv_file <- '/home/pipkin/references/mm_BioMart_GeneStableID_GeneName.txt'
cv_tb <- read_csv(cv_file)

matchGN <- function(input, outfilename, cvTb=cv_tb){
    colnames(input) <- c("ensembl_stable_ID", colnames(input)[2:length(colnames(input))])
    output <- cvTb %>% right_join(input, by="ensembl_stable_ID")
    output$ensembl_stable_ID <- NULL
    write_csv(output, outfilename)
}

Parsed with column specification:
cols(
  ensembl_stable_ID = [31mcol_character()[39m,
  gene_name = [31mcol_character()[39m
)



## 0. Prepare reference

In [4]:
###--- Make reference
#txdb <- makeTxDbFromGFF('/home/pipkin/references/GRCm38.99/Mus_musculus.GRCm38.99.gtf')
#saveDb(txdb, file='/home/pipkin/references/GRCm38.99/Mus_musculus.GRCm38.99')
mmRef <- '/home/pipkin/references/GRCm38.99/Mus_musculus.GRCm38.99'

###--- Convert transcript ID to gene ID
txdb <- loadDb(mmRef)
k <- keys(txdb, "GENEID")
res <- AnnotationDbi::select(txdb, k, "TXNAME", "GENEID")
tx2gene <- res[,2:1]

'select()' returned 1:many mapping between keys and columns



## 1. Differential comparison metadata (GEO_processed)

In [5]:
diff_comp_df <- read_csv("Differential_Comparisons.csv")
batches <- unique(diff_comp_df$Batch)

print(batches)

Parsed with column specification:
cols(
  Batch = [31mcol_character()[39m,
  Name = [31mcol_character()[39m,
  Celltype1 = [31mcol_character()[39m,
  Celltype2 = [31mcol_character()[39m
)



[1] "GSE72408"  "GSE68056"  "GSE132110" "GSE70813" 


## 2. Run differential analysis (GEO_processed)

In [6]:
salmon_out_dir <- '/media/pipkin/ROCKET-PRO/GEO_RNAseq/1_salmon_out'
deseq_out_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DEseq_out'

In [9]:
for (batch in batches){
    batch_info_file <- paste(codedir, "/", batch, "_SRR_Info.csv",sep="")
    batch_info_df <- read_csv(batch_info_file)

    diff_comp_df_batch <- diff_comp_df %>% filter(Batch == batch)
    use_celltypes <- unique(c(diff_comp_df_batch$Celltype1, diff_comp_df_batch$Celltype2)) 

    ###----- Sample name & condition setup
    colData <- batch_info_df %>% filter(Cell_type %in% use_celltypes) %>% dplyr::select(one_of(c('Run', 'Name', 'Cell_type')))
    conditions <- colData$Cell_type
    colData$Cell_type <- NULL
    colnames(colData) <- c("Samples", 'Cond')
    if (! is.subset(use_celltypes, conditions)) { # Check if all conditions are found
        print(paste("Missing condition in batch", batch, sep=" "))
        print("--------------------")
        print("Conditions to be used: ")
        print(use_celltypes)
        print("--------------------")
        print("Conditions found:")
        print(unique(conditions))
    }

    # Read files
    files <- file.path(salmon_out_dir,colData$Samples,"quant.sf")
    names(files) <- colData$Cond
    txi <- tximport(files, type="salmon", tx2gene=tx2gene, ignoreTxVersion = TRUE, dropInfReps = TRUE) # Drop in freps TURE = ignore verison  # Ignore TX verison stringsplits on . 

    # Build sample table
    sampleTable <- data.frame(condition = factor(conditions))
    rownames(sampleTable) <- colnames(txi$counts)

    #import into DESEQ2 framework
    dds <- DESeqDataSetFromTximport(txi, sampleTable, ~ condition)
    summary(dds)

    ###----- Run DEseq
    dds <- DESeq(dds)

    ###----- Write outputs
    print("--------------------")
    print(paste("Write outputs for ", batch, sep=""))
    for (i in c(1:nrow(diff_comp_df_batch))) {
        contrast_i <- c("condition", diff_comp_df_batch$Celltype1[i], diff_comp_df_batch$Celltype2[i])
        print(contrast_i[2:3])
        out_name_i <- paste(batch, "_", diff_comp_df_batch$Name[i], ".csv", sep="")
        out_name_i <- file.path(deseq_out_dir, out_name_i)
        out_name_i_gn <- gsub(".csv","_gn.csv",  out_name_i)
        results <- as_tibble(results(dds, contrast = contrast_i), rownames='ensembl_id')
        write_csv(results, out_name_i)
        matchGN(results, out_name_i_gn)
    }
}

Parsed with column specification:
cols(
  Run = [31mcol_character()[39m,
  Info = [31mcol_character()[39m,
  Cell_type = [31mcol_character()[39m,
  Name = [31mcol_character()[39m
)

reading in files with read_tsv

1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 
17 
18 
19 
20 
21 


transcripts missing from tx2gene: 175

summarizing abundance

summarizing counts

summarizing length

using counts and average transcript lengths from tximport

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



[1] "--------------------"
[1] "Write outputs for GSE72408"
[1] "WT"     "Zeb2KO"
[1] "WT"      "Tbx21KO"
[1] "Klrg1hi_WT"     "Klrg1hi_Zeb2KO"
[1] "Klrg1hi_WT"      "Klrg1hi_Tbx21OE"
[1] "Klrg1hi_Zeb2KO"         "Klrg1hi_Zeb2KO_Tbx21OE"
[1] "Klrg1hi_Tbx21OE"        "Klrg1hi_Zeb2KO_Tbx21OE"


Parsed with column specification:
cols(
  Run = [31mcol_character()[39m,
  Description = [31mcol_character()[39m,
  Info = [31mcol_character()[39m,
  Cell_type = [31mcol_character()[39m,
  Name = [31mcol_character()[39m
)

reading in files with read_tsv

1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 


transcripts missing from tx2gene: 175

summarizing abundance

summarizing counts

summarizing length

using counts and average transcript lengths from tximport

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

-- replacing outliers and refitting for 566 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

estimating dispersions

fitting model and testing



[1] "--------------------"
[1] "Write outputs for GSE68056"
[1] "WT"      "Tbx21KO"
[1] "WT"       "Blimp1KO"
[1] "WT"      "Il2raKO"
[1] "Il2raKO"          "Il2raKO_Blimp1KO"
[1] "Blimp1KO"         "Il2raKO_Blimp1KO"


Parsed with column specification:
cols(
  Run = [31mcol_character()[39m,
  Info = [31mcol_character()[39m,
  Cell_type = [31mcol_character()[39m,
  Name = [31mcol_character()[39m
)

reading in files with read_tsv

1 
2 
3 
4 
5 
6 


transcripts missing from tx2gene: 175

summarizing abundance

summarizing counts

summarizing length

using counts and average transcript lengths from tximport

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



[1] "--------------------"
[1] "Write outputs for GSE132110"
[1] "Tim3pos" "Tim3neg"


Parsed with column specification:
cols(
  Run = [31mcol_character()[39m,
  Cell_description = [31mcol_character()[39m,
  Tissue = [31mcol_character()[39m,
  Genotype = [31mcol_character()[39m,
  Cell_type = [31mcol_character()[39m,
  Name = [31mcol_character()[39m
)

reading in files with read_tsv

1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 
17 
18 
19 
20 
21 
22 
23 
24 
25 
26 
27 
28 


transcripts missing from tx2gene: 175

summarizing abundance

summarizing counts

summarizing length

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbers, and delimiters '_' or '.', as these are safe characters

using counts and average transcript lengths from tximport

estimating size factors

  Note: levels of factors in the design contain characters other than
  letters, numbers, '_' and '.'. It is recommended (but not required) to use
  only letters, numbe

[1] "--------------------"
[1] "Write outputs for GSE70813"
[1] "WT"       "Hobit_KO"
[1] "WT"        "Blimp1_KO"
[1] "Hobit_KO"        "Hobit-Blimp1_KO"
[1] "Blimp1_KO"       "Hobit-Blimp1_KO"
[1] "Sp_TCM" "Sp_TEM"
[1] "Liver_TEM" "Liver_TRM"
[1] "Sp_TCM"    "Liver_TRM"
[1] "Sp_TCM"    "Liver_TEM"
[1] "Sp_TCM"   "Skin_TRM"
[1] "Sp_TCM"  "Gut_TRM"
[1] "Sp_TEM"    "Liver_TRM"
[1] "Sp_TEM"    "Liver_TEM"
[1] "Sp_TEM"   "Skin_TRM"
[1] "Sp_TEM"  "Gut_TRM"


## 3. Create GSEA signature (GEO_processed)

In [15]:
wk_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/GSEA'
setwd(wk_dir)

deseq_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DEseq_out'
gn_DEseq_files <- list.files(deseq_dir,pattern="*gn.csv", full.name=TRUE)

In [17]:
gs_name <- c()
gene_symbol <- c()

file_i <- gn_DEseq_files[1]

for (file_i in gn_DEseq_files){
    file_i_name_simp <- basename(file_i)
    file_i_name_simp <- gsub("_gn.csv", "", file_i_name_simp)
    file_i_name_simp <- gsub("_vs_", "_", file_i_name_simp)
    file_i_name_simp <- gsub("---", "_", file_i_name_simp)
    file_i_name_simp_vec <- unlist(strsplit(file_i_name_simp, "_"))
    file_i_name_cp <- tail(file_i_name_simp_vec, 2)

    gs_name_1 <- paste(gsub("_gn.csv", "", basename(file_i)), "---", file_i_name_cp[1], sep="")
    gs_name_2 <- paste(gsub("_gn.csv", "", basename(file_i)), "---", file_i_name_cp[2], sep="")

    df_i <- read_csv(file_i)
    
    df_i_up <- df_i %>% filter(padj <= 0.05) %>% filter(log2FoldChange >= 1)
    df_i_dn <- df_i %>% filter(padj <= 0.05) %>% filter(log2FoldChange <= -1)

    gene_symbol <- c(gene_symbol, df_i_up$gene_name)
    gs_name <- c(gs_name, rep(gs_name_1, nrow(df_i_up)))

    gene_symbol <- c(gene_symbol, df_i_dn$gene_name)
    gs_name <- c(gs_name, rep(gs_name_2, nrow(df_i_dn)))
}

### Create dataframe and summarize
gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "GEO_processed/GEO_signatures.csv")
write_csv(count_df, "GEO_processed/GEO_sigantures_count.csv")

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  baseMean = [32mcol_dou

*Select signatures to use*

In [20]:
wk_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/GSEA'
setwd(wk_dir)

use_sig <- 'GEO_processed/gs_use.csv'
use_sig_df <- read_csv(use_sig)

gsea_df <- read_csv("GEO_processed/GEO_signatures.csv")
gsea_df_use <- gsea_df %>% filter(gs_name %in% use_sig_df$gs_name)

write_csv(gsea_df_use, "GSEA_to_be_combined/GEO_signatures_use.csv")

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)



## 4. Create signatures for DE from supplementary data of publications

In [21]:
wk_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/GSEA'
setwd(wk_dir)

deseq_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP'

### WangDapeng - Runx3

In [33]:
DW_gn_DEseq_files <- list.files(deseq_dir,pattern="WangDapeng*", full.name=TRUE)

d5_KLRG1lo <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/WangDapeng_D5EEC---Runx3KO_vs_WT_gn.csv'
d5_KLRG1hi <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/WangDapeng_D5SLEC---Runx3KO_vs_WT_gn.csv'

d5_KLRG1lo_df <- read_csv(d5_KLRG1lo)
d5_KLRG1hi_df <- read_csv(d5_KLRG1hi)

d5_KLRG1lo_Runx3_promoted <- d5_KLRG1lo_df %>% dplyr::filter(padj <= 0.05) %>% 
dplyr::filter(log2FoldChange <= -1) %>% .$gene_name
d5_KLRG1lo_Runx3_repressed <- d5_KLRG1lo_df %>% dplyr::filter(padj <= 0.05) %>% 
dplyr::filter(log2FoldChange >= -1) %>% .$gene_name

d5_KLRG1hi_Runx3_promoted <- d5_KLRG1hi_df %>% dplyr::filter(padj <= 0.05) %>% 
dplyr::filter(log2FoldChange <= -1) %>% .$gene_name
d5_KLRG1hi_Runx3_repressed <- d5_KLRG1hi_df %>% dplyr::filter(padj <= 0.05) %>% 
dplyr::filter(log2FoldChange >= -1) %>% .$gene_name

gs_name <- c(rep("d5_KLRG1lo_Runx3_promoted", length(d5_KLRG1lo_Runx3_promoted)), 
             rep("d5_KLRG1lo_Runx3_repressed", length(d5_KLRG1lo_Runx3_repressed)),
             rep("d5_KLRG1hi_Runx3_promoted", length(d5_KLRG1hi_Runx3_promoted)), 
             rep("d5_KLRG1hi_Runx3_repressed", length(d5_KLRG1hi_Runx3_repressed)))
gene_symbol <- c(d5_KLRG1lo_Runx3_promoted, d5_KLRG1lo_Runx3_repressed, 
                 d5_KLRG1hi_Runx3_promoted, d5_KLRG1hi_Runx3_repressed)

gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "GSEA_to_be_combined/WangDapeng_GSEA_signatures.csv")
write_csv(count_df, "GSEA_to_be_combined/count_summary/WangDapeng_GSEA_count.csv")

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  baseMean = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m,
  lfcSE = [32mcol_double()[39m,
  stat = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)



### BeltraJeanChristophe - Tex subsets

In [34]:
BeltraJeanChristophe_subset_files <- list.files(deseq_dir,pattern="BeltraJeanChristophe*", full.name=TRUE)

gs_name <- c()
gene_symbol <- c()

for (file_i in BeltraJeanChristophe_subset_files){
    i_df <- read_csv(file_i) %>% mutate(rowSum = rowSums(.[2:4]))
    
    i_name <- gsub(".csv","",basename(file_i))
    i_name <- tail(unlist(strsplit(i_name, "_")), 1)
    i_name_up <- paste(i_name, "up", sep="_")
    i_name_dn <- paste(i_name, "dn", sep="_")
    
    i_up <- i_df %>% filter(rowSum >= 2) %>% .$gene_name
    i_dn <- i_df %>% filter(rowSum <= -2) %>% .$gene_name
    
    gs_name <- c(gs_name, rep(i_name_up, length(i_up)), rep(i_name_dn, length(i_dn)))
    gene_symbol <- c(gene_symbol, i_up, i_dn)
}

gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "GSEA_to_be_combined/BeltraJeanChristophe_GSEA_signatures.csv")
write_csv(count_df, "GSEA_to_be_combined/count_summary/BeltraJeanChristophe_GSEA_count.csv")

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  `Texint vs Texprog1` = [32mcol_double()[39m,
  `Texint vs Texprog2` = [32mcol_double()[39m,
  `Texint vs Texterm` = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  `Texprog1 vs Texprog2` = [32mcol_double()[39m,
  `Texprog1 vs Texint` = [32mcol_double()[39m,
  `Texprog1 vs Texterm` = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  `Texprog2 vs Texprog1` = [32mcol_double()[39m,
  `Texprog2 vs Texint` = [32mcol_double()[39m,
  `Texprog2 vs Texterm` = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  `Texterm vs Texprog1` = [32mcol_double()[39m,
  `Texterm vs Texprog2` = [32mcol_double()[39m,
  `Texterm vs Texint` = [32mcol_double()[39m
)



### KhanOmar - Tox

In [35]:
KhanOmar_files <- list.files(deseq_dir,pattern="KhanOmar*", full.name=TRUE)

Tox_KO_file <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/KhanOmar_ToxKO_vs_WT_gn.csv'
Tox_OE_file <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/KhanOmar_ToxOE_vs_WT_gn.csv'

gs_name <- c()
gene_symbol <- c()

Tox_KO_up <- read_csv(Tox_KO_file) %>% dplyr::filter(padj <= 0.05) %>% dplyr::filter(log2FoldChange >= 1) %>% .$gene_name
Tox_KO_dn <- read_csv(Tox_KO_file) %>% dplyr::filter(padj <= 0.05) %>% dplyr::filter(log2FoldChange <= -1) %>% .$gene_name
Tox_OE_up <- read_csv(Tox_OE_file) %>% dplyr::filter(padj <= 0.05) %>% dplyr::filter(log2FoldChange >= 1) %>% .$gene_name
Tox_OE_dn <- read_csv(Tox_OE_file) %>% dplyr::filter(padj <= 0.05) %>% dplyr::filter(log2FoldChange <= -1) %>% .$gene_name

Tox_promoted <- c(Tox_KO_dn, Tox_OE_up)
Tox_repressed <- c(Tox_KO_up, Tox_OE_dn)

gs_name <- c(rep("Tox_promoted", length(Tox_promoted)), rep("Tox_repressed", length(Tox_repressed)))
gene_symbol <- c(Tox_promoted, Tox_repressed)

gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "GSEA_to_be_combined/KhanOmar_GSEA_signatures.csv")
write_csv(count_df, "GSEA_to_be_combined/count_summary/KhanOmar_GSEA_count.csv")

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  log2FoldChange = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  log2FoldChange = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  log2FoldChange = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  log2FoldChange = [32mcol_double()[39m,
  padj = [32mcol_double()[39m
)



### WuTuoqi - Tcf7

In [36]:
Wu_DEseq_files <- list.files(deseq_dir,pattern="WuTuoqi*", full.name=TRUE)

Tcf1oe_file <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/WuTuoqi_TCF1OE_vs_WT_gn.csv'
Tcf7ko_file <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/WuTuoqi_Tcf7KO_vs_WT_gn.csv'
Tim3Blimp1_file <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/WuTuoqi_Tim3negBlimp1neg_vs_Tim3posBlimp1pos_gn.csv'

Tcf1oe_tb <- read_csv(Tcf1oe_file)
Tcf7ko_tb <- read_csv(Tcf7ko_file)
Tim3Blimp1_tb <- read_csv(Tim3Blimp1_file)

Tcf1_promoted <- Tcf1oe_tb %>% dplyr::filter(`p-value` <= 0.05) %>% dplyr::filter(log2FoldChange >= 1) %>% .$gene_name
Tcf1_repressed <- Tcf1oe_tb %>% dplyr::filter(`p-value` <= 0.05) %>% dplyr::filter(log2FoldChange <= -1) %>% .$gene_name
Tcf7ko_promoted <- Tcf7ko_tb %>% dplyr::filter(`p-value` <= 0.05) %>% dplyr::filter(log2FoldChange >= 1) %>% .$gene_name
Tcf7ko_repressed <- Tcf7ko_tb %>% dplyr::filter(`p-value` <= 0.05) %>% dplyr::filter(log2FoldChange <= -1) %>% .$gene_name

Tcf1_promoted <- c(Tcf1_promoted, Tcf7ko_repressed)
Tcf1_repressed <- c(Tcf1_repressed, Tcf7ko_promoted)

Tim3negBlimp1neg <- Tim3Blimp1_tb %>% dplyr::filter(`p-value` <= 0.05) %>% dplyr::filter(log2FoldChange >= 1) %>% .$gene_name
Tim3posBlimp1pos <- Tim3Blimp1_tb %>% dplyr::filter(`p-value` <= 0.05) %>% dplyr::filter(log2FoldChange <= -1) %>% .$gene_name

gs_name <- c(rep("Tcf7_promoted", length(Tcf1_promoted)), 
            rep("Tcf7_repressed", length(Tcf1_repressed)),
            rep("Tim3negBlimp1neg", length(Tim3negBlimp1neg)),
            rep("Tim3posBlimp1pos", length(Tim3posBlimp1pos)))

gene_symbol <- c(Tcf1_promoted, Tcf1_repressed, Tim3negBlimp1neg, Tim3posBlimp1pos)

gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "GSEA_to_be_combined/WuTuoqi_GSEA_signatures.csv")
write_csv(count_df, "GSEA_to_be_combined/count_summary/WuTuoqi_GSEA_count.csv")

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  `p-value` = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  `p-value` = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  `p-value` = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m
)



### KurdNadia - TRM v.s. TE precursor

In [37]:
KurdNadia_file <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/KurdNadia_Cluster16_vs_Cluster20_gn.csv'
KN_df <- read_csv(KurdNadia_file)

TRMbiased <- KN_df %>% dplyr::filter(log10pval < -1.3) %>% dplyr::filter(log2FoldChange >= 0.5) %>% .$gene_name
TEbiased <- KN_df %>% dplyr::filter(log10pval < -1.3) %>% dplyr::filter(log2FoldChange <= -0.5) %>% .$gene_name

gs_name <- c(rep("TRM_biased", length(TRMbiased)), rep("TE_biased", length(TEbiased)))
gene_symbol <- c(TRMbiased, TEbiased)

gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "GSEA_to_be_combined/KurdNadia_GSEA_signatures.csv")
write_csv(count_df, "GSEA_to_be_combined/count_summary/KurdNadia_GSEA_count.csv")

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  Cluster_16_Mean_TPM = [32mcol_double()[39m,
  Cluster_20_Mean_TPM = [32mcol_double()[39m,
  log10pval = [32mcol_double()[39m,
  log2FoldChange = [32mcol_double()[39m
)

“1 parsing failure.
 row       col expected actual                                                                                                             file
3325 log10pval a double    N/A '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/KurdNadia_Cluster16_vs_Cluster20_gn.csv'
”


## Preston Gavin - Klf2

In [40]:
klf2_file <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/DE_from_SUP/PrestonGavin_Klf2_vs_WT_gn.csv'
klf2_tb <- read_csv(klf2_file)

klf2_promoted <- klf2_tb %>% dplyr::filter(pvalue <= 0.05) %>% 
dplyr::filter(log2FoldChange >= 1) %>% .$gene_name
klf2_repressed <- klf2_tb %>% dplyr::filter(pvalue <= 0.05) %>% 
dplyr::filter(log2FoldChange <= -1) %>% .$gene_name

gs_name <- c(rep("klf2_promoted", length(klf2_promoted)), rep("klf2_repressed", length(klf2_repressed)))
gene_symbol <- c(klf2_promoted, klf2_repressed)

gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "GSEA_to_be_combined/PrestonGavin_GSEA_signatures.csv")
write_csv(count_df, "GSEA_to_be_combined/count_summary/PrestonGavin_GSEA_count.csv")

Parsed with column specification:
cols(
  gene_name = [31mcol_character()[39m,
  log2FoldChange = [32mcol_double()[39m,
  FoldChange = [32mcol_double()[39m,
  pvalue = [32mcol_double()[39m
)



## 5. Merge all signatures

In [44]:
wk_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/GSEA'
setwd(wk_dir)


input_dir <- '/media/pipkin/Yolanda/Exp391_Acute-Chronic_SC/z_References/GSEA/GSEA_to_be_combined'
signature_files <- list.files(input_dir,pattern="*.csv", full.name=TRUE)

# Merge with gsea signatures curated previously
gs.old <- "/media/pipkin/Yolanda/Exp334CD25KOSc/source/GSEA/all_GSEA_20200205.csv"
signature_files <- c(signature_files, gs.old)
gs_name <- c()
gene_symbol <- c()
for (file_i in signature_files){
    i_df <- read_csv(file_i)
    gs_name <- c(gs_name, i_df$gs_name)
    gene_symbol <- c(gene_symbol, i_df$gene_symbol)
}

gsea_df <- as.tibble(data.frame(gs_name, gene_symbol))
count_df <- count(gsea_df$gs_name)
colnames(count_df) <- c("gs_name", "gene_number")

write_csv(gsea_df, "Combined_GEO_GSEA_signatures.csv")
write_csv(count_df, "Combined_GEO_GSEA_count.csv")


Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  gs_name = [31mcol_character()[39m,
  gene_symbol = [31mcol_character()[39m
)

