In [24]:
library(readr)
library(GenomicFeatures)
library(DESeq2)
library(org.Mm.eg.db)
library(rjson)
library(tximport)
library(DBI)
library(rje)
library(plyr)
library(tidyverse)

In [25]:
code.dir <- getwd()
base.dir <- gsub('/codes', '', code.dir)
base.dir

In [150]:
wk.dir <- file.path(base.dir, "2_DEseq_out")
setwd(wk.dir)

## 2016_SCIENCE_Mackay

In [51]:
mackey.sp.info <- file.path(base.dir, '1_raw_count_table/2016_SCIENCE_Mackay_rawCount_sample_info.csv')
mackey.sp.info.df <- read_csv(mackey.sp.info)

mackey.srr.info <- file.path(base.dir, '1_raw_count_table/2016_SCIENCE_Mackay_rawCount_SRA_Run_Table.txt')
mackey.srr.info.df <- read_csv(mackey.srr.info) %>% select(one_of(c("GEO_Accession (exp)", "source_name", "Tissue")))
colnames(mackey.srr.info.df) <- c("GEO_Accession", "source_name", "tissue")

mackey.srr.info.df <- mackey.srr.info.df %>% left_join(mackey.sp.info.df)

# Write
out.name <- file.path(base.dir, '1_raw_count_table/2016_SCIENCE_Mackay_rawCount_ref.csv')
write_csv(mackey.srr.info.df, out.name)

Parsed with column specification:
cols(
  GEO_Accession = [31mcol_character()[39m,
  sp_name = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  .default = col_character(),
  AvgSpotLen = [32mcol_double()[39m,
  Bases = [32mcol_double()[39m,
  Bytes = [32mcol_double()[39m,
  ReleaseDate = [34mcol_datetime(format = "")[39m
)

See spec(...) for full column specifications.

Joining, by = "GEO_Accession"



In [132]:
mackey.anno <- file.path(base.dir, '1_raw_count_table/2016_SCIENCE_Mackay_rawCount_ref_annotated.csv')
mackey.anno.df <- read_csv(mackey.anno)

keys <- c()
for (i in mackey.anno.df$sp_name){
    i <- gsub("L003", "", i)
    i_vec <- unlist(strsplit(i, "_"))
    i_vec <- tail(i_vec,2)
    keys <- c(keys, paste(i_vec, collapse="_"))
}

mackey.anno.df$keys <- keys

mackey.anno.df.use <- mackey.anno.df %>% filter(tissue != "Bone marrow") %>% 
    filter(cond_abbr != "NK") %>%
    filter(cond_abbr != "Naive")
head(mackey.anno.df.use)

col.df <- mackey.anno.df.use %>% select(one_of('keys', 'cond_abbr')) %>% column_to_rownames(var="keys")
colnames(col.df) <- c("condition")

Parsed with column specification:
cols(
  GEO_Accession = [31mcol_character()[39m,
  source_name = [31mcol_character()[39m,
  tissue = [31mcol_character()[39m,
  sp_name = [31mcol_character()[39m,
  condition = [31mcol_character()[39m,
  sample_name = [31mcol_character()[39m,
  cond_abbr = [31mcol_character()[39m
)



GEO_Accession,source_name,tissue,sp_name,condition,sample_name,cond_abbr,keys
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GSM1819914,LCMV central memory CD8 T cells_spleen,Spleen,C2_C5N42ANXX_CGATGTAT,LCMV_Tcm_sp,LCMV_Tcm_sp_1,Tcm,C5N42ANXX_CGATGTAT
GSM1819915,LCMV effector memory CD8 T cells_spleen,Spleen,C3_C5N42ANXX_TTAGGCAT,LCMV_Tem_sp,LCMV_Tem_sp_1,Tem,C5N42ANXX_TTAGGCAT
GSM1819916,LCMV effector memory CD8 T cells_liver,Liver,C4_C5N42ANXX_TGACCAAT,LCMV_Tem_Liver,LCMV_Tem_Liver_1,Tem,C5N42ANXX_TGACCAAT
GSM1819917,LCMV tissue-resident memory CD8 T cells_liver,Liver,C5_C5N42ANXX_ACAGTGAT,LCMV_Trm_Liver,LCMV_Trm_Liver_1,Trm,C5N42ANXX_ACAGTGAT
GSM1819918,LCMV tissue-resident memory CD8 T cells_gut,Gut,C6_C5N42ANXX_GCCAATAT,LCMV_Trm_Gut,LCMV_Trm_Gut_1,Trm,C5N42ANXX_GCCAATAT
GSM1819923,LCMV central memory CD8 T cells_spleen,Spleen,D2_C5N42ANXX_GGCTACAT,LCMV_Tcm_sp,LCMV_Tcm_sp_2,Tcm,C5N42ANXX_GGCTACAT


In [133]:
mackay.file <- file.path(base.dir, '1_raw_count_table/2016_SCIENCE_Mackay_rawCount.txt')
mackay.df <- read_tsv(mackay.file)

new_colnames <- c()
for (i in colnames(mackay.df)[3:length(colnames(mackay.df))]){
    i <- gsub("mm10", "",i)
    i <- gsub(".bam", "",i)
    i <- gsub("L003", "",i)
    i_vec <- unlist(strsplit( i, "_"))
    i_vec <- tail(i_vec,2)
    new_colnames <- c(new_colnames, paste(i_vec, collapse="_"))
}

colnames(mackay.df) <- c(c("EntrezID", "Symbol"), new_colnames)

mackay.df <- mackay.df %>% select(one_of(c("Symbol"), mackey.anno.df.use$keys)) %>%
    distinct(Symbol, .keep_all=TRUE)  %>% drop_na(Symbol) %>%
    column_to_rownames(var="Symbol")


head(mackay.df)

Parsed with column specification:
cols(
  .default = col_double(),
  Symbol = [31mcol_character()[39m
)

See spec(...) for full column specifications.



Unnamed: 0_level_0,C5N42ANXX_CGATGTAT,C5N42ANXX_TTAGGCAT,C5N42ANXX_TGACCAAT,C5N42ANXX_ACAGTGAT,C5N42ANXX_GCCAATAT,C5N42ANXX_GGCTACAT,C5N42ANXX_CTTGTAAT,C5N42ANXX_AGTCAACA,C5N42ANXX_AGTTCCGT,C5N42ANXX_ATGTCAGA,C6GUTANXX_ATGTCAGA,C6GUTANXX_GTGAAACG,C6GUTANXX_AGTCAACA,C6GUTANXX_CCGTCCCG,C6GUTANXX_AGTTCCGT,C6GUTANXX_GTCCGCAC
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Xkr4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Gm19938,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Gm10568,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Rp1,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0
Sox17,0,4,0,0,0,0,1,3,0,0,5,0,0,0,0,0
Mrpl15,730,640,621,541,914,604,683,588,641,620,26,19,42,30,33,63


In [134]:
dds <- DESeqDataSetFromMatrix(countData = mackay.df,
                              colData = col.df,
                              design= ~  condition)

summary(dds)

dds <- DESeq(dds)

converting counts to integer mode

“some variables in design formula are characters, converting to factors”


estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [151]:
contrasts <- list(c("Tcm", "Tem"), c("Tcm", "Trm"), c("Tem","Trm"))
for (i in c(1:3)) {
    contrast_i <- contrasts[[i]]
    outname_i <- paste(paste(contrast_i, collapse="_vs_"), ".csv", sep="")
    outname_i <- paste("2016_SCIENCE_Mackay", outname_i, sep="--")
    results <- as_tibble(results(dds, contrast = c("condition", contrast_i)), rownames='Symbol')
    write_csv(results, outname_i)
}

## 2018_IMMUNITY_Wang

In [160]:
wang.sp.info <- file.path(base.dir, '1_raw_count_table/2018_IMMUNITY_Wang_sample_description.csv')
wang.sp.info.df <- read_csv(wang.sp.info)

wang.file <- file.path(base.dir, '1_raw_count_table/2018_IMMUNITY_Wang_rawCount.csv')
wang.df <- read_tsv(wang.file)

Parsed with column specification:
cols(
  Number = [32mcol_double()[39m,
  Date = [31mcol_character()[39m,
  `#` = [32mcol_double()[39m,
  Description = [31mcol_character()[39m,
  Degradation = [31mcol_character()[39m,
  `WT-KO` = [31mcol_character()[39m,
  condition = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  `gene_name,174_11,174_12,174_13,174_14,174_15,174_16,174_17,174_18,174_19,174_20,174_21,174_22,174_23,174_24,174_25,174_26,174_27,174_28,174_29,174_30` = [31mcol_character()[39m
)



In [161]:
wang.sp.info.df.use <- wang.sp.info.df %>% 
    filter(Degradation == "No") %>% 
    filter(`WT-KO` != "KO")

In [162]:
wang.sp.info.df.use

Number,Date,#,Description,Degradation,WT-KO,condition
<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
11,3/28/17,1442,WT2-Day5-EEC,No,WT,D5EE
12,3/28/17,1442,WT2-Day5-SLEC,No,WT,D5TE
15,3/28/17,1443,WT3-Day5-EEC,No,WT,D5EE
19,3/31/17,1442,WT2-Day8-EEC,No,WT,D8EE
20,3/31/17,1442,WT2-Day8-SLEC,No,WT,D8TE
21,3/31/17,1442,WT2-Day8-MPEC,No,WT,D8MP
25,3/31/17,1443,WT3-Day8-EEC,No,WT,D8EE
26,3/31/17,1443,WT3-Day8-SLEC,No,WT,D8TE
27,3/31/17,1443,WT3-Day8-MPEC,No,WT,D8MP
