In [1]:

list_of_packages <- c("dplyr","tidyr","biomaRt")
for(package in list_of_packages){
suppressPackageStartupMessages(suppressMessages(suppressWarnings(library(package,character.only=TRUE))))
}

In [2]:
genes <- c(
    "GSDMA",
    "GSDMB",
    "GSDMC",
    "GSDMD",
    "GSDME",
    "TLR4",
    "TLR5",
    "CASP1",
    "CASP2",
    "CASP3",
    "CASP4",
    "CASP5",
    "CASP6",
    "CASP7",
    "CASP8",
    "CASP9",
    "CASP10",
    "CASP11"
)

In [3]:
mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl",host = "www.ensembl.org", path = "/biomart/martservice")

“Ensembl will soon enforce the use of https.
Ensure the 'host' argument includes "https://"”


In [4]:
data_path <- file.path("../data/gtex_data")
data_files <- list.files(data_path, pattern = "gct", full.names = TRUE)

rna_df <- data.frame()
for (file in data_files){
    tmp_df <- read.table(file, header = TRUE, skip = 2, sep = "\t", stringsAsFactors = FALSE)
    # trim the decimal from the Name column

    tmp_df <- tmp_df[tmp_df$Description %in% genes,]
    
    # rename the Description column to geneID
    colnames(tmp_df)[2] <- "geneID"
    colnames(tmp_df)[1] <- "Ensembl"

    # drop the Name column
    tmp_df <- tmp_df[, !names(tmp_df) %in% "Name"]
    tmp_df <- tmp_df[, !names(tmp_df) %in% "Description"]
    # convert the GeneID to gene symbol
    gene_id <- unique(tmp_df$geneID)
    lookup <- getBM(
        mart = mart,
        attributes = c('transcript_length','hgnc_symbol'),
        filter = 'hgnc_symbol',
        values = gene_id,
        uniqueRows = TRUE
        )
    tmp_df <- merge(tmp_df, lookup, by.x="geneID", by.y="hgnc_symbol", all=TRUE)
    # make the df tidy long format
    tmp_df <- tmp_df %>% 
        pivot_longer(cols = -c(Ensembl, geneID,transcript_length), names_to = "Sample", values_to = "expression") %>%
        mutate(tissue = gsub("../data/gtex_data/gene_reads_v10_", "", file)) %>%
        mutate(tissue = gsub(".gct", "", tissue))
    rna_df <- rbind(rna_df, tmp_df)
}
head(rna_df)

geneID,Ensembl,transcript_length,Sample,expression,tissue
<chr>,<chr>,<int>,<chr>,<int>,<chr>
CASP1,ENSG00000137752.24,1693,GTEX.1117F.0226.SM.5GZZ7,300,adipose_subcutaneous
CASP1,ENSG00000137752.24,1693,GTEX.111CU.1826.SM.5GZYN,1155,adipose_subcutaneous
CASP1,ENSG00000137752.24,1693,GTEX.111FC.0226.SM.5N9B8,1382,adipose_subcutaneous
CASP1,ENSG00000137752.24,1693,GTEX.111VG.2326.SM.5N9BK,1615,adipose_subcutaneous
CASP1,ENSG00000137752.24,1693,GTEX.111YS.2426.SM.5GZZQ,585,adipose_subcutaneous
CASP1,ENSG00000137752.24,1693,GTEX.1122O.2026.SM.9YFMG,1317,adipose_subcutaneous


In [5]:
rna_df$fpkm <- rna_df$expression / rna_df$transcript_length

In [6]:
# save the compiled gtex data to a file

all_gtex_tissue_data <- file.path("../data/genes_of_interest_gtex_tissue_data.tsv")


write.table(rna_df, file = all_gtex_tissue_data, sep = "\t", row.names = FALSE)