In [1]:

list_of_packages <- c("dplyr","tidyr","biomaRt")
for(package in list_of_packages){
suppressPackageStartupMessages(suppressMessages(suppressWarnings(library(package,character.only=TRUE))))
}

In [2]:
# Gene symbols of interest, in order: GSDMA-GSDME, TLR4, TLR5, CASP1-CASP11.
genes <- c(
    paste0("GSDM", LETTERS[1:5]),
    paste0("TLR", 4:5),
    paste0("CASP", 1:11)
)
# Switch read by later cells: when TRUE the expression tables are filtered to
# `genes` and the output is written to the genes-of-interest file.
selected_genes <- TRUE

In [4]:
# Compile every per-tissue GTEx read-count table (.gct) found under data_path
# into one long-format table, rna_df, with the columns
# Ensembl, geneID, transcript_length, Sample, expression and tissue.
data_path <- file.path("../data/gtex_data")
data_files <- list.files(data_path, pattern = "gct", full.names = TRUE)
if (length(data_files) == 0) {
    # Fail early instead of silently producing (and later saving) an empty table.
    stop("No gct files found in ", data_path, call. = FALSE)
}

# Open the BioMart connection once. It does not depend on the file being
# processed, and reconnecting for every file multiplies the number of network
# round trips (and therefore the chance of a curl timeout).
# If www.ensembl.org keeps timing out, biomaRt::useEnsembl() with its `mirror`
# argument is an alternative way to connect.
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")

# Cache of transcript lengths fetched so far, so that gene IDs shared between
# tissue files are only requested from BioMart once.
length_lookup <- data.frame(
    transcript_length = integer(0),
    ensembl_gene_id = character(0),
    stringsAsFactors = FALSE
)
# IDs already sent to BioMart. Tracked separately from length_lookup because
# an ID that BioMart does not know returns no rows and would otherwise be
# re-queried for every file.
queried_ids <- character(0)

tissue_tables <- vector("list", length(data_files))
for (i in seq_along(data_files)) {
    file <- data_files[[i]]
    tmp_df <- read.table(file, header = TRUE, skip = 2, sep = "\t", stringsAsFactors = FALSE)
    stopifnot(all(c("Name", "Description") %in% colnames(tmp_df)))

    if (selected_genes) {
        tmp_df <- tmp_df[tmp_df$Description %in% genes, ]
    }

    # NOTE(review): some GTEx per-tissue exports appear to carry an extra
    # leading `id` column - confirm for these files. Dropping it (a no-op when
    # absent) keeps it from being pivoted as if it were a sample.
    tmp_df[["id"]] <- NULL

    # Rename by column name rather than by position:
    # Name -> Ensembl (gene ID), Description -> geneID (gene symbol).
    colnames(tmp_df)[colnames(tmp_df) == "Name"] <- "Ensembl"
    colnames(tmp_df)[colnames(tmp_df) == "Description"] <- "geneID"

    # Trim the version suffix (everything from the first dot) from the gene ID;
    # the `ensembl_gene_id` BioMart filter matches unversioned stable IDs.
    tmp_df$Ensembl <- sub("\\..*$", "", tmp_df$Ensembl)

    # Look up transcript lengths only for IDs not requested before.
    new_ids <- setdiff(tmp_df$Ensembl, queried_ids)
    if (length(new_ids) > 0) {
        fetched <- getBM(
            mart = mart,
            attributes = c("transcript_length", "ensembl_gene_id"),
            filters = "ensembl_gene_id",
            values = new_ids,
            uniqueRows = TRUE
        )
        length_lookup <- rbind(length_lookup, fetched)
        queried_ids <- c(queried_ids, new_ids)
    }

    # Keep every expression row; genes without a BioMart hit get NA.
    # NOTE(review): the lookup holds one row per distinct transcript length of
    # a gene, so a gene with several transcripts is repeated once per length
    # here. Confirm this fan-out is intended, or collapse the lookup to one
    # length per gene before merging.
    tmp_df <- merge(
        tmp_df,
        length_lookup,
        by.x = "Ensembl",
        by.y = "ensembl_gene_id",
        all.x = TRUE
    )

    # Tissue label = file name without the "gene_reads_v10_" prefix and the
    # ".gct" (optionally ".gct.gz") extension. Anchored patterns with escaped
    # dots avoid accidental matches elsewhere in the name.
    tissue_name <- sub("^gene_reads_v10_", "", basename(file))
    tissue_name <- sub("\\.gct(\\.gz)?$", "", tissue_name)

    # make the df tidy long format
    tissue_tables[[i]] <- tmp_df %>%
        pivot_longer(
            cols = -c(Ensembl, geneID, transcript_length),
            names_to = "Sample",
            values_to = "expression"
        ) %>%
        mutate(tissue = tissue_name)
}

# Bind once at the end instead of growing rna_df with rbind() on every pass.
rna_df <- bind_rows(tissue_tables)
head(rna_df)

ERROR: Error in curl::curl_fetch_memory(url, handle = handle): Timeout was reached: [www.ensembl.org:443] Operation timed out after 10004 milliseconds with 0 bytes received


In [None]:
# Pick the output path from the selected_genes switch, then write the
# compiled GTEx table there as a tab-separated file without row names.
all_gtex_tissue_data <- if (selected_genes) {
    file.path("../data/genes_of_interest_gtex_tissue_data.tsv")
} else {
    file.path("../data/all_gtex_tissue_data.tsv")
}

write.table(
    rna_df,
    file = all_gtex_tissue_data,
    sep = "\t",
    row.names = FALSE
)