# Hypoxia gene scoring

We followed the gene scoring method of Bhandari et al., 2020. For every gene in a hypoxia gene signature, the median mRNA expression level across our bulk RNA-seq samples was computed. Samples with an mRNA expression level greater than the gene’s median were assigned a score of +1 for that gene, and those with an expression level below the median a score of -1. For each of the three signatures, the sum of these individual gene scores across the signature’s genes was the sample’s resulting hypoxia score.

In [None]:
library(dplyr)
library(data.table)
library(gdata)
library(tidyr)
library(biomaRt)
library(ggplot2)
library(ggpubr)
library(rstatix)

In [None]:
# Load the normalized gene expression table for the samples.
rnaseq <- read.csv(
  "TMM_Voom_Normalized_Gene_expression_salmon_20230828.csv",
  header = TRUE
)

# The `X` column holds the gene identifiers: promote it to row names, then
# drop the first column so only the per-sample expression columns remain.
rownames(rnaseq) <- rnaseq$X
rnaseq <- rnaseq[, -1]

# The row names are the Ensembl IDs that get mapped to gene symbols later on.
ensembl_ids <- row.names(rnaseq)
head(rnaseq)

In [None]:
# Load the Buffa hypoxia signature: a header-less text file whose first
# column (auto-named V1 by read.table) is kept as a plain vector.
buffa_gene_signatures <- read.table(
  file = "buffa_hypoxia_gene_signatures.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)[["V1"]]

print(buffa_gene_signatures)

In [None]:
# Translate the Ensembl gene IDs of the expression table into gene names,
# which is what the signature file is written in.
ensembl <- useMart(biomart = "ensembl")
dataset <- useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)

# Query BioMart for the (Ensembl ID, gene name) pairs of our IDs only.
ensemble_genenames <- getBM(
  mart = dataset,
  filters = "ensembl_gene_id",
  values = ensembl_ids,
  attributes = c("ensembl_gene_id", "external_gene_name")
)


In [None]:
# Preview the lookup table: one row per Ensembl ID with its matching gene name.
head(x = ensemble_genenames, n = 10)

In [None]:
# Pre-allocate the score table: one row per sample, one column per signature
# gene, every cell NA until the gene has been scored.
buffa_hypoxia_scoring_df <- data.frame(
  matrix(NA, nrow = ncol(rnaseq), ncol = length(buffa_gene_signatures))
)
# Label rows with the sample names and columns with the signature genes.
dimnames(buffa_hypoxia_scoring_df) <- list(names(rnaseq), buffa_gene_signatures)

head(buffa_hypoxia_scoring_df[, 1:5], n = 10)

In [None]:
# Score each Buffa signature gene in every sample (BUFFA SIGNATURES ONLY).
# A sample gets +1 when its expression is at or above the gene's median across
# all samples (ties count as +1) and -1 when it is below. Genes that cannot be
# scored keep their all-NA column so they can be identified afterwards.
for (signature in buffa_gene_signatures) {
  # Ensembl IDs annotated with this gene name. %in% (unlike ==) never returns
  # NA, so entries without a gene name cannot leak NA into the row index.
  ensembl_id <- ensemble_genenames$ensembl_gene_id[
    ensemble_genenames$external_gene_name %in% signature
  ]
  # Keep only IDs that are actual rows of the expression table.
  ensembl_id <- intersect(ensembl_id, rownames(rnaseq))

  # Edge case: the gene is not part of the RNA-seq data, so skip it.
  if (length(ensembl_id) == 0) {
    next
  }

  # One gene name can map to several Ensembl IDs. Comparing several rows at
  # once would hand `if` a condition of length > 1 (an error since R 4.2), so
  # score the first matching ID only and report the choice.
  if (length(ensembl_id) > 1) {
    message(
      "Gene ", signature, " maps to ", length(ensembl_id),
      " Ensembl IDs; scoring with ", ensembl_id[1]
    )
    ensembl_id <- ensembl_id[1]
  }

  # Expression of this gene in every sample, in column order of rnaseq, which
  # is also the row order of buffa_hypoxia_scoring_df.
  gene_expression <- unlist(rnaseq[ensembl_id, , drop = FALSE], use.names = FALSE)

  # Median mRNA abundance of the gene across all samples.
  median_abundance <- median(gene_expression)

  # Vectorized scoring of all samples at once.
  buffa_hypoxia_scoring_df[[signature]] <- ifelse(
    gene_expression >= median_abundance, 1, -1
  )
}

head(buffa_hypoxia_scoring_df, n = 10)
print(dim(buffa_hypoxia_scoring_df))

# Flag the genes that could not be scored: their column still contains NA.
na_columns <- vapply(buffa_hypoxia_scoring_df, anyNA, logical(1))
names_with_missing <- names(na_columns)[na_columns]
names_with_missing

# Remove the unscored genes. drop = FALSE keeps the result a data frame (with
# its sample row names) even when only a single gene survives; without it the
# subset collapses to a bare vector and rowSums() fails.
buffa_hypoxia_scoring_filtered <- buffa_hypoxia_scoring_df[, !na_columns, drop = FALSE]
head(buffa_hypoxia_scoring_filtered, n = 10)

# Hypoxia score of a sample = sum of its +1/-1 gene scores.
row_sums <- rowSums(buffa_hypoxia_scoring_filtered)

# Put the hypoxia score in front of the per-gene scores as the first column.
buffa_hypoxia_scoring <- cbind("HypoxiaScoreBuffa" = row_sums, buffa_hypoxia_scoring_filtered)
head(buffa_hypoxia_scoring, n = 10)

In [None]:
# Load the sample metadata table.
metadata <- read.csv(
  "Metadata_gene_expression_112samples.csv",
  header = TRUE
)

# The `X` column holds the sample identifiers: promote it to row names, then
# drop the first column from the table.
rownames(metadata) <- metadata$X
metadata <- metadata[, -1]

# Sanity check: prints TRUE only when the scored samples and the metadata rows
# carry the same names in the same order.
print(identical(row.names(buffa_hypoxia_scoring), row.names(metadata)))

In [None]:
# Add the hypoxia score column to the metadata table.
# Align the scores to the metadata rows by sample name rather than by position,
# so a different sample order in the two tables cannot mis-assign scores.
# match() is exact (row-name indexing with `[` would allow partial matches).
score_order <- match(rownames(metadata), rownames(buffa_hypoxia_scoring))

if (anyNA(score_order)) {
  # The sample names of the two tables do not line up, so name-based alignment
  # is impossible. Fall back to positional assignment, loudly.
  stopifnot(nrow(metadata) == nrow(buffa_hypoxia_scoring))
  warning(
    "Sample names of metadata and hypoxia scores differ; ",
    "assigning HypoxiaScoreBuffa by position.",
    call. = FALSE
  )
  score_order <- seq_len(nrow(metadata))
}

metadata$HypoxiaScoreBuffa <- buffa_hypoxia_scoring$HypoxiaScoreBuffa[score_order]

# Move the score to the first column, keeping the other columns in order.
metadata <- metadata[, c("HypoxiaScoreBuffa", setdiff(names(metadata), "HypoxiaScoreBuffa"))]
head(metadata, n = 20)

In [None]:
# Save the metadata, now including the hypoxia score, as CSV; the sample names
# are written out as the row-name column.
write.csv(
  x = metadata,
  file = "Metadata_gene_expression_112samples_hypoxia.csv",
  row.names = TRUE
)