In [1]:
Output = ('/Users/alexis/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/CEMALB_DataAnalysisPM/Projects/P1011. Emission Mixtures/P1011.3. Analyses/P1011.3.2. Biomarker Distribution Analysis/Output')
cur_date = "042723"

library(readxl)
library(tidyverse)
library(reshape2)

# reading in files
mRNA_df = data.frame(read_excel("Input/Imputed_mRNA_Data_042623.xlsx"))

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.0      [32m✔[39m [34mpurrr  [39m 0.3.4 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.2      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘reshape2’


The following object is masked from ‘package:tidyr’:

    smiths




In [2]:
head(mRNA_df)

Unnamed: 0_level_0,Subject_ID,Condensate,Burn_Condition,Concentration,Time_Point,mRNA,ddCT_pslog2
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>
1,M_6,PBS,PBS,,24,HMOX1,4.871886
2,M_6,PBS,PBS,,24,ALDH3A1,4.837012
3,M_6,PBS,PBS,,24,CXCL1,4.822372
4,M_6,PBS,PBS,,24,CXCR1,3.940873
5,M_6,PBS,PBS,,24,GCLC,4.863186
6,M_6,PBS,PBS,,24,GCLM,4.834127


Testing for statistical differences by comparing a mRNA expression between burn conditions (control, smoldering or flaming). (The condensate, concentration, and time point would remain consistent). mRNA expression (ddCT) was pseudo log transformed to reduce skew. The largest negative value (ie. -29) was added to each number and then log2 was taken.

Typically, normality and homogeneity of variances would be tested for first, however a non-parametric test (Wilcoxon rank sum) will be used given the small sample size (*n* = 6). 

In [3]:
# the mRNA df doesn't consistently test all 3 concentrations (1,5, or 25 micrograms) or 
# time points(4,24,72) therefore we can't subset the df using a loop so we'll create separate dfs
# for each concentration and time point
split_mRNA_df = mRNA_df %>%
    group_by(Concentration, Time_Point) %>%
    group_split()

conc1_24_mRNA_df = split_mRNA_df[[1]]
conc1_72_mRNA_df = split_mRNA_df[[2]]
conc25_4_mRNA_df = split_mRNA_df[[3]]
conc25_24_mRNA_df = split_mRNA_df[[4]]
conc25_72_mRNA_df = split_mRNA_df[[5]]
conc5_24_mRNA_df = split_mRNA_df[[6]]
conc5_72_mRNA_df = split_mRNA_df[[7]]
concc_4_mRNA_df = split_mRNA_df[[8]]
concc_24_mRNA_df = split_mRNA_df[[9]]
concc_72_mRNA_df = split_mRNA_df[[10]]

In [4]:
# control samples have a concentration of NA, so they're all in a separate df
# adding them back into the other dataframes
conc1_24_mRNA_df = unique(rbind(conc1_24_mRNA_df, concc_24_mRNA_df))
conc1_72_mRNA_df = unique(rbind(conc1_72_mRNA_df, concc_72_mRNA_df))
conc25_4_mRNA_df = unique(rbind(conc25_4_mRNA_df, concc_4_mRNA_df))
conc25_24_mRNA_df = unique(rbind(conc25_24_mRNA_df, concc_24_mRNA_df))
conc25_72_mRNA_df = unique(rbind(conc25_72_mRNA_df, concc_72_mRNA_df))
conc5_24_mRNA_df = unique(rbind(conc5_24_mRNA_df, concc_24_mRNA_df))
conc5_72_mRNA_df = unique(rbind(conc5_72_mRNA_df, concc_72_mRNA_df))

head(conc1_24_mRNA_df)

Subject_ID,Condensate,Burn_Condition,Concentration,Time_Point,mRNA,ddCT_pslog2
<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>
M_6,P,F,1,24,HMOX1,4.774776
M_6,P,F,1,24,ALDH3A1,4.762372
M_6,P,F,1,24,CXCL1,4.768515
M_6,P,F,1,24,CXCR1,3.814665
M_6,P,F,1,24,GCLC,4.792038
M_6,P,F,1,24,GCLM,4.814864


In [5]:
wilcoxon_rank_sumv1 = function(df){
    # """
    # Running wilcoxon rank sums after filtering for gene and condensate using a loop. 
    # Ultimately using this test to compare gene expression (control vs. flaming burn condition).

    # :param: subsetted dataframe, empty dataframe
    # :output: a dataframe containing the gene, condensate, comparison, conc, time point, stat, p value, p adj

    # """
    
    # variables that will be iterated through
    genes = unique(df$mRNA)
    condensates = c("C", "P")
    
    values_df = data.frame()
     # iterating through each gene, condensate, and burn condition
    for(i in 1:length(genes)){
        for(j in 1:length(condensates)){
                
            # control df
            control_df = df %>%
                filter(mRNA == genes[i], Condensate == "PBS")

            # flaming df
            flaming_df = df %>%
                filter(mRNA == genes[i], Condensate == condensates[j] , Burn_Condition == "F")

            # wilcoxon rank sum
            # this would be a paired test however some subjects are in one group but not the other
            wilcoxon_test = wilcox.test(control_df$ddCT_pslog2, flaming_df$ddCT_pslog2, paired = TRUE)

            # contains gene, condensate, burn conditions, conc, time point, stat, and p value
            values_vector = cbind(genes[i], condensates[j], "Control vs. Flaming",
                                  unique(df$Concentration)[1], unique(df$Time_Point), 
                                  wilcoxon_test$statistic, wilcoxon_test$p.value)
            values_df = rbind(values_df, values_vector)
        }
    }
    
   # adding col names
    colnames(values_df) = c("mRNA", "Condensate", "Comparison", "Concentration", "Time Point",
                            "Statistic", "P Value")
    
       # calculating padj values
    PAdj = c()
    for(j in 1:length(condensates)){
        filtered_df = values_df %>%
            filter(Condensate == condensates[j])
        padj = p.adjust(as.numeric(as.character(filtered_df$`P Value`)), method = "fdr")
        PAdj = c(PAdj, padj)

    }
    
    values_df$`P Adj` = PAdj
    return(values_df)
}

In [6]:
# calling fn
conc1_24_wilcox_values_v1 = wilcoxon_rank_sumv1(conc1_24_mRNA_df)
conc1_72_wilcox_values_v1 = wilcoxon_rank_sumv1(conc1_72_mRNA_df)
conc25_4_wilcox_values_v1 = wilcoxon_rank_sumv1(conc25_4_mRNA_df)
conc25_24_wilcox_values_v1 = wilcoxon_rank_sumv1(conc25_24_mRNA_df)
conc25_72_wilcox_values_v1 = wilcoxon_rank_sumv1(conc25_72_mRNA_df)
conc5_24_wilcox_values_v1 = wilcoxon_rank_sumv1(conc5_24_mRNA_df)
conc5_72_wilcox_values_v1 = wilcoxon_rank_sumv1(conc5_72_mRNA_df)

In [7]:
# creating 1 df
smoldering_flaming_df = rbind(conc1_24_wilcox_values_v1, conc1_72_wilcox_values_v1, conc25_4_wilcox_values_v1,
                             conc25_24_wilcox_values_v1, conc25_72_wilcox_values_v1, conc5_24_wilcox_values_v1,
                             conc5_72_wilcox_values_v1)

In [8]:
wilcoxon_rank_sumv2 = function(df){
    # """
    # Running wilcoxon rank sums after filtering for gene and condensate using a loop. 
    # Ultimately using this test to compare gene expression (smoldering vs. flaming burn condition).

    # :param: subsetted dataframe, empty dataframe
    # :output: a dataframe containing the gene, condensate, comparison, conc, time point, stat, p value, p adj

    # """
    
    # variables that will be iterated through
    genes = unique(df$mRNA)
    condensates = c("C", "P")
    
    values_df = data.frame()
     # iterating through each gene, condensate, and burn condition
    for(i in 1:length(genes)){
        for(j in 1:length(condensates)){
                
            # control df
            control_df = df %>%
                filter(mRNA == genes[i], Condensate == condensates[j], Burn_Condition == "S")

            # flaming df
            flaming_df = df %>%
                filter(mRNA == genes[i], Condensate == condensates[j] , Burn_Condition == "F")

            # wilcoxon rank sum
            # this would be a paired test however some subjects are in one group but not the other
            wilcoxon_test = wilcox.test(control_df$ddCT_pslog2, flaming_df$ddCT_pslog2, paired = TRUE)

            # contains gene, condensate, burn conditions, conc, time point, stat, and p value
            values_vector = cbind(genes[i], condensates[j], "Smoldering vs. Flaming",
                                  unique(df$Concentration)[1], unique(df$Time_Point), 
                                  wilcoxon_test$statistic, wilcoxon_test$p.value)
            values_df = rbind(values_df, values_vector)
            
        }
    }

    # adding col names
    colnames(values_df) = c("mRNA", "Condensate", "Comparison", "Concentration", "Time Point",
                            "Statistic", "P Value")
    
    # calculating padj values
    PAdj = c()
    for(j in 1:length(condensates)){
        filtered_df = values_df %>%
            filter(Condensate == condensates[j])
        padj = p.adjust(as.numeric(as.character(filtered_df$`P Value`)), method = "fdr")
        PAdj = c(PAdj, padj)

    }
    
    values_df$`P Adj` = PAdj
    return(values_df)
}

In [9]:
# calling fn
conc1_24_wilcox_values_v2 = wilcoxon_rank_sumv2(conc1_24_mRNA_df)
conc1_72_wilcox_values_v2 = wilcoxon_rank_sumv2(conc1_72_mRNA_df)
conc25_4_wilcox_values_v2 = wilcoxon_rank_sumv2(conc25_4_mRNA_df)
conc25_24_wilcox_values_v2 = wilcoxon_rank_sumv2(conc25_24_mRNA_df)
conc25_72_wilcox_values_v2 = wilcoxon_rank_sumv2(conc25_72_mRNA_df)
conc5_24_wilcox_values_v2 = wilcoxon_rank_sumv2(conc5_24_mRNA_df)
conc5_72_wilcox_values_v2 = wilcoxon_rank_sumv2(conc5_72_mRNA_df)

In [10]:
# creating 1 df
control_flaming_df = rbind(conc1_24_wilcox_values_v2, conc1_72_wilcox_values_v2, conc25_4_wilcox_values_v2,
                             conc25_24_wilcox_values_v2, conc25_72_wilcox_values_v2, conc5_24_wilcox_values_v2,
                             conc5_72_wilcox_values_v2)

mRNA_wilcoxon_df = rbind(smoldering_flaming_df, control_flaming_df) %>%
    # changing condensates so they're more legible
    mutate(Condensate = ifelse(Condensate == "C", "Cardboard",
                               ifelse(Condensate == "P", "Plastic", "PBS")))

head(mRNA_wilcoxon_df)

Unnamed: 0_level_0,mRNA,Condensate,Comparison,Concentration,Time Point,Statistic,P Value,P Adj
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
V,HMOX1,Cardboard,Control vs. Flaming,1,24,6,0.4375,0.59375
V1,HMOX1,Plastic,Control vs. Flaming,1,24,6,0.4375,1.0
V2,ALDH3A1,Cardboard,Control vs. Flaming,1,24,10,1.0,0.6286765
V3,ALDH3A1,Plastic,Control vs. Flaming,1,24,2,0.09375,0.6286765
V4,CXCL1,Cardboard,Control vs. Flaming,1,24,7,0.5625,0.1484375
V5,CXCL1,Plastic,Control vs. Flaming,1,24,10,1.0,0.3710938


In [11]:
# exporting
write.csv(mRNA_wilcoxon_df, paste0(Output,"/", cur_date, "_mRNA_Wilcoxon_Results.csv"), row.names = FALSE)