In [6]:
Output = ('/Users/alexis/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/CEMALB_DataAnalysisPM/Projects/P1011. Emission Mixtures/P1011.3. Analyses/P1011.3.1. Biomarker Distribution Analysis/Output')
cur_date = "042523"

library(readxl)
library(tidyverse)
library(reshape2)
library(stats)

# reading in files
proteomics_5_df = data.frame(read_excel("Input/Proteomics_Data_5ug_040423.xlsx", sheet = 2))
proteomics_25_df = data.frame(read_excel("Input/Proteomics_Data_25ug_040423.xlsx", sheet = 2))
demographics_df = data.frame(read_excel("Input/Subject_Info_031723.xlsx", sheet = 2))

Testing for statistical differences by comparing a protein expression between burn conditions (control, smoldering or flaming). (The condensate and concentration would remain consistent). Protein expression (intensity) was pseudo log transformed to reduce skew.

A non-parametric test, a Quade's ANCOVA, will be performed to control for sex as a covariate.

In [7]:
head(proteomics_25_df)
head(demographics_df)

Unnamed: 0_level_0,Subject_No,Subject_ID,Protein_Accession,Gene_Name,Description,Condensate,Burn_Condition,Concentration,Intensity,Intensity_pslog2
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
1,1,F_1,CO3_HUMAN,C3,Complement C3 OS=Homo sapiens OX=9606 GN=C3 PE=1 SV=2,C,F,25,1488600000,30.47131
2,2,M_2,CO3_HUMAN,C3,Complement C3 OS=Homo sapiens OX=9606 GN=C3 PE=1 SV=2,C,F,25,3806600000,31.82586
3,3,M_3,CO3_HUMAN,C3,Complement C3 OS=Homo sapiens OX=9606 GN=C3 PE=1 SV=2,C,F,25,3900000000,31.86083
4,4,F_4,CO3_HUMAN,C3,Complement C3 OS=Homo sapiens OX=9606 GN=C3 PE=1 SV=2,C,F,25,5373200000,32.32313
5,5,F_5,CO3_HUMAN,C3,Complement C3 OS=Homo sapiens OX=9606 GN=C3 PE=1 SV=2,C,F,25,2595200000,31.2732
6,6,M_6,CO3_HUMAN,C3,Complement C3 OS=Homo sapiens OX=9606 GN=C3 PE=1 SV=2,C,F,25,5780700000,32.4286


Unnamed: 0_level_0,Original_Subject_ID,Subject_ID,Subject_No,Sex,Age,Race,Ethnicity
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>
1,68J,F_1,1,F,18,B,NH
2,53K,M_2,2,M,32,B,NH
3,21O,M_3,3,M,34,W,H
4,63O,F_4,4,F,32,W,H
5,67L,F_5,5,F,17,W,H
6,57N,M_6,6,M,59,W,NH


In [8]:
# combining data, but first making sex a factor
demographics_df = demographics_df %>%
    mutate(Sex = relevel(factor(ifelse(Sex == "M", 1, 0)), ref = "0"))

proteomics_25_sex_df = inner_join(proteomics_25_df[,-c(4:5)], demographics_df[,c(2,4)]) %>%
    # scaling the concentration values by protein normalizes the distribution
    group_by(Protein_Accession) %>%
    mutate(Scaled_Intensity = scale(Intensity_pslog2))
proteomics_5_sex_df = inner_join(proteomics_5_df[,-c(4:5)], demographics_df[,c(2,4)]) %>%
    # scaling the concentration values by protein normalizes the distribution
    group_by(Protein_Accession) %>%
    mutate(Scaled_Intensity = scale(Intensity_pslog2))

# putting burn condition into a factor
proteomics_25_sex_df$Burn_Condition = factor(proteomics_25_sex_df$Burn_Condition, levels = c("PBS", "S", "F"))
proteomics_5_sex_df$Burn_Condition = factor(proteomics_5_sex_df$Burn_Condition, levels = c("PBS", "S", "F"))

head(proteomics_25_sex_df)

[1m[22mJoining, by = "Subject_ID"
[1m[22mJoining, by = "Subject_ID"


Subject_No,Subject_ID,Protein_Accession,Condensate,Burn_Condition,Concentration,Intensity,Intensity_pslog2,Sex,Scaled_Intensity
<dbl>,<chr>,<chr>,<chr>,<fct>,<dbl>,<dbl>,<dbl>,<fct>,"<dbl[,1]>"
1,F_1,CO3_HUMAN,C,F,25,1488600000,30.47131,0,-1.29273172
2,M_2,CO3_HUMAN,C,F,25,3806600000,31.82586,1,-0.04856119
3,M_3,CO3_HUMAN,C,F,25,3900000000,31.86083,1,-0.0164397
4,F_4,CO3_HUMAN,C,F,25,5373200000,32.32313,0,0.40819624
5,F_5,CO3_HUMAN,C,F,25,2595200000,31.2732,0,-0.55618474
6,M_6,CO3_HUMAN,C,F,25,5780700000,32.4286,1,0.5050652


In [4]:
# contrasts show what compartments are being compared in anova
# these comparisons aren't what we want so I changed them in the function below
contrasts(proteomics_25_sex_df$Burn_Condition)

Unnamed: 0,S,F
PBS,0,0
S,1,0
F,0,1


               Table of Contrasts 
>               PBS |  S  |  F  |  Sum
>  - Contrast 1 |  -1  |  1  |  0  |   0
>  - Contrast 2 |  -1  |  0  |  1  |   0

> - Contrast 1: compares PBS to smoldering
> - Contrast 2: compares PBS to flaming

In [None]:
get_anova = function(df){
    # first filtering the df and iterating through the condensate
    condensates = unique(df$Condensate)
    for (i in 1:length(condensates)){
        filtered_df = df %>%
            filter(Condensate == condensates[i])
        
        contrasts(filtered_df$Burn_Condition) = cbind(c(-1,1,0),c(-1,0,1)) # meaning is specified above
        anova = aov(Scaled_Intensity ~ Burn_Condition, data = filtered_df)
        
        # Tukey's post hoc test
        tukeys = summary(glht(anova, linfct = mcp(Group = "Tukey")), test = adjusted("none"))
        print(tukeys$stat[3:2])
    }
    
    return(anova)
}

# calling fn


In [5]:
quade_test_v1 = function(df){
    # """
    # Running quade test after filtering for a protein and condensate using a loop and controlling for sex. 
    # Ultimately using this test to compare protein expression (control vs. flaming burn condition).

    # :param: protein dataframe
    # :output: a dataframe containing the protein, condensate, comparison, stat, p value, p adj

    # """
    
    # variables that will be iterated through
    proteins = unique(df$Protein_Accession)
    condensates = c("C", "P")
    
    values_df = data.frame()
     # iterating through each protein, condensate, and burn condition
    for(i in 1:length(proteins)){
        for(j in 1:length(condensates)){
                
            # control df
            control_df = df %>%
                filter(Protein_Accession == proteins[i], Condensate == "PBS")

            # flaming df
            flaming_df = df %>%
                filter(Protein_Accession == proteins[i], Condensate == condensates[j], Burn_Condition == "F")

            # quade's test
            quade_test = quade.test(Intensity_pslog2 ~ )

            # contains gene, condensate, burn conditions, conc, stat, and p value
            values_vector = cbind(proteins[i], condensates[j], "Control vs. Flaming",
                                  unique(df
statistic, quade_test$p.value)
            values_df = rbind(values_df, values_vector)
        }
    }

    
    # adding col names
    colnames(values_df) = c("Protein", "Condensate", "Comparison", "Concentration", "Statistic", "P Value")
    
    # calculating padj values
    values_df$`P Adj` = p.adjust(as.numeric(as.character(values_df$`P Value`)), method = "fdr")

    
    return(values_df)
}

ERROR: Error in parse(text = x, srcfile = src): <text>:29:56: unexpected ')'
28:             # quade's test
29:             quade_test = quade.test(Intensity_pslog2 ~ )
                                                           ^
