In [1]:
setwd('/Users/alexis/IEHS Dropbox/Rager Lab/Alexis_Payton/4_Compartment_Analysis_2021/Expt1.1_Cluster Distribution Analyses/Input')
Output = ('/Users/alexis/IEHS Dropbox/Rager Lab/Alexis_Payton/4_Compartment_Analysis_2021/Expt1.1_Cluster Distribution Analyses/Output')
cur_date = "041221"

library(readxl)
library(data.table)
library(ggplot2)
library(factoextra)
library(janitor)
library(dplyr)
library(tidyverse)
library(gridExtra)
library(cluster)
library(vegan)
library(fpc)
library(ggdendro)

#reading in file
cytokines <- data.frame(read_excel("CytokineData_032521.xlsx", sheet = 2))
subjects = data.frame(read_excel("SubjectInfo_032521.xlsx", sheet = 2))

#cluster assignments 
NELF_clus <- data.frame(read_excel("041221cytokines_cluster_assignments.xlsx"))

Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa


Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test



Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mtibble [39m 3.0.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖

Assigning the same baseline clusters to e-cig and cigarette smokers and running wilcoxon rank sum tests to see if they differ.

# Eigencytokines

In [2]:
get_eigencytokines = function(smoking_status){
    # Separating the cytokine data into compartment dfs
    cytokines <- cytokines %>% 
        filter(Group == smoking_status) %>% #only want non-smokers for baseline analysis
        #background filter eliminating any cytokines that are not expressed
        mutate(ifelse(Conc_pslog2 == 0, NA, Conc_pslog2))

    # reshaping data 
    cytokines <- reshape2::dcast(cytokines, SubjectID ~ Protein, value.var="Conc_pslog2") %>% 
      column_to_rownames("SubjectID") 

    # the scale function operates across columns 
    cytokines_scaled <- cytokines %>% 
      scale() %>% 
      as.data.frame()

    #transpose cytokine data for each compartment
    cytokines <- as.data.frame(t(cytokines))

    #renaming first column, grouping and splitting by "Cluster" column
    NELF_clus <- NELF_clus %>% 
      group_by(Cluster) %>% 
      group_split

    #making dfs for each cluster for PCA analysis 
    NELF_1 <- NELF_clus[[1]]
    NELF_2 <- NELF_clus[[2]]
    NELF_3 <- NELF_clus[[3]]

    #making df with subjects' cytokine concentration data for each cluster 
    NELF_1 <- cytokines %>% 
      rownames_to_column("Cytokine") %>% 
      filter(Cytokine %in% NELF_1$Cytokine) %>% 
      column_to_rownames(var="Cytokine")
    NELF_2 <- cytokines %>% 
      rownames_to_column("Cytokine") %>% 
      filter(Cytokine %in% NELF_2$Cytokine) %>% 
      column_to_rownames(var="Cytokine")
    NELF_3 <- cytokines %>% 
      rownames_to_column("Cytokine") %>% 
      filter(Cytokine %in% NELF_3$Cytokine) %>% 
      column_to_rownames(var="Cytokine")
  
    #PCA on each cluster, eigenvectors are in rotation -- PROBLEM - for some reason had to convert everything to numeric  
    pca_NELF_1 <- NELF_1 %>% 
      lapply(as.numeric) %>% 
      as.data.frame() %>% 
      prcomp()
    pca_NELF_2 <- NELF_2 %>% 
      lapply(as.numeric) %>% 
      as.data.frame() %>% 
      prcomp()
    pca_NELF_3 <- NELF_3 %>% 
      lapply(as.numeric) %>% 
      as.data.frame() %>%   
      prcomp()

    #eigenvector dfs of first principal component 
    eigencytokines_NELF_1 <- data.frame(pca_NELF_1$rotation[,"PC1"])
      colnames(eigencytokines_NELF_1)[1] <- "ClusterA"
    eigencytokines_NELF_2 <- data.frame(pca_NELF_2$rotation[,"PC1"])
      colnames(eigencytokines_NELF_2)[1] <- "ClusterB"
    eigencytokines_NELF_3 <- data.frame(pca_NELF_3$rotation[,"PC1"])
      colnames(eigencytokines_NELF_3)[1] <- "ClusterC"


    #collapse all eigencytokine dfs
    eigencytokines_NELF <- cbind(eigencytokines_NELF_1, eigencytokines_NELF_2, eigencytokines_NELF_3)

    return(eigencytokines_NELF)
}

#calling fn
eigencytokines_NELF_NS = get_eigencytokines("NS")
eigencytokines_NELF_Ecig = get_eigencytokines("Ecig")
eigencytokines_NELF_CS = get_eigencytokines("CS")

In [None]:
#exporting cytokines
eigencytokines_NELF = rbind(eigencytokines_NELF_NS, eigencytokines_NELF_CS, eigencytokines_NELF_Ecig)
write.csv(eigencytokines_NELF, paste0(Output,"/", cur_date, "_NELF_eigencytokines.csv"), row.names = TRUE)

# Wilcoxon Rank Sum tests 
Comparing eigencytokines of non-smokers to smokers.

In [3]:
#converting subject ids to col, melting, and adding compartment
changed_df = function(df, compartment_name){
    df = reshape2::melt(df %>%
        rownames_to_column(var = "SubjectID"), variable = "Cluster",  value.name = 'Conc_pslog2')
    return(df)
}

#calling fn
NS_eigencytokines_NELF = changed_df(eigencytokines_NELF_NS)
Ecig_eigencytokines_NELF = changed_df(eigencytokines_NELF_Ecig)
CS_eigencytokines_NELF = changed_df(eigencytokines_NELF_CS)

Using SubjectID as id variables

Using SubjectID as id variables

Using SubjectID as id variables



In [4]:
#creating vectors to loop through
cluster = unique(NS_eigencytokines_NELF$Cluster)

#initializing vectors to store values
CS_vector = c()
Ecig_vector = c()
wilcoxon_rank_sum_values = function(df1, df2, empty_vector){
    #running wilcoxon rank sum and storing the statistic, cluster, and p value in a vector
    for (j in 1:length(cluster)){
        variable1_df = df1 %>% # baseline df
            filter(Cluster == cluster[j]) %>%
            select(Cluster, Conc_pslog2)
        variable2_df = df2 %>% # smoker df
            filter(Cluster == cluster[j]) %>%
            select(Cluster, Conc_pslog2)


        #running wilcoxon rank sum
        wilcox_test = wilcox.test(variable1_df$Conc_pslog2, variable2_df$Conc_pslog2)

        #calculating absolute difference
        AD = (mean(variable2_df$Conc_pslog2) - mean(variable1_df$Conc_pslog2))#/abs(mean(variable1_df$Conc_pslog2))

        #contains cluster, AD, u stat, p value
        values_vector = c(as.character(cluster[j]), AD, wilcox_test$statistic[[1]], wilcox_test$p.value)
        empty_vector = rbind(empty_vector, values_vector)
        
    }
    return(empty_vector)
}

#calling fn
CS_wilcoxon_values = wilcoxon_rank_sum_values(NS_eigencytokines_NELF, CS_eigencytokines_NELF, CS_vector)
Ecig_wilcoxon_values = wilcoxon_rank_sum_values(NS_eigencytokines_NELF, Ecig_eigencytokines_NELF, Ecig_vector)

In [5]:
final_table = function(matrix){
    #"""
    #Reformatting each matrix into matrices to export. 
    
    #:param: matrix, demographic variable
    #:output: a 6x12 matrix containing compartment, cluster, protein, u stat, p value, p adj

    #"""
    df = data.frame(matrix)
    colnames(df) = c('Cluster', 'AD','Stat', 'P Value')
    
    #going back to calculate padj values
    df$`P Adj` = p.adjust(as.numeric(df$`P Value`), method = "fdr")

    return(df)
}

#calling fn
CS_table = final_table(CS_wilcoxon_values)
CS_table
Ecig_table = final_table(Ecig_wilcoxon_values)
Ecig_table

Unnamed: 0_level_0,Cluster,AD,Stat,P Value,P Adj
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<dbl>
values_vector,ClusterA,-0.0419155253403112,174,0.244760622391352,1
values_vector.1,ClusterB,0.0177247567860667,117,0.436265098719687,1
values_vector.2,ClusterC,-0.0460326755223191,177,0.204489281148627,1


Unnamed: 0_level_0,Cluster,AD,Stat,P Value,P Adj
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<dbl>
values_vector,ClusterA,-0.0304817328054577,154,0.641023611591364,1
values_vector.1,ClusterB,-0.029727648572821,153,0.6659827064215,1
values_vector.2,ClusterC,-0.0331865582775889,139,0.986235536420738,1


In [6]:
#exporting dfs
write.csv(CS_table, paste0(Output,"/", cur_date, "_CS_Distribution_Analysis.csv"), row.names = FALSE)
write.csv(Ecig_table, paste0(Output,"/", cur_date, "_Ecig_Distribution_Analysis.csv"), row.names = FALSE)