In [1]:
setwd('/Users/alexis/IEHS Dropbox/Rager Lab/Alexis_Payton/Experiments/1. Compartment Analysis/1.5. Cluster Distribution Analyses/1.5.1. Wilcoxon Rank Sum/1.5.1.2. Cluster Demographics Distribution Comparison/Input')
Output = '/Users/alexis/IEHS Dropbox/Rager Lab/Alexis_Payton/Experiments/1. Compartment Analysis/1.5. Cluster Distribution Analyses/1.5.1. Wilcoxon Rank Sum/1.5.1.2. Cluster Demographics Distribution Comparison/Output'
curdate = "120821"

library(readxl)
library(reshape2)
library(tidyverse)
library(janitor)

#reading in file
data_df <- data.frame(read_excel("CytokineData_102920.xlsx", sheet = 2))

#reading in demographics file
demographics_data_df = data.frame(read_excel("SubjectInfo_102920.xlsx", sheet = 2))

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.6     [32m✔[39m [34mdplyr  [39m 1.0.4
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test




This is an analysis to determine how if sex plays a role in modifying responses to e-cig or cigarette smoke exposure based on clusters derived amongst all subjects (for consistency).

In [2]:
cytokines = inner_join(data_df, demographics_data_df) %>%
    select(!c(Serum_Cotinine, Ethnicity, Age, BMI)) %>% 
    #filter(Group == 'NS') %>% #only want non-smokers for baseline analysis
    group_by(Sex, Compartment) %>% 
    group_split

M_NLF = cytokines[[6]]
M_NELF = cytokines[[5]]
M_Sputum = cytokines[[8]]
M_Serum = cytokines[[7]]
F_NLF = cytokines[[2]]
F_NELF = cytokines[[1]]
F_Sputum = cytokines[[4]]
F_Serum = cytokines[[3]]

Joining, by = c("SubjectNo", "Group", "SubjectID")



In [3]:
# reshaping data putting subject ids as rows and reshape_data as columns
reshape_data = function(df){
    new_df = reshape2::dcast(df, SubjectID ~ Protein, value.var="Conc_pslog2") %>% 
          column_to_rownames("SubjectID") 
    
    return(new_df)
}

M_NLF = reshape_data(M_NLF)
M_NELF = reshape_data(M_NELF)
M_Sputum = reshape_data(M_Sputum)
M_Serum = reshape_data(M_Serum)
F_NLF = reshape_data(F_NLF)
F_NELF = reshape_data(F_NELF)
F_Sputum = reshape_data(F_Sputum)
F_Serum = reshape_data(F_Serum)

#background filter eliminating any cytokines that are not expressed in a compartment  
M_NLF$I309 <- NULL
M_Sputum$I309 <- NULL
F_NLF$I309 <- NULL
F_Sputum$I309 <- NULL

In [4]:
#need to first transpose to cluster cytokines
tM_NLF = data.frame(t(M_NLF))
tM_NELF = data.frame(t(M_NELF))
tM_Sputum = data.frame(t(M_Sputum))
tM_Serum = data.frame(t(M_Serum))
tF_NLF = data.frame(t(F_NLF))
tF_NELF = data.frame(t(F_NELF))
tF_Sputum = data.frame(t(F_Sputum))
tF_Serum = data.frame(t(F_Serum))

In [5]:
#obtaining clusters from kmeans to get cluster assignments
#previously decided on 3 clusters
get_cluster_assignments = function(tcytokines){
    cytokines_cluster_k3 = kmeans(tcytokines, centers = 3, nstart=25)
    cytokines_k3 <- as.data.frame(cytokines_cluster_k3$cluster) 
    colnames(cytokines_k3)[1] <- "Cluster"
    return(cytokines_k3)
}

#calling fn
M_NLF_cluster_assignments = get_cluster_assignments(tM_NLF)
M_NELF_cluster_assignments = get_cluster_assignments(tM_NELF)
M_Sputum_cluster_assignments = get_cluster_assignments(tM_Sputum)
M_Serum_cluster_assignments = get_cluster_assignments(tM_Serum)
F_NLF_cluster_assignments = get_cluster_assignments(tF_NLF)
F_NELF_cluster_assignments = get_cluster_assignments(tF_NELF)
F_Sputum_cluster_assignments = get_cluster_assignments(tF_Sputum)
F_Serum_cluster_assignments = get_cluster_assignments(tF_Serum)

This gives us cluster assignments specific to each sex, but for continuity we'll use baseline cluster assignments from all subjects.

In [6]:
#cluster assignments for each compartment 
NELF_clus <- data.frame(read_excel("011921NELF_cluster_assignments.xlsx")) 
NLF_clus <- data.frame(read_excel("011921NLF_cluster_assignments.xlsx"))
Serum_clus <- data.frame(read_excel("011921Serum_cluster_assignments.xlsx"))
Sputum_clus <- data.frame(read_excel("011921Sputum_cluster_assignments.xlsx"))

In [7]:
#splitting cluster dfs into separate dfs & df w/ cytokine conc data for each cluster
split_cluster_df = function(cluster_df){
    final_df = cluster_df %>%
      group_by(Cluster) %>% 
      group_split
    
    return(final_df)
}

#calling fn
NLF_clus_1 = split_cluster_df(NLF_clus)[[1]]
NLF_clus_2 = split_cluster_df(NLF_clus)[[2]]
NLF_clus_3 = split_cluster_df(NLF_clus)[[3]]
NELF_clus_1 = split_cluster_df(NELF_clus)[[1]]
NELF_clus_2 = split_cluster_df(NELF_clus)[[2]]
NELF_clus_3 = split_cluster_df(NELF_clus)[[3]]
Sputum_clus_1 = split_cluster_df(Sputum_clus)[[1]]
Sputum_clus_2 = split_cluster_df(Sputum_clus)[[2]]
Sputum_clus_3 = split_cluster_df(Sputum_clus)[[3]]
Serum_clus_1 = split_cluster_df(Serum_clus)[[1]]
Serum_clus_2 = split_cluster_df(Serum_clus)[[2]]
Serum_clus_3 = split_cluster_df(Serum_clus)[[3]]

In [8]:
split_cluster_df = function(reshaped_cytokine_df, clus_df){
    final_df = reshaped_cytokine_df %>% 
      rownames_to_column("Cytokine") %>% 
      filter(Cytokine %in% clus_df$Cytokine) %>% 
      column_to_rownames(var="Cytokine")
    
    return(final_df)
}
#calling fn
M_NLF_clus_1 = split_cluster_df(tM_NLF, NLF_clus_1)
M_NLF_clus_2 = split_cluster_df(tM_NLF, NLF_clus_2)
M_NLF_clus_3 = split_cluster_df(tM_NLF, NLF_clus_3)
M_NELF_clus_1 = split_cluster_df(tM_NELF, NELF_clus_1)
M_NELF_clus_2 = split_cluster_df(tM_NELF, NELF_clus_2)
M_NELF_clus_3 = split_cluster_df(tM_NELF, NELF_clus_3)
M_Sputum_clus_1 = split_cluster_df(tM_Sputum, Sputum_clus_1)
M_Sputum_clus_2 = split_cluster_df(tM_Sputum, Sputum_clus_2)
M_Sputum_clus_3 = split_cluster_df(tM_Sputum, Sputum_clus_3)
M_Serum_clus_1 = split_cluster_df(tM_Serum, Serum_clus_1)
M_Serum_clus_2 = split_cluster_df(tM_Serum, Serum_clus_2)
M_Serum_clus_3 = split_cluster_df(tM_Serum, Serum_clus_3)
F_NLF_clus_1 = split_cluster_df(tF_NLF, NLF_clus_1)
F_NLF_clus_2 = split_cluster_df(tF_NLF, NLF_clus_2)
F_NLF_clus_3 = split_cluster_df(tF_NLF, NLF_clus_3)
F_NELF_clus_1 = split_cluster_df(tF_NELF, NELF_clus_1)
F_NELF_clus_2 = split_cluster_df(tF_NELF, NELF_clus_2)
F_NELF_clus_3 = split_cluster_df(tF_NELF, NELF_clus_3)
F_Sputum_clus_1 = split_cluster_df(tF_Sputum, Sputum_clus_1)
F_Sputum_clus_2 = split_cluster_df(tF_Sputum, Sputum_clus_2)
F_Sputum_clus_3 = split_cluster_df(tF_Sputum, Sputum_clus_3)
F_Serum_clus_1 = split_cluster_df(tF_Serum, Serum_clus_1)
F_Serum_clus_2 = split_cluster_df(tF_Serum, Serum_clus_2)
F_Serum_clus_3 = split_cluster_df(tF_Serum, Serum_clus_3)

In [9]:
#now we can run PCA on each
get_eigenvector = function(df, cluster_name){
    pca = prcomp(df, scale = TRUE, center = TRUE)
    #x = values of each sample in terms of the principal components (ie. each cytokine's contribution to PC)
    #rotation = The relationship (correlation or anticorrelation, etc) between the initial variables and PCs
    pca_eigenvector <- data.frame(pca$rotation[,"PC1"])
    colnames(pca_eigenvector)[1] = cluster_name
            
    return(pca_eigenvector)
}

#calling fn
M_eigencytokines_NLF_1 = get_eigenvector(M_NLF_clus_1, "Cluster A")
M_eigencytokines_NLF_2 = get_eigenvector(M_NLF_clus_2, "Cluster B")
M_eigencytokines_NLF_3 = get_eigenvector(M_NLF_clus_3, "Cluster C")
M_eigencytokines_NELF_1 = get_eigenvector(M_NELF_clus_1, "Cluster A")
M_eigencytokines_NELF_2 = get_eigenvector(M_NELF_clus_2, "Cluster B")
M_eigencytokines_NELF_3 = get_eigenvector(M_NELF_clus_3, "Cluster C")
M_eigencytokines_Sputum_1 = get_eigenvector(M_Sputum_clus_1, "Cluster A")
M_eigencytokines_Sputum_2 = get_eigenvector(M_Sputum_clus_2, "Cluster B")
M_eigencytokines_Sputum_3 = get_eigenvector(M_Sputum_clus_3, "Cluster C")
M_eigencytokines_Serum_1 = get_eigenvector(M_Serum_clus_1, "Cluster A")
M_eigencytokines_Serum_2 = get_eigenvector(M_Serum_clus_2, "Cluster B")
M_eigencytokines_Serum_3 = get_eigenvector(M_Serum_clus_3, "Cluster C")
F_eigencytokines_NLF_1 = get_eigenvector(F_NLF_clus_1, "Cluster A")
F_eigencytokines_NLF_2 = get_eigenvector(F_NLF_clus_2, "Cluster B")
F_eigencytokines_NLF_3 = get_eigenvector(F_NLF_clus_3, "Cluster C")
F_eigencytokines_NELF_1 = get_eigenvector(F_NELF_clus_1, "Cluster A")
F_eigencytokines_NELF_2 = get_eigenvector(F_NELF_clus_2, "Cluster B")
F_eigencytokines_NELF_3 = get_eigenvector(F_NELF_clus_3, "Cluster C")
F_eigencytokines_Sputum_1 = get_eigenvector(F_Sputum_clus_1, "Cluster A")
F_eigencytokines_Sputum_2 = get_eigenvector(F_Sputum_clus_2, "Cluster B")
F_eigencytokines_Sputum_3 = get_eigenvector(F_Sputum_clus_3, "Cluster C")
F_eigencytokines_Serum_1 = get_eigenvector(F_Serum_clus_1, "Cluster A")
F_eigencytokines_Serum_2 = get_eigenvector(F_Serum_clus_2, "Cluster B")
F_eigencytokines_Serum_3 = get_eigenvector(F_Serum_clus_3, "Cluster C")

In [10]:
#collapsing eigencytokines
M_eigencytokines_NLF = cbind(M_eigencytokines_NLF_1, M_eigencytokines_NLF_2, M_eigencytokines_NLF_3)
M_eigencytokines_NELF = cbind(M_eigencytokines_NELF_1, M_eigencytokines_NELF_2, M_eigencytokines_NELF_3)
M_eigencytokines_Sputum = cbind(M_eigencytokines_Sputum_1, M_eigencytokines_Sputum_2, M_eigencytokines_Sputum_3)
M_eigencytokines_Serum = cbind(M_eigencytokines_Serum_1, M_eigencytokines_Serum_2, M_eigencytokines_Serum_3)
F_eigencytokines_NLF = cbind(F_eigencytokines_NLF_1, F_eigencytokines_NLF_2, F_eigencytokines_NLF_3)
F_eigencytokines_NELF = cbind(F_eigencytokines_NELF_1, F_eigencytokines_NELF_2, F_eigencytokines_NELF_3)
F_eigencytokines_Sputum = cbind(F_eigencytokines_Sputum_1, F_eigencytokines_Sputum_2, F_eigencytokines_Sputum_3)
F_eigencytokines_Serum = cbind(F_eigencytokines_Serum_1, F_eigencytokines_Serum_2, F_eigencytokines_Serum_3)

In [11]:
add_compartment = function(dataframe, compartment_name){
    #"""
    #Takes the dataframes already made from above and adds a column specifying the compartment.

    #:param: compartment dataframe and compartment name
    #:output: a matrix containing clustered concentrations. 
    #"""
    dataframe = dataframe %>%
        rownames_to_column(var = 'SubjectID')
    Compartment = rep(compartment_name, times = length(dataframe$SubjectID))
    combined_df = cbind(dataframe,Compartment)
    return(combined_df)
}

#calling fn
M_NLF_df = add_compartment(M_eigencytokines_NLF,'NLF')
M_NELF_df = add_compartment(M_eigencytokines_NELF, 'NELF')
M_Sputum_df = add_compartment(M_eigencytokines_Sputum, 'Sputum')
M_Serum_df = add_compartment(M_eigencytokines_Serum, 'Serum')

F_NLF_df = add_compartment(F_eigencytokines_NLF,'NLF')
F_NELF_df = add_compartment(F_eigencytokines_NELF, 'NELF')
F_Sputum_df = add_compartment(F_eigencytokines_Sputum, 'Sputum')
F_Serum_df = add_compartment(F_eigencytokines_Serum, 'Serum')

In [12]:
#making one df with all these values 
M_initial_clustered_data_df = rbind(M_NLF_df, M_NELF_df, M_Sputum_df, M_Serum_df)
M_clustered_data_df = melt(M_initial_clustered_data_df, variable = 'Cluster', value.name = 'Conc_pslog2')

F_initial_clustered_data_df = rbind(F_NLF_df, F_NELF_df, F_Sputum_df, F_Serum_df)
F_clustered_data_df = melt(F_initial_clustered_data_df, variable = 'Cluster', value.name = 'Conc_pslog2')

#combining data_df and subject info
M_data_df = inner_join(M_clustered_data_df, demographics_data_df) %>%
    select(!c(Serum_Cotinine, Ethnicity, Age, BMI))
F_data_df = inner_join(F_clustered_data_df, demographics_data_df) %>%
    select(!c(Serum_Cotinine, Ethnicity, Age, BMI))
head(M_data_df)

Using SubjectID, Compartment as id variables

Using SubjectID, Compartment as id variables

Joining, by = "SubjectID"

Joining, by = "SubjectID"



Unnamed: 0_level_0,SubjectID,Compartment,Cluster,Conc_pslog2,SubjectNo,Group,Race,Sex
Unnamed: 0_level_1,<chr>,<fct>,<fct>,<dbl>,<dbl>,<chr>,<chr>,<chr>
1,CS_23,NLF,Cluster A,0.2127093,23,CS,W,M
2,CS_24,NLF,Cluster A,0.1996138,24,CS,W,M
3,CS_25,NLF,Cluster A,0.1661944,25,CS,W,M
4,CS_26,NLF,Cluster A,0.2195003,26,CS,AA,M
5,CS_27,NLF,Cluster A,0.2111562,27,CS,W,M
6,Ecig_33,NLF,Cluster A,0.2288728,33,Ecig,As,M


Since individual cytokines showed the most differences in male e-cig users and female cigarette smokers, we'll test the same differences in their clustered cytokines.

In [13]:
#creating dfs for each stratified demographic variable
F_NS = F_data_df %>%
    filter(Group == 'NS')
F_CS = F_data_df %>%
    filter(Group == 'CS')
F_Ecig = F_data_df %>%
    filter(Group == 'Ecig')
M_NS = M_data_df %>%
    filter(Group == 'NS')
M_CS = M_data_df %>%
    filter(Group == 'CS')
M_Ecig = M_data_df %>%
    filter(Group == 'Ecig')


#creating vectors to loop through
compartment = c('NLF','NELF','Sputum','Serum')
cluster = unique(M_data_df$Cluster)

In [14]:
#initializing vectors to store values
M_CS_vector = c()
M_Ecig_vector = c()
F_CS_vector = c()
F_Ecig_vector = c()
wilcoxon_rank_sum_values = function(df1, df2, empty_vector){
    #
    for (i in 1:length(compartment)){
        for (j in 1: length(cluster)){
            variable1_df = df1 %>% # baseline df
                filter(Compartment == compartment[i], Cluster == cluster[j]) %>%
                select(Compartment, Cluster, Conc_pslog2)
            variable2_df = df2 %>% # smoker df
                filter(Compartment == compartment[i], Cluster == cluster[j]) %>%
                select(Compartment, Cluster, Conc_pslog2)


            #running wilcoxon rank sum
            wilcox_test = wilcox.test(variable1_df$Conc_pslog2, variable2_df$Conc_pslog2)

            #contains compartment, cluster, u stat, p value
            values_vector = c(compartment[i], cluster[j], wilcox_test$statistic, wilcox_test$p.value)
            empty_vector = c(empty_vector, values_vector)

        }
    }
    return(empty_vector)
}

#calling fn
M_CS_wilcoxon_values = wilcoxon_rank_sum_values(M_NS, M_CS, M_CS_vector)
M_Ecig_wilcoxon_values = wilcoxon_rank_sum_values(M_NS, M_Ecig, M_Ecig_vector)
F_CS_wilcoxon_values = wilcoxon_rank_sum_values(F_NS, F_CS, F_CS_vector)
F_Ecig_wilcoxon_values = wilcoxon_rank_sum_values(F_NS, F_Ecig, F_Ecig_vector)

In [15]:
final_table = function(vector){
    #"""
    #Reformatting each vector into matrices to export. 
    
    #:param: vector, demographic variable
    #:output: a 5x12 matrix containing compartment, cluster, protein, u stat, p value, p adj

    #"""
    dim(vector) = c(4, 12)
    table = data.frame(t(vector))
    colnames(table) = c('Compartment', 'Cluster', 'Stat', 'P Value')
    
    #going back to calculate padj values
    PAdj = c()
    for (i in 1:length(compartment)){
        single_compartment_df = table %>%
            filter(Compartment == compartment[i])
        padj =  p.adjust(single_compartment_df[,4], method = "fdr") 
        PAdj = c(PAdj, padj)
        
    }
    
    table = cbind(table, PAdj)
    return(table)
}

#calling fn
M_CS_table = final_table(M_CS_wilcoxon_values)
M_CS_table
M_Ecig_table = final_table(M_Ecig_wilcoxon_values)
M_Ecig_table
F_CS_table = final_table(F_CS_wilcoxon_values)
F_CS_table
F_Ecig_table = final_table(F_Ecig_wilcoxon_values)
F_Ecig_table

Compartment,Cluster,Stat,P Value,PAdj
<fct>,<fct>,<fct>,<fct>,<dbl>
NLF,1,15,0.755050505050505,1
NLF,2,18,1.0,1
NLF,3,16,0.876262626262626,1
NELF,1,22,0.53030303030303,1
NELF,2,22,0.53030303030303,1
NELF,3,19,0.876262626262626,1
Sputum,1,25,0.267676767676768,1
Sputum,2,23,0.431818181818182,1
Sputum,3,16,0.876262626262626,1
Serum,1,13,0.53030303030303,1


Compartment,Cluster,Stat,P Value,PAdj
<fct>,<fct>,<fct>,<fct>,<dbl>
NLF,1,39,0.836945304437564,1
NLF,2,67,0.035841867111217,1
NLF,3,44,0.9018020163531,1
NELF,1,39,0.836945304437564,1
NELF,2,42,1.0,1
NELF,3,43,0.967135032150512,1
Sputum,1,24,0.142216400730333,1
Sputum,2,41,0.967135032150512,1
Sputum,3,11,0.0071842502183059,1
Serum,1,56,0.261411447169961,1


Compartment,Cluster,Stat,P Value,PAdj
<fct>,<fct>,<fct>,<fct>,<dbl>
NLF,1,26,0.866511266511267,1
NLF,2,34,0.535819735819736,1
NLF,3,41,0.151981351981352,1
NELF,1,29,0.955089355089355,1
NELF,2,30,0.866511266511267,1
NELF,3,34,0.535819735819736,1
Sputum,1,19,0.335664335664336,1
Sputum,2,26,0.866511266511267,1
Sputum,3,33,0.612587412587413,1
Serum,1,17,0.231857031857032,1


Compartment,Cluster,Stat,P Value,PAdj
<fct>,<fct>,<fct>,<fct>,<dbl>
NLF,1,16,0.876262626262626,1
NLF,2,21,0.638888888888889,1
NLF,3,34,0.005050505050505,1
NELF,1,21,0.638888888888889,1
NELF,2,18,1.0,1
NELF,3,24,0.343434343434343,1
Sputum,1,19,0.876262626262626,1
Sputum,2,10,0.267676767676768,1
Sputum,3,24,0.343434343434343,1
Serum,1,6,0.0732323232323232,1
