In [1]:
Output = ('/Users/alexis/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/CEMALB_DataAnalysisPM/Projects/P1005. Miscellaneous Analyses/P1005.7. EV Proteomics/P1005.7.3. Analyses')
library(readxl)
library(tidyverse)
library(imputeLCMD)
library(factoextra)
library(vegan)
library(preprocessCore)

#reading in files
proteome_df = data.frame(read_excel("Input/EV_function_proteomics_012924.xlsx")) %>%
    rename(Protein = Accession, Unique_Peptides = Number_of_UniquePeptides)

proteome_df$PFAS.CEV_R2 = as.numeric(proteome_df$PFAS.CEV_R2)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.3     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: tmvtnorm

Loading required package: mvtnorm

Loading required package: Matrix


Attaching package: ‘Matrix’


The following objects are masked 

In [2]:
head(proteome_df)

Unnamed: 0_level_0,Protein,Unique_Peptides,Control_R1,Control_R2,Control_R3,Control_R4,Control_R5,Control_R6,CEV_R1,CEV_R2,⋯,PFAS.CEV_R3,PFAS.CEV_R4,PFAS.CEV_R5,PFAS.CEV_R6,PFAS.PEV_R1,PFAS.PEV_R2,PFAS.PEV_R3...35,PFAS.PEV_R3...36,PFAS.PEV_R5,PFAS.PEV_R6
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,P00761,13,46631776,51326464,63449032,46508708,32194210,23975652,50760076,39610572,⋯,78471912,61208496,65192580,63247004,49543192,62553512,69856432,92588176,69703720,87109000
2,P60709,44,14898485,14393858,18238848,18135594,16799740,17023620,14949095,15522869,⋯,23687454,21942680,25449002,20706812,22377822,20689196,20683342,23522216,22659790,22152380
3,P0C0S8;Q96KK5;Q99878;Q9BTM1,11,1977650,2275874,7578563,14216659,8436507,10723020,1744173,3018039,⋯,42397020,41915100,63571952,34687048,14175319,12150578,26680958,43462264,49627204,48483936
4,P23527,19,1365104,1798281,5384082,10863599,6436632,7903236,1547090,2131009,⋯,25425400,28165144,37935572,27360386,8919584,10832857,19775876,29996800,34656756,40720276
5,P68104,51,12752274,13042510,11823310,11414176,11093480,10846471,12069089,12141256,⋯,12803181,10858694,11959423,10805138,13487166,12406781,12351198,10652401,10221360,11634231
6,P06733,54,8854008,8850713,10123283,8877717,8250112,8561273,8166417,9454755,⋯,12480849,11187431,12152479,9926687,12584716,9863353,10702597,13051212,11816466,10026787


In [3]:
longer_proteome_df = proteome_df %>%
    pivot_longer(cols = 3:38, names_to = "ID", values_to = "Value") %>%
   # mutate(Treatment = ID) %>%
    separate(ID, c('Treatment', NA), sep = "_", remove = FALSE)

head(longer_proteome_df)

Protein,Unique_Peptides,ID,Treatment,Value
<chr>,<dbl>,<chr>,<chr>,<dbl>
P00761,13,Control_R1,Control,46631776
P00761,13,Control_R2,Control,51326464
P00761,13,Control_R3,Control,63449032
P00761,13,Control_R4,Control,46508708
P00761,13,Control_R5,Control,32194210
P00761,13,Control_R6,Control,23975652


In [4]:
# intial number of proteins prior to filtering
length(unique(proteome_df$Protein))

# 1. Peptide Normalization
Normalize to total amount of peptide per sample: sum the abundance values for each sample, then calculate the median value across all of the summed abundance values, and calculate the ratio

In [5]:
normalized_df = longer_proteome_df %>%
    group_by(ID) %>%
    #removing missing data
    drop_na() %>%
    mutate(Summed_Value = sum(Value)) %>%
    #calculating median across all samples but still within each set
    ungroup() %>%
    mutate(Median_of_Sum = median(Summed_Value), Norm_Factor = Summed_Value/ Median_of_Sum, 
           Norm_Value = Value/Norm_Factor) %>%
    select(-c("Summed_Value", "Median_of_Sum", "Norm_Factor"))

head(normalized_df)

Protein,Unique_Peptides,ID,Treatment,Value,Norm_Value
<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
P00761,13,Control_R1,Control,46631776,48817716
P00761,13,Control_R2,Control,51326464,54279744
P00761,13,Control_R3,Control,63449032,63449032
P00761,13,Control_R4,Control,46508708,46728346
P00761,13,Control_R5,Control,32194210,34552783
P00761,13,Control_R6,Control,23975652,25655376


# 2. Detection Filter - Round 1
Round 1 - Unable to replicate the 1% peptide/ 5% protein FDR, so we'll just retain proteins that were identified by at least 2 peptides.

In [6]:
# only keeping proteins > 2 unique proteins
detection_filter_1_df = longer_proteome_df %>%
    filter(Unique_Peptides > 2)

head(detection_filter_1_df)

length(unique(detection_filter_1_df$Protein))

Protein,Unique_Peptides,ID,Treatment,Value
<chr>,<dbl>,<chr>,<chr>,<dbl>
P00761,13,Control_R1,Control,46631776
P00761,13,Control_R2,Control,51326464
P00761,13,Control_R3,Control,63449032
P00761,13,Control_R4,Control,46508708
P00761,13,Control_R5,Control,32194210
P00761,13,Control_R6,Control,23975652


# 3. Detection Filter - Round 2
Round 2 - Filter rows based on valid values: 50 % valid in each treatment group (i.e. proteins with values observed in fewer than 50% of replicates (here, 3 reps) in at least 1 condition were eliminated from the dataset)

In [7]:
protein_presence_df = detection_filter_1_df %>%
    # creating 2 count cols: 1st for non-missing data, 2nd for all data
    mutate(data_count = ifelse(is.na(Value), 0, 1), all_count = 1) %>%
    group_by(Protein, Treatment) %>%
    summarize(Protein_Presence = sum(data_count)/ sum(all_count) * 100) %>%
    filter(Protein_Presence < 50) %>%
    arrange(Protein_Presence) 

head(protein_presence_df)

# getting the proteins in the entire set that will be removed
removed_proteins_df = protein_presence_df %>%
    ungroup() %>%
    select(-Protein_Presence) %>% 
    unique()

# number of proteins removed from each set for the 2nd detection filter
removed_proteins_df %>%
    summarize(`Removed Proteins` = length(unique(Protein)))

[1m[22m`summarise()` has grouped output by 'Protein'. You can override using the
`.groups` argument.


Protein,Treatment,Protein_Presence
<chr>,<chr>,<dbl>
A3EZ82;Q6E0U4,PFAS,0
A3EZ82;Q6E0U4,PFAS.CEV,0
A3EZ82;Q6E0U4,PFAS.PEV,0
O00762,PFAS,0
O00762,PFAS.PEV,0
O14682,PFAS,0


Removed Proteins
<int>
129


In [8]:
# removing those 129 proteins from the entire dataset 
# finding proteins that exist only in the proteome_df but not the removed proteins_df
detection_filter_2_df = anti_join(detection_filter_1_df, removed_proteins_df)
head(detection_filter_2_df)

# number of proteins kept
detection_filter_2_df %>%
    group_by(Treatment) %>%
    summarize(`Kept Proteins` = length(unique(Protein)))

[1m[22mJoining with `by = join_by(Protein, Treatment)`


Protein,Unique_Peptides,ID,Treatment,Value
<chr>,<dbl>,<chr>,<chr>,<dbl>
P00761,13,Control_R1,Control,46631776
P00761,13,Control_R2,Control,51326464
P00761,13,Control_R3,Control,63449032
P00761,13,Control_R4,Control,46508708
P00761,13,Control_R5,Control,32194210
P00761,13,Control_R6,Control,23975652


Treatment,Kept Proteins
<chr>,<int>
CEV,6699
Control,6694
PEV,6701
PFAS,6659
PFAS.CEV,6633
PFAS.PEV,6643
