---
title: Required inputs and parameters
jupyter: python3
---

In [1]:
from utils import *
from barcode_funcs import *

# warnings.filterwarnings("ignore")

from IPython.display import display
plt.rcParams['figure.figsize'] = [7, 5]
# plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower

import pingouin as pg

In [2]:
# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option('display.max_columns', None)

In [3]:
# Parameters for the LiP workflow.
# A PTM workflow is planned as well and differs in some parameters; the
# trailing tags presumably mark which values are shared by both workflows
# ("same") and which apply to LiP only ("unique to lip") — TODO confirm.

id_separator = '@'  # same
sig_thr = 0.05 # same; presumably the cutoff applied to the statistic named by sig_type — confirm against the helpers
sig_type = "pval" # same
missing_thr = 0.5 # same; presumably a missingness cutoff — confirm against the helpers
min_pept_count = 2 # unique to lip
search_tool = "FragPipe" # same; later cells compare it case-insensitively with "maxquant", "msfragger" and "fragpipe"

In [4]:
# Identifiers that select the input folder and name the output folder.
experiment_type = "PnT1" # same
sample = "Cyano" # same
data_type = "LiP" # same
working_dir = "."

# Resolve the working directory once, then derive both locations from it:
#   <root>/data/<experiment_type>/<sample>/<data_type>
#   <root>/results/original_<experiment_type>/<sample>
project_root = os.path.abspath(working_dir)
data_dir = "/".join([project_root, "data", experiment_type, sample, data_type]) # same
result_dir = "/".join([project_root, "results", f"original_{experiment_type}", sample]) # same

# The result folder is created up front (missing parents included) and an
# already existing folder is accepted.
Path(result_dir).mkdir(parents=True, exist_ok=True)

In [5]:
# Parse the proteome FASTA and keep all records in a list; the cell displays
# the number of records read.
fasta_file = f"{data_dir}/uniprotkb_proteome_UP000889800_2025_02_10.fasta"
prot_seq_obj = SeqIO.parse(fasta_file, "fasta")
prot_seqs = list(prot_seq_obj)  # consumes the parser's iterator
len(prot_seqs)

2657

In [6]:
# Path for the processed LiP sample sheet. It is NOT the file read below; in
# this notebook it is only referenced by commented-out write/read lines.
metadata_file = f"{data_dir}/metadata_lip.txt"   # unique to lip
# The raw sample sheet is read from metadata.csv in the same data folder.
metadata_ori = pd.read_csv(f"{data_dir}/metadata.csv")
metadata_ori

Unnamed: 0,Sample,Density,Condition,Digestion,Replicate,Group
0,Dense_Light_T1,Dense,Light,Trypsin_only,1,Dense_Light
1,Dense_Light_T2,Dense,Light,Trypsin_only,2,Dense_Light
2,Dense_Light_T3,Dense,Light,Trypsin_only,3,Dense_Light
3,Dense_Light_T4,Dense,Light,Trypsin_only,4,Dense_Light
4,Dense_Light_T5,Dense,Light,Trypsin_only,5,Dense_Light
5,Dilute_Light_T1,Dilute,Light,Trypsin_only,1,Dilute_Light
6,Dilute_Light_T2,Dilute,Light,Trypsin_only,2,Dilute_Light
7,Dilute_Light_T3,Dilute,Light,Trypsin_only,3,Dilute_Light
8,Dilute_Light_T4,Dilute,Light,Trypsin_only,4,Dilute_Light
9,Dilute_Light_T5,Dilute,Light,Trypsin_only,5,Dilute_Light


In [7]:
print(metadata_ori.columns.to_list())

['Sample', 'Density', 'Condition', 'Digestion', 'Replicate', 'Group']


In [8]:
# Build the LiP sample sheet from the raw metadata.
#
# Raw sample names look like "Dense_Light_T1" / "Dense_Light_DD1": the first
# two "_"-separated tokens name the group, the last token ends in the
# replicate number. Derived columns:
#   SampleNames  raw sample name as found in the search-tool column headers
#   LiP          "ProK" when the raw name contains "_DD", otherwise "Tryp"
#   Treatment    the raw "Condition" value
#   Condition    "Treatment" when the raw condition is "Dark", else "Control"
#   Sample       "<token1>_<token2>_<replicate>"
#   Replicate    replicate number parsed from the raw name (int)
#   Dose         copy of "Density"
#   Time         not available for this experiment (NaN)
#   Group        first two tokens of "Sample"
metadata_lip = metadata_ori.copy()
metadata_lip["SampleNames"] = metadata_lip["Sample"]
metadata_lip["LiP"] = ["ProK" if "_DD" in x else "Tryp" for x in metadata_lip["SampleNames"]]
metadata_lip["Treatment"] = metadata_lip["Condition"]
metadata_lip["Condition"] = ["Treatment" if x == "Dark" else "Control" for x in metadata_lip["Treatment"]]

# Take ALL trailing digits as the replicate number. Reading only the last
# character would turn replicate 10 into 0 and make Sample IDs collide.
# A name without trailing digits fails at the int conversion below.
replicate_str = metadata_lip["SampleNames"].str.extract(r"(\d+)$", expand=False)
name_prefix = metadata_lip["SampleNames"].str.split("_").str[:2].str.join("_")
metadata_lip["Sample"] = name_prefix + "_" + replicate_str
metadata_lip["Replicate"] = replicate_str.astype(int)
metadata_lip["Dose"] = metadata_lip["Density"]
metadata_lip["Time"] = np.nan
metadata_lip["Group"] = metadata_lip["Sample"].str.split("_").str[:2].str.join("_")
metadata_lip

Unnamed: 0,Sample,Density,Condition,Digestion,Replicate,Group,SampleNames,LiP,Treatment,Dose,Time
0,Dense_Light_1,Dense,Control,Trypsin_only,1,Dense_Light,Dense_Light_T1,Tryp,Light,Dense,
1,Dense_Light_2,Dense,Control,Trypsin_only,2,Dense_Light,Dense_Light_T2,Tryp,Light,Dense,
2,Dense_Light_3,Dense,Control,Trypsin_only,3,Dense_Light,Dense_Light_T3,Tryp,Light,Dense,
3,Dense_Light_4,Dense,Control,Trypsin_only,4,Dense_Light,Dense_Light_T4,Tryp,Light,Dense,
4,Dense_Light_5,Dense,Control,Trypsin_only,5,Dense_Light,Dense_Light_T5,Tryp,Light,Dense,
5,Dilute_Light_1,Dilute,Control,Trypsin_only,1,Dilute_Light,Dilute_Light_T1,Tryp,Light,Dilute,
6,Dilute_Light_2,Dilute,Control,Trypsin_only,2,Dilute_Light,Dilute_Light_T2,Tryp,Light,Dilute,
7,Dilute_Light_3,Dilute,Control,Trypsin_only,3,Dilute_Light,Dilute_Light_T3,Tryp,Light,Dilute,
8,Dilute_Light_4,Dilute,Control,Trypsin_only,4,Dilute_Light,Dilute_Light_T4,Tryp,Light,Dilute,
9,Dilute_Light_5,Dilute,Control,Trypsin_only,5,Dilute_Light,Dilute_Light_T5,Tryp,Light,Dilute,


In [9]:
# Writing to `metadata_file` is disabled (commented out below). The sample
# sheet is written tab-separated to "meta.tsv", a path relative to the current
# working directory, with index=None passed for the row index.
#metadata_lip.to_csv(metadata_file, sep='\t', index=None)
metadata_lip.to_csv("meta.tsv", sep='\t', index=None)

In [10]:
#metadata_lip = pd.read_csv(metadata_file, sep='\t')
metadata_lip

Unnamed: 0,Sample,Density,Condition,Digestion,Replicate,Group,SampleNames,LiP,Treatment,Dose,Time
0,Dense_Light_1,Dense,Control,Trypsin_only,1,Dense_Light,Dense_Light_T1,Tryp,Light,Dense,
1,Dense_Light_2,Dense,Control,Trypsin_only,2,Dense_Light,Dense_Light_T2,Tryp,Light,Dense,
2,Dense_Light_3,Dense,Control,Trypsin_only,3,Dense_Light,Dense_Light_T3,Tryp,Light,Dense,
3,Dense_Light_4,Dense,Control,Trypsin_only,4,Dense_Light,Dense_Light_T4,Tryp,Light,Dense,
4,Dense_Light_5,Dense,Control,Trypsin_only,5,Dense_Light,Dense_Light_T5,Tryp,Light,Dense,
5,Dilute_Light_1,Dilute,Control,Trypsin_only,1,Dilute_Light,Dilute_Light_T1,Tryp,Light,Dilute,
6,Dilute_Light_2,Dilute,Control,Trypsin_only,2,Dilute_Light,Dilute_Light_T2,Tryp,Light,Dilute,
7,Dilute_Light_3,Dilute,Control,Trypsin_only,3,Dilute_Light,Dilute_Light_T3,Tryp,Light,Dilute,
8,Dilute_Light_4,Dilute,Control,Trypsin_only,4,Dilute_Light,Dilute_Light_T4,Tryp,Light,Dilute,
9,Dilute_Light_5,Dilute,Control,Trypsin_only,5,Dilute_Light,Dilute_Light_T5,Tryp,Light,Dilute,


In [11]:
# Search-tool result tables, one sub-folder per digestion scheme.
trypsin_dir = f"{data_dir}/Trypsin_only"
double_dir = f"{data_dir}/Double_digest"

trypsin_pept_file = f"{trypsin_dir}/combined_peptide.tsv"   # unique to lip
trypsin_prot_file = f"{trypsin_dir}/combined_protein.tsv" # unique to lip
double_pept_file = f"{double_dir}/combined_peptide.tsv" # unique to lip
# The double-digest protein table is not used:
# double_prot_file = f"{data_dir}/Double_digest/combined_protein.tsv" # unique to lip

In [12]:
# Load the three tab-separated result tables with the same reader.
# (pd.read_table with sep='\t' is pd.read_csv with sep='\t'.)
trypsin_pept = pd.read_csv(trypsin_pept_file, sep='\t')
trypsin_prot = pd.read_csv(trypsin_prot_file, sep='\t')
double_pept = pd.read_csv(double_pept_file, sep='\t')
# The double-digest protein table is not loaded:
# double_prot = pd.read_csv(double_prot_file, sep='\t')

In [13]:
double_pept_file

'/Users/shou626/workspace/lipms/Comparison2/data/PnT1/Cyano/LiP/Double_digest/combined_peptide.tsv'

In [14]:
#print(double_pept.columns.to_list())
#print(trypsin_pept.columns.to_list())
#print(trypsin_prot.columns.to_list())

In [15]:
# Select the label of the quantification columns for the configured search
# tool. An unsupported tool raises here, with the same error the later
# column-configuration cells use, instead of leaving `quant_type` undefined
# and failing later with a NameError.
if search_tool.lower() == "maxquant":
    quant_type = "LFQ intensity" # Intensity or LFQ intensity
elif search_tool.lower() in ("msfragger", "fragpipe"):
    quant_type = "MaxLFQ Intensity" # Intensity or MaxLFQ Intensity
else:
    raise NotImplementedError("Not implemented for this search tool")

In [16]:
# Index the sample sheet by raw sample name; later cells rely on this index.
metadata_lip.index = metadata_lip["SampleNames"].to_list()


def rename_quant_columns(table, sample_sheet, quant_label):
    """Return the column names of `table` with quantification columns renamed.

    A column is renamed to the sample's "Sample" ID when its name is exactly
    "<SampleNames> <quant_label>" (the layout of the FragPipe tables loaded in
    this notebook) or "<quant_label> <SampleNames>" (presumably the MaxQuant
    layout — confirm against a MaxQuant export before relying on it).

    Exact-name matching keeps other columns that merely contain the label
    (e.g. "X MaxLFQ Intensity" when the label is "Intensity") untouched.
    Columns whose sample is not listed in `sample_sheet` are left as they are.

    Parameters
    ----------
    table : DataFrame whose columns are inspected (not modified).
    sample_sheet : DataFrame with "SampleNames" and "Sample" columns.
    quant_label : str, label of the quantification columns.

    Returns
    -------
    list of column names, same length and order as `table.columns`.
    """
    renamed = {}
    for raw_name, sample_id in zip(sample_sheet["SampleNames"], sample_sheet["Sample"]):
        renamed[f"{raw_name} {quant_label}"] = sample_id
        renamed[f"{quant_label} {raw_name}"] = sample_id
    return [renamed.get(column, column) for column in table.columns]


trypsin_pept.columns = rename_quant_columns(trypsin_pept, metadata_lip, quant_type)
double_pept.columns = rename_quant_columns(double_pept, metadata_lip, quant_type)
trypsin_prot.columns = rename_quant_columns(trypsin_prot, metadata_lip, quant_type)

In [17]:
# List the column names of each table, one printed list per table.
for quant_table in (double_pept, trypsin_pept, trypsin_prot):
    print(list(quant_table.columns))

['Peptide Sequence', 'Prev AA', 'Next AA', 'Start', 'End', 'Peptide Length', 'Charges', 'Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Mapped Genes', 'Mapped Proteins', 'Dense_Light_DD1 Spectral Count', 'Dense_Light_DD2 Spectral Count', 'Dense_Light_DD3 Spectral Count', 'Dense_Light_DD4 Spectral Count', 'Dense_Light_DD5 Spectral Count', 'Dilute_Light_DD1 Spectral Count', 'Dilute_Light_DD2 Spectral Count', 'Dilute_Light_DD3 Spectral Count', 'Dilute_Light_DD4 Spectral Count', 'Dilute_Light_DD5 Spectral Count', 'Dense_Light_DD1 Intensity', 'Dense_Light_DD2 Intensity', 'Dense_Light_DD3 Intensity', 'Dense_Light_DD4 Intensity', 'Dense_Light_DD5 Intensity', 'Dilute_Light_DD1 Intensity', 'Dilute_Light_DD2 Intensity', 'Dilute_Light_DD3 Intensity', 'Dilute_Light_DD4 Intensity', 'Dilute_Light_DD5 Intensity', 'Dense_Light_1', 'Dense_Light_2', 'Dense_Light_3', 'Dense_Light_4', 'Dense_Light_5', 'Dilute_Light_1', 'Dilute_Light_2', 'Dilute_Light_3', 'Dilute_Light_4', 'Dilute_Li

In [18]:
# One row per trypsin-only sample, annotated with the raw names of the
# trypsin-only run and of the matching double-digest (ProK) run.
tryp_rows = metadata_lip["LiP"] == "Tryp"
prok_rows = metadata_lip["LiP"] == "ProK"

metadata = metadata_lip[tryp_rows].copy()
metadata["Trypsin_only_samples"] = metadata["SampleNames"]

# The two digestions share the "Sample" ID but not the row index, so the
# double-digest names are looked up by "Sample". Assigning the ProK rows
# directly aligns on the index and yields NaN for every row.
# Samples without a double-digest run stay NaN.
double_name_by_sample = dict(
    zip(metadata_lip.loc[prok_rows, "Sample"], metadata_lip.loc[prok_rows, "SampleNames"])
)
metadata["Double_digest_samples"] = metadata["Sample"].map(double_name_by_sample)

In [19]:
# Metadata column whose levels define the pairwise comparisons; its distinct
# values are collected once and printed as a plain list.
pairwise_factor = "Density"
pairwise_pars = metadata[pairwise_factor].unique()
print(pairwise_pars.tolist())
anova_factors = ["Treatment", pairwise_factor]

['Dense', 'Dilute']


In [20]:
metadata[pairwise_factor]

Dense_Light_T1      Dense
Dense_Light_T2      Dense
Dense_Light_T3      Dense
Dense_Light_T4      Dense
Dense_Light_T5      Dense
Dilute_Light_T1    Dilute
Dilute_Light_T2    Dilute
Dilute_Light_T3    Dilute
Dilute_Light_T4    Dilute
Dilute_Light_T5    Dilute
Name: Density, dtype: object

In [21]:
metadata

Unnamed: 0,Sample,Density,Condition,Digestion,Replicate,Group,SampleNames,LiP,Treatment,Dose,Time,Trypsin_only_samples,Double_digest_samples
Dense_Light_T1,Dense_Light_1,Dense,Control,Trypsin_only,1,Dense_Light,Dense_Light_T1,Tryp,Light,Dense,,Dense_Light_T1,
Dense_Light_T2,Dense_Light_2,Dense,Control,Trypsin_only,2,Dense_Light,Dense_Light_T2,Tryp,Light,Dense,,Dense_Light_T2,
Dense_Light_T3,Dense_Light_3,Dense,Control,Trypsin_only,3,Dense_Light,Dense_Light_T3,Tryp,Light,Dense,,Dense_Light_T3,
Dense_Light_T4,Dense_Light_4,Dense,Control,Trypsin_only,4,Dense_Light,Dense_Light_T4,Tryp,Light,Dense,,Dense_Light_T4,
Dense_Light_T5,Dense_Light_5,Dense,Control,Trypsin_only,5,Dense_Light,Dense_Light_T5,Tryp,Light,Dense,,Dense_Light_T5,
Dilute_Light_T1,Dilute_Light_1,Dilute,Control,Trypsin_only,1,Dilute_Light,Dilute_Light_T1,Tryp,Light,Dilute,,Dilute_Light_T1,
Dilute_Light_T2,Dilute_Light_2,Dilute,Control,Trypsin_only,2,Dilute_Light,Dilute_Light_T2,Tryp,Light,Dilute,,Dilute_Light_T2,
Dilute_Light_T3,Dilute_Light_3,Dilute,Control,Trypsin_only,3,Dilute_Light,Dilute_Light_T3,Tryp,Light,Dilute,,Dilute_Light_T3,
Dilute_Light_T4,Dilute_Light_4,Dilute,Control,Trypsin_only,4,Dilute_Light,Dilute_Light_T4,Tryp,Light,Dilute,,Dilute_Light_T4,
Dilute_Light_T5,Dilute_Light_5,Dilute,Control,Trypsin_only,5,Dilute_Light,Dilute_Light_T5,Tryp,Light,Dilute,,Dilute_Light_T5,


In [22]:
def samples_in_group(group_name):
    """Return the "Sample" IDs of all metadata rows whose "Group" is `group_name`."""
    return metadata.loc[metadata["Group"] == group_name, "Sample"].to_list()


def groups_where(row_mask):
    """Return the distinct "Group" labels of the rows selected by `row_mask`, in row order."""
    return list(metadata.loc[row_mask, "Group"].unique())


def comparison_columns(comparisons):
    """Return the ratio, "_pval" and "_adj-p" column names for a list of comparisons."""
    labels = [comparison[0] for comparison in comparisons]
    return labels + [f"{label}_pval" for label in labels] + [f"{label}_adj-p" for label in labels]


is_control = metadata["Condition"] == "Control"
is_treatment = metadata["Condition"] == "Treatment"

# Group labels and, for each group, the list of its sample (intensity) columns.
control_groups = groups_where(is_control)
control_group_cols = [samples_in_group(group) for group in control_groups]
treat_groups = groups_where(is_treatment)
treat_group_cols = [samples_in_group(group) for group in treat_groups]
groups = list(metadata["Group"].unique())
group_cols = [samples_in_group(group) for group in groups]
all_groups = list(metadata["Group"].unique())
all_group_cols = [samples_in_group(group) for group in all_groups]
int_cols = metadata["Sample"].to_list()

anova_cols = int_cols

# Treatment-vs-control comparisons within each level of `pairwise_factor`.
# Each entry is [label, control_group, treat_group, control_samples, treat_samples].
# Groups are visited in metadata row order so the resulting column order is
# the same on every run (a set would iterate in an arbitrary order).
pairwise_ttest_groups = []
for par in pairwise_pars:
    in_level = metadata[pairwise_factor] == par
    for control_group in groups_where(is_control & in_level):
        for treat_group in groups_where(is_treatment & in_level):
            pairwise_ttest_groups.append([f"{treat_group}/{control_group}", control_group, treat_group, samples_in_group(control_group), samples_in_group(treat_group)])

# Explicitly listed comparisons against a fixed reference group; entries have
# the same five-element layout as above.
dose_pairwise_ttest_groups = []
dose_ctrl_group = 'Dense_Light'
#for dose_treat_group in ['Dense_Dark']:
for dose_treat_group in ['Dilute_Light']:
    dose_pairwise_ttest_groups.append([f"{dose_treat_group}/{dose_ctrl_group}", dose_ctrl_group, dose_treat_group, samples_in_group(dose_ctrl_group), samples_in_group(dose_treat_group)])
#dose_ctrl_group = 'Dense_Dark'
#for dose_treat_group in ['Dilute_Dark']:
#    dose_pairwise_ttest_groups.append([f"{dose_treat_group}/{dose_ctrl_group}", dose_ctrl_group, dose_treat_group, metadata[metadata["Group"] == dose_ctrl_group]["Sample"].to_list(), metadata[metadata["Group"] == dose_treat_group]["Sample"].to_list()])

# Names of the statistics columns: missingness first, then for each comparison
# list its ratio columns, its p-value columns and its adjusted p-value columns.
stats_cols = (
    ["Total missingness"]
    + [f"{group} missingness" for group in groups]
    + comparison_columns(pairwise_ttest_groups)
    + comparison_columns(dose_pairwise_ttest_groups)
)

# ANOVA columns are only added when there are more than two groups.
if len(groups) > 2:
    # Check that the name exists BEFORE reading it; the reverse order raises a
    # NameError and never reaches the fallback.
    if 'anova_factors' not in globals() or len(anova_factors) < 1:
        anova_factors = ["Group"]
    # Every factor on its own plus every pairwise interaction "A * B".
    anova_factor_names = [f"{anova_factors[i]} * {anova_factors[j]}" if i != j else f"{anova_factors[i]}" for i in range(len(anova_factors)) for j in range(i, len(anova_factors))]
    stats_cols += [f"ANOVA_[{anova_factor_name}]_pval" for anova_factor_name in anova_factor_names]
    stats_cols += [f"ANOVA_[{anova_factor_name}]_adj-p" for anova_factor_name in anova_factor_names]

In [23]:
groups

['Dense_Light', 'Dilute_Light']

In [24]:
metadata["Sample"].to_list()

['Dense_Light_1',
 'Dense_Light_2',
 'Dense_Light_3',
 'Dense_Light_4',
 'Dense_Light_5',
 'Dilute_Light_1',
 'Dilute_Light_2',
 'Dilute_Light_3',
 'Dilute_Light_4',
 'Dilute_Light_5']

In [25]:
dose_pairwise_ttest_groups

[['Dilute_Light/Dense_Light',
  'Dense_Light',
  'Dilute_Light',
  ['Dense_Light_1',
   'Dense_Light_2',
   'Dense_Light_3',
   'Dense_Light_4',
   'Dense_Light_5'],
  ['Dilute_Light_1',
   'Dilute_Light_2',
   'Dilute_Light_3',
   'Dilute_Light_4',
   'Dilute_Light_5']]]

In [26]:
# Only the protein-name and peptide-sequence column names depend on the
# search tool; an unsupported tool stops here before anything is defined.
if search_tool.lower() == "maxquant":
    protein_col, peptide_col = "Protein names", "Sequence"
elif search_tool.lower() in ("msfragger", "fragpipe"):
    protein_col, peptide_col = "Protein", "Peptide Sequence"
else:
    raise NotImplementedError("Not implemented for this search tool")

# Column names shared by every supported search tool.
id_col = 'id'
uniprot_col = "UniProt"
site_col = "Site"
residue_col = "Residue"
type_col = "Type"
experiment_col = "Experiment"
site_number_col = "site_number"

# Annotation columns, then the same columns followed by the per-sample
# intensity columns and the statistics columns. The protein and peptide lists
# hold the same names but are separate list objects.
prot_info_cols = [id_col, uniprot_col, protein_col, site_col, residue_col, type_col, experiment_col, site_number_col]
prot_info_stats_cols = prot_info_cols + int_cols + stats_cols
pept_info_cols = list(prot_info_cols)
pept_info_stats_cols = pept_info_cols + int_cols + stats_cols

In [27]:
# Optional manual overrides. Leave a value as None to use the default of the
# configured search tool.
ProteinID_col_prot = None # unique to lip
ProteinID_col_pept = None # unique to lip
PeptCounts_col = None # unique to lip

# Defaults per search tool, in the order
# (protein-ID column of the protein table, protein-ID column of the peptide
# table, peptide-count column).
tool_name = (search_tool or "").lower()
if tool_name == "maxquant":
    default_id_cols = ("Majority protein IDs", "Leading razor protein", "Peptide counts (all)")
elif tool_name in ("msfragger", "fragpipe"):
    default_id_cols = ("Protein", "Protein", "Combined Total Peptides")
else:
    default_id_cols = (None, None, None)

# Fill in only the values that were not overridden, so a partial override is
# kept instead of being silently replaced by the defaults.
if ProteinID_col_prot is None:
    ProteinID_col_prot = default_id_cols[0]
if ProteinID_col_pept is None:
    ProteinID_col_pept = default_id_cols[1]
if PeptCounts_col is None:
    PeptCounts_col = default_id_cols[2]

# Raised when the tool is unsupported and at least one column is still unknown.
if ProteinID_col_prot is None or ProteinID_col_pept is None or PeptCounts_col is None:
    raise NotImplementedError("The search tool is not supported and the column names were not all provided. Please specify a supported search tool (MaxQuant, MSFragger or FragPipe) or provide the protein-ID columns of both the protein table and the peptide table as well as the peptide-count column.")

In [28]:
pairwise_ttest_groups

[]

In [29]:
groups

['Dense_Light', 'Dilute_Light']

#################################################################################
#################################################################################

# The analysis pipeline

## Pre-processing the data


In [30]:
# List the column names of each table before pre-processing starts.
for quant_table in (double_pept, trypsin_pept, trypsin_prot):
    print(list(quant_table.columns))

['Peptide Sequence', 'Prev AA', 'Next AA', 'Start', 'End', 'Peptide Length', 'Charges', 'Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Mapped Genes', 'Mapped Proteins', 'Dense_Light_DD1 Spectral Count', 'Dense_Light_DD2 Spectral Count', 'Dense_Light_DD3 Spectral Count', 'Dense_Light_DD4 Spectral Count', 'Dense_Light_DD5 Spectral Count', 'Dilute_Light_DD1 Spectral Count', 'Dilute_Light_DD2 Spectral Count', 'Dilute_Light_DD3 Spectral Count', 'Dilute_Light_DD4 Spectral Count', 'Dilute_Light_DD5 Spectral Count', 'Dense_Light_DD1 Intensity', 'Dense_Light_DD2 Intensity', 'Dense_Light_DD3 Intensity', 'Dense_Light_DD4 Intensity', 'Dense_Light_DD5 Intensity', 'Dilute_Light_DD1 Intensity', 'Dilute_Light_DD2 Intensity', 'Dilute_Light_DD3 Intensity', 'Dilute_Light_DD4 Intensity', 'Dilute_Light_DD5 Intensity', 'Dense_Light_1', 'Dense_Light_2', 'Dense_Light_3', 'Dense_Light_4', 'Dense_Light_5', 'Dilute_Light_1', 'Dilute_Light_2', 'Dilute_Light_3', 'Dilute_Light_4', 'Dilute_Li

In [31]:
# (rows, columns) of each table as loaded.
for quant_table in (trypsin_pept, double_pept, trypsin_prot):
    print(quant_table.shape)

(38829, 54)
(49010, 54)
(2021, 65)


In [32]:
# Pass each table through the project's contaminant/reverse filters (helpers
# come from the star imports; presumably they drop contaminant and reverse
# (decoy) entries — verify in the helper module). Each table name is rebound
# to the helper's return value. In the recorded run every table kept all of
# its rows and gained one column.
double_pept = filter_contaminants_reverse_pept(double_pept, search_tool, ProteinID_col_pept, uniprot_col)
trypsin_pept = filter_contaminants_reverse_pept(trypsin_pept, search_tool, ProteinID_col_pept, uniprot_col)
trypsin_prot = filter_contaminants_reverse_prot(trypsin_prot, search_tool, ProteinID_col_prot, uniprot_col)

In [33]:
# (rows, columns) of each table after the contaminant/reverse filter.
for quant_table in (trypsin_pept, double_pept, trypsin_prot):
    print(quant_table.shape)

(38829, 55)
(49010, 55)
(2021, 66)


In [34]:
# Keep only entries whose protein identifier carries the organism tag of the
# studied proteome (entry names in these tables look like "RRP3_SYNE7").
organism_tag = "_SYNE7"

# - regex=False: the tag is matched as plain text.
# - na=False:    rows without an identifier are dropped instead of producing
#                an invalid boolean mask.
# - .copy():     the filtered tables are independent frames, so the column
#                assignments in later cells do not trigger
#                SettingWithCopyWarning.
double_pept = double_pept[double_pept[ProteinID_col_pept].str.contains(organism_tag, regex=False, na=False)].copy()
trypsin_pept = trypsin_pept[trypsin_pept[ProteinID_col_pept].str.contains(organism_tag, regex=False, na=False)].copy()
trypsin_prot = trypsin_prot[trypsin_prot[ProteinID_col_prot].str.contains(organism_tag, regex=False, na=False)].copy()

In [35]:
# (rows, columns) of each table after the organism filter.
for quant_table in (trypsin_pept, double_pept, trypsin_prot):
    print(quant_table.shape)

(38438, 55)
(48573, 55)
(1994, 66)


In [36]:
# Split "|"-separated protein identifiers into their second field
# (-> uniprot_col) and third field (-> protein_col).
#
# protein_col can be the same column as ProteinID_col_prot (both are
# "Protein" for FragPipe), so this cell overwrites its own input. Only rows
# that still hold an identifier with at least three fields are rewritten;
# re-running the cell is then a no-op instead of turning both columns into NaN.
id_parts = trypsin_prot[ProteinID_col_prot].str.split("|")
is_full_id = id_parts.str.len().ge(3)
if is_full_id.any():
    # .to_numpy() assigns by position, independent of index alignment.
    trypsin_prot.loc[is_full_id, uniprot_col] = id_parts[is_full_id].str[1].to_numpy()
    trypsin_prot.loc[is_full_id, protein_col] = id_parts[is_full_id].str[2].to_numpy()
trypsin_prot

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Length,Organism,Protein Existence,Description,Protein Probability,Top Peptide Probability,Combined Total Peptides,Combined Spectral Count,Combined Unique Spectral Count,Combined Total Spectral Count,Dense_Light_T1 Spectral Count,Dense_Light_T2 Spectral Count,Dense_Light_T3 Spectral Count,Dense_Light_T4 Spectral Count,Dense_Light_T5 Spectral Count,Dilute_Light_T1 Spectral Count,Dilute_Light_T2 Spectral Count,Dilute_Light_T3 Spectral Count,Dilute_Light_T4 Spectral Count,Dilute_Light_T5 Spectral Count,Dense_Light_T1 Unique Spectral Count,Dense_Light_T2 Unique Spectral Count,Dense_Light_T3 Unique Spectral Count,Dense_Light_T4 Unique Spectral Count,Dense_Light_T5 Unique Spectral Count,Dilute_Light_T1 Unique Spectral Count,Dilute_Light_T2 Unique Spectral Count,Dilute_Light_T3 Unique Spectral Count,Dilute_Light_T4 Unique Spectral Count,Dilute_Light_T5 Unique Spectral Count,Dense_Light_T1 Total Spectral Count,Dense_Light_T2 Total Spectral Count,Dense_Light_T3 Total Spectral Count,Dense_Light_T4 Total Spectral Count,Dense_Light_T5 Total Spectral Count,Dilute_Light_T1 Total Spectral Count,Dilute_Light_T2 Total Spectral Count,Dilute_Light_T3 Total Spectral Count,Dilute_Light_T4 Total Spectral Count,Dilute_Light_T5 Total Spectral Count,Dense_Light_T1 Intensity,Dense_Light_T2 Intensity,Dense_Light_T3 Intensity,Dense_Light_T4 Intensity,Dense_Light_T5 Intensity,Dilute_Light_T1 Intensity,Dilute_Light_T2 Intensity,Dilute_Light_T3 Intensity,Dilute_Light_T4 Intensity,Dilute_Light_T5 Intensity,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dilute_Light_1,Dilute_Light_2,Dilute_Light_3,Dilute_Light_4,Dilute_Light_5,Indistinguishable Proteins,UniProt
0,RRP3_SYNE7,O05161,RRP3_SYNE7,mut3G,112,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Probable small ribosomal subunit protein cS23,1.0000,0.999,10,132,132,132,3,3,2,1,4,12,8,9,11,13,3,3,2,1,4,12,8,9,11,13,3,3,2,1,4,12,8,9,11,13,3.760000e+08,3.810000e+08,2.730000e+08,2.820000e+08,3.970000e+08,3.370000e+09,2.740000e+09,2.760000e+09,2.220000e+09,2.780000e+09,2.330000e+08,2.290000e+08,1.670000e+08,1.750000e+08,2.060000e+08,1.030000e+09,8.610000e+08,8.660000e+08,8.330000e+08,8.360000e+08,,O05161
1,AMPA_SYNE7,O06865,AMPA_SYNE7,pepA,486,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Probable cytosol aminopeptidase,1.0000,0.999,95,2318,2318,2318,118,115,103,107,130,124,127,123,119,109,118,115,103,107,130,124,127,123,119,109,118,115,103,107,130,124,127,123,119,109,2.980000e+10,2.430000e+10,2.830000e+10,2.500000e+10,2.520000e+10,3.350000e+10,3.370000e+10,3.190000e+10,2.910000e+10,3.130000e+10,2.520000e+09,2.420000e+09,2.620000e+09,2.390000e+09,2.490000e+09,3.300000e+09,3.470000e+09,3.150000e+09,3.230000e+09,3.170000e+09,,O06865
2,MOBA_SYNE7,O06866,MOBA_SYNE7,mobA,194,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Probable molybdenum cofactor guanylyltransferase,1.0000,0.999,3,32,32,32,3,1,1,2,0,2,2,1,0,2,3,1,1,2,0,2,2,1,0,2,3,1,1,2,0,2,2,1,0,2,7.320000e+07,2.550000e+07,2.200000e+07,5.890000e+07,0.000000e+00,4.280000e+07,3.550000e+07,2.730000e+07,0.000000e+00,3.020000e+07,3.110000e+07,0.000000e+00,0.000000e+00,3.240000e+07,0.000000e+00,2.360000e+07,1.950000e+07,1.500000e+07,0.000000e+00,1.660000e+07,,O06866
3,CHLD_SYNE7,O07345,CHLD_SYNE7,chlD,677,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Magnesium-chelatase subunit ChlD,1.0000,0.999,19,112,112,112,9,6,7,9,4,6,4,4,2,3,9,6,7,9,4,6,4,4,2,3,9,6,7,9,4,6,4,4,2,3,3.050000e+08,2.100000e+08,2.300000e+08,2.490000e+08,4.620000e+07,5.720000e+07,4.740000e+07,6.080000e+07,3.160000e+07,2.910000e+07,7.380000e+07,5.400000e+07,5.590000e+07,5.900000e+07,1.820000e+07,2.610000e+07,3.320000e+07,2.960000e+07,2.820000e+07,0.000000e+00,,O07345
4,GSHB_SYNE7,O32463,GSHB_SYNE7,gshB,323,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Glutathione synthetase,1.0000,0.999,47,664,664,664,33,31,34,27,38,36,34,36,29,32,33,31,34,27,38,36,34,36,29,32,33,31,34,27,38,36,34,36,29,32,1.910000e+10,3.930000e+09,4.240000e+09,4.290000e+09,4.770000e+09,6.000000e+09,5.770000e+09,5.880000e+09,8.540000e+09,5.640000e+09,7.060000e+08,7.200000e+08,8.600000e+08,8.240000e+08,8.720000e+08,1.210000e+09,1.140000e+09,1.140000e+09,1.270000e+09,1.090000e+09,,O32463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016,Q9WWL7_SYNE7,Q9WWL7,Q9WWL7_SYNE7,gap3,333,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Glyceraldehyde-3-phosphate dehydrogenase,0.9998,0.999,6,28,28,53,1,0,2,2,2,0,1,1,2,3,1,0,2,2,2,0,1,1,2,3,3,2,3,3,4,1,2,3,4,3,5.880000e+07,0.000000e+00,5.060000e+07,1.350000e+08,1.580000e+08,2.770000e+07,2.130000e+07,2.280000e+08,4.000000e+07,2.510000e+08,0.000000e+00,0.000000e+00,0.000000e+00,5.820000e+07,6.750000e+07,0.000000e+00,0.000000e+00,1.210000e+08,5.970000e+07,1.030000e+08,,Q9WWL7
2017,Q9WWQ0_SYNE7,Q9WWQ0,Q9WWQ0_SYNE7,cpmA,260,Synechococcus elongatus (strain ATCC 33912 /,4:Protein predicted,Circadian phase modifier,1.0000,0.999,7,91,91,91,6,7,7,5,6,2,4,2,2,4,6,7,7,5,6,2,4,2,2,4,6,7,7,5,6,2,4,2,2,4,4.010000e+08,4.480000e+08,4.410000e+08,4.160000e+08,4.200000e+08,1.000000e+08,1.850000e+08,1.590000e+08,9.720000e+07,1.770000e+08,9.640000e+07,9.920000e+07,1.050000e+08,1.070000e+08,9.090000e+07,5.910000e+07,6.090000e+07,5.930000e+07,5.720000e+07,6.130000e+07,,Q9WWQ0
2018,Q9Z3G2_SYNE7,Q9Z3G2,Q9Z3G2_SYNE7,cytM,145,Synechococcus elongatus (strain ATCC 33912 /,4:Protein predicted,CytM,1.0000,0.999,3,10,10,10,0,0,0,2,0,0,1,0,1,2,0,0,0,2,0,0,1,0,1,2,0,0,0,2,0,0,1,0,1,2,0.000000e+00,0.000000e+00,0.000000e+00,3.520000e+07,0.000000e+00,0.000000e+00,2.550000e+07,0.000000e+00,2.300000e+07,5.700000e+07,0.000000e+00,0.000000e+00,0.000000e+00,2.550000e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.030000e+07,,Q9Z3G2
2019,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,219,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,PBS lyase HEAT-like repeat,1.0000,0.999,43,679,679,679,30,31,34,33,33,30,38,38,31,37,30,31,34,33,33,30,38,38,31,37,30,31,34,33,33,30,38,38,31,37,4.790000e+09,5.190000e+09,5.110000e+09,5.110000e+09,6.180000e+09,6.980000e+09,8.160000e+09,8.470000e+09,6.400000e+09,7.990000e+09,6.850000e+08,7.740000e+08,7.160000e+08,6.710000e+08,9.280000e+08,1.410000e+09,1.460000e+09,1.470000e+09,1.450000e+09,1.390000e+09,,Q9Z3G5


In [37]:
# Split "|"-separated protein identifiers into their second field
# (-> uniprot_col) and third field (-> protein_col).
#
# protein_col can be the same column as ProteinID_col_pept (both are
# "Protein" for FragPipe), so this cell overwrites its own input. Only rows
# that still hold an identifier with at least three fields are rewritten;
# re-running the cell is then a no-op instead of turning both columns into NaN.
id_parts = trypsin_pept[ProteinID_col_pept].str.split("|")
is_full_id = id_parts.str.len().ge(3)
if is_full_id.any():
    # .to_numpy() assigns by position, independent of index alignment.
    trypsin_pept.loc[is_full_id, uniprot_col] = id_parts[is_full_id].str[1].to_numpy()
    trypsin_pept.loc[is_full_id, protein_col] = id_parts[is_full_id].str[2].to_numpy()
trypsin_pept

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Light_T1 Spectral Count,Dense_Light_T2 Spectral Count,Dense_Light_T3 Spectral Count,Dense_Light_T4 Spectral Count,Dense_Light_T5 Spectral Count,Dilute_Light_T1 Spectral Count,Dilute_Light_T2 Spectral Count,Dilute_Light_T3 Spectral Count,Dilute_Light_T4 Spectral Count,Dilute_Light_T5 Spectral Count,Dense_Light_T1 Intensity,Dense_Light_T2 Intensity,Dense_Light_T3 Intensity,Dense_Light_T4 Intensity,Dense_Light_T5 Intensity,Dilute_Light_T1 Intensity,Dilute_Light_T2 Intensity,Dilute_Light_T3 Intensity,Dilute_Light_T4 Intensity,Dilute_Light_T5 Intensity,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dilute_Light_1,Dilute_Light_2,Dilute_Light_3,Dilute_Light_4,Dilute_Light_5,Dense_Light_T1 Match Type,Dense_Light_T2 Match Type,Dense_Light_T3 Match Type,Dense_Light_T4 Match Type,Dense_Light_T5 Match Type,Dilute_Light_T1 Match Type,Dilute_Light_T2 Match Type,Dilute_Light_T3 Match Type,Dilute_Light_T4 Match Type,Dilute_Light_T5 Match Type,UniProt
0,AAAADALGALR,Q,F,85,95,11,2,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q9Z3G5
1,AAAADLRPAR,K,F,186,195,10,23,Q31N03_SYNE7,Q31N03,Q31N03_SYNE7,Synpcc7942_1536,Probable amidotransferase,,,2,2,2,2,2,2,2,2,3,2,520000000.0,462000000.0,498000000.0,540000000.0,181000000.0,235000000.0,273000000.0,281000000.0,237000000.0,287000000.0,389000000.0,354000000.0,370000000.0,399000000.0,133000000.0,186000000.0,216000000.0,217000000.0,188000000.0,214000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31N03
2,AAAAEAPAALEASSDNPEPETSETPS,K,-,229,254,26,23,THF1_SYNE7,Q31MY4,THF1_SYNE7,thf1,Protein Thf1,,,2,2,2,2,2,2,2,2,1,3,715000000.0,756000000.0,550000000.0,802000000.0,498000000.0,484000000.0,474000000.0,431000000.0,398000000.0,431000000.0,651000000.0,585000000.0,462000000.0,687000000.0,464000000.0,492000000.0,466000000.0,386000000.0,398000000.0,468000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31MY4
3,AAAAEAVSQFAQR,K,R,188,200,13,2,Q31P68_SYNE7,Q31P68,Q31P68_SYNE7,Synpcc7942_1121,non-specific serine/threonine protein kinase,,,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,11900000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,Q31P68
4,AAAAEAVSQFAQRR,K,Q,188,201,14,23,Q31P68_SYNE7,Q31P68,Q31P68_SYNE7,Synpcc7942_1121,non-specific serine/threonine protein kinase,,,1,2,1,2,0,0,1,1,0,1,36300000.0,36200000.0,35600000.0,53600000.0,0.0,0.0,14600000.0,17100000.0,0.0,20300000.0,35000000.0,28800000.0,34400000.0,40800000.0,0.0,0.0,14100000.0,16500000.0,0.0,19600000.0,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,unmatched,MS/MS,MS/MS,unmatched,MS/MS,Q31P68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38824,YYTPDYTPKDTDLLAAFR,T,F,21,38,18,23,RBL_SYNE7,Q31NB3,RBL_SYNE7,cbbL,Ribulose bisphosphate carboxylase large chain,,,1,1,1,1,1,1,1,1,0,1,56900000.0,102000000.0,101000000.0,74700000.0,184000000.0,75400000.0,91400000.0,66200000.0,0.0,87200000.0,56900000.0,102000000.0,101000000.0,74700000.0,176000000.0,75400000.0,91400000.0,66200000.0,0.0,87200000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,Q31NB3
38825,YYVATDNLQSPR,K,Y,424,435,12,2,Q31R67_SYNE7,Q31R67,Q31R67_SYNE7,Synpcc7942_0420,PGM1 C-terminal domain-containing protein,,,1,1,1,1,1,1,1,1,1,1,279000000.0,210000000.0,188000000.0,231000000.0,190000000.0,174000000.0,0.0,0.0,161000000.0,0.0,279000000.0,210000000.0,188000000.0,231000000.0,190000000.0,174000000.0,0.0,0.0,161000000.0,0.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R67
38826,YYVDSIEGR,Y,W,152,160,9,2,Q31L83_SYNE7,Q31L83,Q31L83_SYNE7,Synpcc7942_2156,Glutamine synthetase,,,0,1,1,1,1,0,1,0,1,1,0.0,0.0,0.0,23000000.0,40200000.0,0.0,27400000.0,0.0,23000000.0,0.0,0.0,0.0,0.0,23000000.0,40200000.0,0.0,27400000.0,0.0,23000000.0,0.0,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MS/MS,Q31L83
38827,YYVEKGREDQGLR,R,T,233,245,13,3,Q31P53_SYNE7,Q31P53,Q31P53_SYNE7,Synpcc7942_1136,Aminopeptidase N,,,1,0,1,1,0,0,0,0,0,0,70000000.0,0.0,52100000.0,65400000.0,0.0,0.0,0.0,0.0,0.0,0.0,70000000.0,0.0,52100000.0,65400000.0,0.0,0.0,0.0,0.0,0.0,0.0,MS/MS,unmatched,MS/MS,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31P53


In [38]:
# Split "|"-separated protein identifiers into their second field
# (-> uniprot_col) and third field (-> protein_col).
#
# protein_col can be the same column as ProteinID_col_pept (both are
# "Protein" for FragPipe), so this cell overwrites its own input. Only rows
# that still hold an identifier with at least three fields are rewritten;
# re-running the cell is then a no-op instead of turning both columns into NaN.
id_parts = double_pept[ProteinID_col_pept].str.split("|")
is_full_id = id_parts.str.len().ge(3)
if is_full_id.any():
    # .to_numpy() assigns by position, independent of index alignment.
    double_pept.loc[is_full_id, uniprot_col] = id_parts[is_full_id].str[1].to_numpy()
    double_pept.loc[is_full_id, protein_col] = id_parts[is_full_id].str[2].to_numpy()
double_pept

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Light_DD1 Spectral Count,Dense_Light_DD2 Spectral Count,Dense_Light_DD3 Spectral Count,Dense_Light_DD4 Spectral Count,Dense_Light_DD5 Spectral Count,Dilute_Light_DD1 Spectral Count,Dilute_Light_DD2 Spectral Count,Dilute_Light_DD3 Spectral Count,Dilute_Light_DD4 Spectral Count,Dilute_Light_DD5 Spectral Count,Dense_Light_DD1 Intensity,Dense_Light_DD2 Intensity,Dense_Light_DD3 Intensity,Dense_Light_DD4 Intensity,Dense_Light_DD5 Intensity,Dilute_Light_DD1 Intensity,Dilute_Light_DD2 Intensity,Dilute_Light_DD3 Intensity,Dilute_Light_DD4 Intensity,Dilute_Light_DD5 Intensity,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dilute_Light_1,Dilute_Light_2,Dilute_Light_3,Dilute_Light_4,Dilute_Light_5,Dense_Light_DD1 Match Type,Dense_Light_DD2 Match Type,Dense_Light_DD3 Match Type,Dense_Light_DD4 Match Type,Dense_Light_DD5 Match Type,Dilute_Light_DD1 Match Type,Dilute_Light_DD2 Match Type,Dilute_Light_DD3 Match Type,Dilute_Light_DD4 Match Type,Dilute_Light_DD5 Match Type,UniProt
0,AAAAAAGGFGR,L,V,79,89,11,2,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,1,1,0,0,1,1,1,0,1,1,47300000.0,63600000.0,52100000.0,60000000.0,56200000.0,37600000.0,41300000.0,0.0,36800000.0,35100000.0,47300000.0,63600000.0,52100000.0,60000000.0,56200000.0,37600000.0,41300000.0,0.0,36800000.0,35100000.0,MS/MS,MS/MS,MBR,MBR,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,Q31R01
1,AAAAALETLAPAATQLQSAIAETLNRQAHSDEERAVQARSRLAER,Y,R,157,201,45,6,CPCF_SYNE7,Q44116,CPCF_SYNE7,cpcF,Phycobilisome maturation protein,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q44116
2,AAAADALGALR,Q,F,85,95,11,2,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,Q9Z3G5
3,AAAADLRPAR,K,F,186,195,10,23,Q31N03_SYNE7,Q31N03,Q31N03_SYNE7,Synpcc7942_1536,Probable amidotransferase,,,2,2,2,2,1,1,1,2,2,2,174000000.0,173000000.0,189000000.0,231000000.0,62400000.0,91400000.0,116000000.0,143000000.0,139000000.0,135000000.0,125000000.0,123000000.0,134000000.0,161000000.0,62400000.0,91400000.0,83400000.0,104000000.0,101000000.0,98400000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31N03
4,AAAAEAPAALE,K,A,229,239,11,2,THF1_SYNE7,Q31MY4,THF1_SYNE7,thf1,Protein Thf1,,,1,1,0,0,0,1,0,1,0,0,16900000.0,12300000.0,0.0,0.0,0.0,13100000.0,0.0,13400000.0,14400000.0,0.0,16900000.0,12300000.0,0.0,0.0,0.0,13100000.0,0.0,13400000.0,14400000.0,0.0,MS/MS,MS/MS,unmatched,unmatched,unmatched,MS/MS,unmatched,MS/MS,MBR,unmatched,Q31MY4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49005,YYVDSIEGRWNSGREEEGGNLGYKPR,Y,Y,152,177,26,45,Q31L83_SYNE7,Q31L83,Q31L83_SYNE7,Synpcc7942_2156,Glutamine synthetase,,,2,0,2,2,0,0,0,0,0,0,132000000.0,57500000.0,90400000.0,113000000.0,0.0,0.0,0.0,0.0,0.0,0.0,72500000.0,55700000.0,48600000.0,61900000.0,0.0,0.0,0.0,0.0,0.0,0.0,MS/MS,MBR,MS/MS,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31L83
49006,YYVEEFTR,E,V,109,116,8,2,Q31M00_SYNE7,Q31M00,Q31M00_SYNE7,Synpcc7942_1889,Uncharacterized protein,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31M00
49007,YYVNDVPHIGSAYTTIAADAIAR,L,F,12,34,23,3,Q31P47_SYNE7,Q31P47,Q31P47_SYNE7,metG,Methionine--tRNA ligase,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,19800000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19800000.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31P47
49008,YYYIGAAAAPTGNAPLQSLLK,R,I,384,404,21,23,Q31RU1_SYNE7,Q31RU1,Q31RU1_SYNE7,Synpcc7942_0196,"Beta-carotene 15,15'-dioxygenase",,,0,0,0,0,0,2,0,2,2,2,0.0,0.0,0.0,0.0,0.0,16600000.0,0.0,14200000.0,17300000.0,17800000.0,0.0,0.0,0.0,0.0,0.0,16600000.0,0.0,14200000.0,17300000.0,17800000.0,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,MS/MS,MS/MS,MS/MS,Q31RU1


In [39]:
# Attach row identifiers to each table with the project helper `generate_index`.
# Peptide tables pass the UniProt and peptide columns plus `id_separator`; the
# protein table passes the UniProt column only.
# (Later displays show peptide rows indexed as "<UniProt>@<peptide>" with a
# matching `id` column.)
double_pept = generate_index(double_pept, uniprot_col, peptide_col, id_separator)
trypsin_pept = generate_index(trypsin_pept, uniprot_col, peptide_col, id_separator)
trypsin_prot = generate_index(trypsin_prot, uniprot_col)

In [40]:
# Row/column counts after indexing: trypsin peptides, double-digest peptides, trypsin proteins.
print(*(table.shape for table in (trypsin_pept, double_pept, trypsin_prot)), sep="\n")

(38438, 56)
(48573, 56)
(1994, 67)


In [41]:
# Filter out protein groups with fewer than `min_pept_count` peptides
# (`min_pept_count` is set to 2 in the parameters cell at the top of the notebook).
# NOTE(review): the exact counting rule depends on `search_tool` inside
# `filtering_protein_based_on_peptide_number` — confirm there.
trypsin_prot = filtering_protein_based_on_peptide_number(trypsin_prot, PeptCounts_col, search_tool, min_pept_count)
print(trypsin_prot.shape)

(1896, 68)


In [42]:
# Log2-transform the intensity columns (`int_cols`) of all three tables with the
# project helper `log2_transformation`.
# NOTE(review): later displays show NaN where the raw intensity was 0 — presumably
# zeros are mapped to missing inside the helper; confirm.
double_pept = log2_transformation(double_pept, int_cols)
trypsin_pept = log2_transformation(trypsin_pept, int_cols)
trypsin_prot = log2_transformation(trypsin_prot, int_cols)

In [43]:
# Median-normalize the `int_cols` of each table separately (one call per table).
# NOTE(review): how samples are centered is defined in `median_normalization` — confirm.
double_pept = median_normalization(double_pept, int_cols)
trypsin_pept = median_normalization(trypsin_pept, int_cols)
trypsin_prot = median_normalization(trypsin_prot, int_cols)

## Analyzing the data


In [44]:
# Column inventory of the three working tables, one list per line.
print(*(table.columns.to_list() for table in (double_pept, trypsin_pept, trypsin_prot)), sep="\n")

['Peptide Sequence', 'Prev AA', 'Next AA', 'Start', 'End', 'Peptide Length', 'Charges', 'Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Mapped Genes', 'Mapped Proteins', 'Dense_Light_DD1 Spectral Count', 'Dense_Light_DD2 Spectral Count', 'Dense_Light_DD3 Spectral Count', 'Dense_Light_DD4 Spectral Count', 'Dense_Light_DD5 Spectral Count', 'Dilute_Light_DD1 Spectral Count', 'Dilute_Light_DD2 Spectral Count', 'Dilute_Light_DD3 Spectral Count', 'Dilute_Light_DD4 Spectral Count', 'Dilute_Light_DD5 Spectral Count', 'Dense_Light_DD1 Intensity', 'Dense_Light_DD2 Intensity', 'Dense_Light_DD3 Intensity', 'Dense_Light_DD4 Intensity', 'Dense_Light_DD5 Intensity', 'Dilute_Light_DD1 Intensity', 'Dilute_Light_DD2 Intensity', 'Dilute_Light_DD3 Intensity', 'Dilute_Light_DD4 Intensity', 'Dilute_Light_DD5 Intensity', 'Dense_Light_1', 'Dense_Light_2', 'Dense_Light_3', 'Dense_Light_4', 'Dense_Light_5', 'Dilute_Light_1', 'Dilute_Light_2', 'Dilute_Light_3', 'Dilute_Light_4', 'Dilute_Li

In [45]:
# Filtering by missingness — currently DISABLED for the protein table: the call
# below is commented out, so the printed shape is simply the current table size.
## NOTE: the missingness filtering of protein data is not the same as that of the peptide data
# trypsin_prot = filter_missingness(trypsin_prot, groups, group_cols, missing_thr)
print(trypsin_prot.shape)

(1896, 68)


In [46]:
# Annotate (not filter) each table with missingness information per group.
# Later displays show a `Total missingness` column plus one
# `<group> missingness` column per entry in `groups`.
trypsin_prot = check_missingness(trypsin_prot, groups, group_cols)
trypsin_pept = check_missingness(trypsin_pept, groups, group_cols)
double_pept = check_missingness(double_pept, groups, group_cols)

In [47]:
# ANOVA on the protein table is only run when there are more than two groups:
# first without, then with `anova_factors`, each call feeding the previous result.
# With two groups or fewer the table is left unchanged.
if len(groups) > 2:
    trypsin_prot = anova(trypsin_prot, anova_cols, metadata)
    trypsin_prot = anova(trypsin_prot, anova_cols, metadata, anova_factors)
print(trypsin_prot.shape)

(1896, 71)


In [48]:
# Pairwise t-tests on the protein table, once for the group comparisons and once
# for the dose comparisons; each call returns the table that is carried forward.
# NOTE(review): the peptide tables later show "<A>/<B>", "<A>/<B>_pval" and
# "<A>/<B>_adj-p" columns per comparison — presumably the same columns are added
# here; confirm in `pairwise_ttest`.
trypsin_prot = pairwise_ttest(trypsin_prot, pairwise_ttest_groups)
trypsin_prot = pairwise_ttest(trypsin_prot, dose_pairwise_ttest_groups)
print(trypsin_prot.shape)

  return f(*args, **kwargs)
  svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
  return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)


(1896, 74)


In [49]:
# trypsin_prot = calculate_all_pairwise_scalars(trypsin_prot, pairwise_ttest_groups, sig_type, 1)
# trypsin_prot = calculate_all_pairwise_scalars(trypsin_prot, dose_pairwise_ttest_groups, sig_type, 1)
# print(trypsin_prot.shape)

In [50]:
# Compute per-comparison scalars for the protein table, using the significance
# type (`sig_type`) and threshold (`sig_thr`) from the parameters cell.
# NOTE(review): what exactly is stored is defined in
# `calculate_all_pairwise_scalars` — confirm there.
trypsin_prot = calculate_all_pairwise_scalars(trypsin_prot, pairwise_ttest_groups, sig_type, sig_thr)

In [51]:
#trypsin_prot.to_csv(f"{result_dir}/{sample}_lip_trypsin_prot_processed.tsv", sep='\t')
#trypsin_prot.to_pickle(f"{result_dir}/{sample}_lip_trypsin_prot_processed.pkl")

In [52]:
# Correct both peptide tables for protein-abundance changes, using the
# trypsin-only protein table (`trypsin_prot`) as the reference.
# NOTE(review): the `_sig_only` variant presumably corrects only proteins whose
# change passes `sig_thr` on `sig_type` — confirm in the helper. Later displays of
# `trypsin_pept` contain columns added after this cell, which suggests the helper
# may return the same object rather than a copy — confirm.
double_pept_w = prot_abund_correction_sig_only(double_pept, trypsin_prot, pairwise_ttest_groups, uniprot_col, sig_type, sig_thr)
trypsin_pept_w = prot_abund_correction_sig_only(trypsin_pept, trypsin_prot, pairwise_ttest_groups, uniprot_col, sig_type, sig_thr)
# Disabled alternatives kept for reference: unconditional correction ...
#double_pept_w = prot_abund_correction(double_pept, trypsin_prot, int_cols, uniprot_col)
#trypsin_pept_w = prot_abund_correction(trypsin_pept, trypsin_prot, int_cols, uniprot_col)
# ... and missingness filtering of the corrected peptide tables.
# double_pept_w = filter_missingness(double_pept_w, groups, group_cols, missing_thr)
# trypsin_pept_w = filter_missingness(trypsin_pept_w, groups, group_cols, missing_thr)

In [53]:
# ANOVA on the abundance-corrected peptide tables, only when there are more than
# two groups: first without, then with `anova_factors`. With two groups or fewer
# both tables are left unchanged and only their shapes are printed.
if len(groups) > 2:
    double_pept_w = anova(double_pept_w, anova_cols, metadata)
    trypsin_pept_w = anova(trypsin_pept_w, anova_cols, metadata)
    double_pept_w = anova(double_pept_w, anova_cols, metadata, anova_factors)
    trypsin_pept_w = anova(trypsin_pept_w, anova_cols, metadata, anova_factors)
print(double_pept_w.shape)
print(trypsin_pept_w.shape)

(48573, 59)
(38438, 59)


In [54]:
# Pairwise t-tests on the abundance-corrected peptide tables for the group comparisons.
double_pept_w = pairwise_ttest(double_pept_w, pairwise_ttest_groups)
trypsin_pept_w = pairwise_ttest(trypsin_pept_w, pairwise_ttest_groups)

In [55]:
# Same pairwise t-tests, now for the dose comparisons (`dose_pairwise_ttest_groups`).
double_pept_w = pairwise_ttest(double_pept_w, dose_pairwise_ttest_groups)
trypsin_pept_w = pairwise_ttest(trypsin_pept_w, dose_pairwise_ttest_groups)

  return f(*args, **kwargs)
  svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
  return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)
  return f(*args, **kwargs)
  svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df


In [56]:
# Column inventory after the statistical tests.
# NOTE(review): the second line prints `trypsin_pept`, not the corrected
# `trypsin_pept_w` used in the cells just before — confirm which table is intended.
print(double_pept_w.columns.to_list())
print(trypsin_pept.columns.to_list())
print(trypsin_prot.columns.to_list())

['Peptide Sequence', 'Prev AA', 'Next AA', 'Start', 'End', 'Peptide Length', 'Charges', 'Protein', 'Protein ID', 'Entry Name', 'Gene', 'Protein Description', 'Mapped Genes', 'Mapped Proteins', 'Dense_Light_DD1 Spectral Count', 'Dense_Light_DD2 Spectral Count', 'Dense_Light_DD3 Spectral Count', 'Dense_Light_DD4 Spectral Count', 'Dense_Light_DD5 Spectral Count', 'Dilute_Light_DD1 Spectral Count', 'Dilute_Light_DD2 Spectral Count', 'Dilute_Light_DD3 Spectral Count', 'Dilute_Light_DD4 Spectral Count', 'Dilute_Light_DD5 Spectral Count', 'Dense_Light_DD1 Intensity', 'Dense_Light_DD2 Intensity', 'Dense_Light_DD3 Intensity', 'Dense_Light_DD4 Intensity', 'Dense_Light_DD5 Intensity', 'Dilute_Light_DD1 Intensity', 'Dilute_Light_DD2 Intensity', 'Dilute_Light_DD3 Intensity', 'Dilute_Light_DD4 Intensity', 'Dilute_Light_DD5 Intensity', 'Dense_Light_1', 'Dense_Light_2', 'Dense_Light_3', 'Dense_Light_4', 'Dense_Light_5', 'Dilute_Light_1', 'Dilute_Light_2', 'Dilute_Light_3', 'Dilute_Light_4', 'Dilute_Li

In [57]:
# Display the trypsin-only peptide table (rows are indexed by "<UniProt>@<peptide>").
trypsin_pept

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Light_T1 Spectral Count,Dense_Light_T2 Spectral Count,Dense_Light_T3 Spectral Count,Dense_Light_T4 Spectral Count,Dense_Light_T5 Spectral Count,Dilute_Light_T1 Spectral Count,Dilute_Light_T2 Spectral Count,Dilute_Light_T3 Spectral Count,Dilute_Light_T4 Spectral Count,Dilute_Light_T5 Spectral Count,Dense_Light_T1 Intensity,Dense_Light_T2 Intensity,Dense_Light_T3 Intensity,Dense_Light_T4 Intensity,Dense_Light_T5 Intensity,Dilute_Light_T1 Intensity,Dilute_Light_T2 Intensity,Dilute_Light_T3 Intensity,Dilute_Light_T4 Intensity,Dilute_Light_T5 Intensity,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dilute_Light_1,Dilute_Light_2,Dilute_Light_3,Dilute_Light_4,Dilute_Light_5,Dense_Light_T1 Match Type,Dense_Light_T2 Match Type,Dense_Light_T3 Match Type,Dense_Light_T4 Match Type,Dense_Light_T5 Match Type,Dilute_Light_T1 Match Type,Dilute_Light_T2 Match Type,Dilute_Light_T3 Match Type,Dilute_Light_T4 Match Type,Dilute_Light_T5 Match Type,UniProt,id,Total missingness,Dense_Light missingness,Dilute_Light missingness,Dilute_Light/Dense_Light,Dilute_Light/Dense_Light_pval,Dilute_Light/Dense_Light_adj-p
Q9Z3G5@AAAADALGALR,AAAADALGALR,Q,F,85,95,11,2,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q9Z3G5,Q9Z3G5@AAAADALGALR,10,5,5,0.000000,,
Q31N03@AAAADLRPAR,AAAADLRPAR,K,F,186,195,10,23,Q31N03_SYNE7,Q31N03,Q31N03_SYNE7,Synpcc7942_1536,Probable amidotransferase,,,2,2,2,2,2,2,2,2,3,2,520000000.0,462000000.0,498000000.0,540000000.0,181000000.0,235000000.0,273000000.0,281000000.0,237000000.0,287000000.0,28.476428,28.374841,28.459678,28.568542,27.056092,27.503256,27.711753,27.648028,27.497099,27.669764,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31N03,Q31N03@AAAADLRPAR,0,0,0,-0.581136,0.078209,0.255283
Q31MY4@AAAAEAPAALEASSDNPEPETSETPS,AAAAEAPAALEASSDNPEPETSETPS,K,-,229,254,26,23,THF1_SYNE7,Q31MY4,THF1_SYNE7,thf1,Protein Thf1,,,2,2,2,2,2,2,2,2,1,3,715000000.0,756000000.0,550000000.0,802000000.0,498000000.0,484000000.0,474000000.0,431000000.0,398000000.0,431000000.0,29.219315,29.099528,28.780046,29.352463,28.858790,28.906612,28.821052,28.478933,28.579135,28.798661,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31MY4,Q31MY4@AAAAEAPAALEASSDNPEPETSETPS,0,0,0,-0.345150,0.033076,0.124670
Q31P68@AAAAEAVSQFAQR,AAAAEAVSQFAQR,K,R,188,200,13,2,Q31P68_SYNE7,Q31P68,Q31P68_SYNE7,Synpcc7942_1121,non-specific serine/threonine protein kinase,,,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,11900000.0,0.0,0.0,0.0,0.0,,,,,,,,,,,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,Q31P68,Q31P68@AAAAEAVSQFAQR,10,5,5,0.000000,,
Q31P68@AAAAEAVSQFAQRR,AAAAEAVSQFAQRR,K,Q,188,201,14,23,Q31P68_SYNE7,Q31P68,Q31P68_SYNE7,Synpcc7942_1121,non-specific serine/threonine protein kinase,,,1,2,1,2,0,0,1,1,0,1,36300000.0,36200000.0,35600000.0,53600000.0,0.0,0.0,14600000.0,17100000.0,0.0,20300000.0,25.002085,24.755232,25.032633,25.278794,,,23.774489,23.930871,,24.221078,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,unmatched,MS/MS,MS/MS,unmatched,MS/MS,Q31P68,Q31P68@AAAAEAVSQFAQRR,3,1,2,-1.041707,0.001568,0.010155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q31NB3@YYTPDYTPKDTDLLAAFR,YYTPDYTPKDTDLLAAFR,T,F,21,38,18,23,RBL_SYNE7,Q31NB3,RBL_SYNE7,cbbL,Ribulose bisphosphate carboxylase large chain,,,1,1,1,1,1,1,1,1,0,1,56900000.0,102000000.0,101000000.0,74700000.0,184000000.0,75400000.0,91400000.0,66200000.0,0.0,87200000.0,25.703158,26.579660,26.586508,26.151333,27.460241,26.200590,26.470988,25.935236,,26.374553,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,Q31NB3,Q31NB3@YYTPDYTPKDTDLLAAFR,1,0,1,-0.250838,0.491526,1.000000
Q31R67@YYVATDNLQSPR,YYVATDNLQSPR,K,Y,424,435,12,2,Q31R67_SYNE7,Q31R67,Q31R67_SYNE7,Synpcc7942_0420,PGM1 C-terminal domain-containing protein,,,1,1,1,1,1,1,1,1,1,1,279000000.0,210000000.0,188000000.0,231000000.0,190000000.0,174000000.0,0.0,0.0,161000000.0,0.0,27.996923,27.621481,27.482885,27.780046,27.570665,27.407041,,,27.273427,,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R67,Q31R67@YYVATDNLQSPR,3,0,3,-0.350166,0.074316,0.244610
Q31L83@YYVDSIEGR,YYVDSIEGR,Y,W,152,160,9,2,Q31L83_SYNE7,Q31L83,Q31L83_SYNE7,Synpcc7942_2156,Glutamine synthetase,,,0,1,1,1,1,0,1,0,1,1,0.0,0.0,0.0,23000000.0,40200000.0,0.0,27400000.0,0.0,23000000.0,0.0,,,,24.451859,25.329933,,24.732970,,24.466072,,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MS/MS,Q31L83,Q31L83@YYVDSIEGR,6,3,3,-0.291375,0.590394,1.000000
Q31P53@YYVEKGREDQGLR,YYVEKGREDQGLR,R,T,233,245,13,3,Q31P53_SYNE7,Q31P53,Q31P53_SYNE7,Synpcc7942_1136,Aminopeptidase N,,,1,0,1,1,0,0,0,0,0,0,70000000.0,0.0,52100000.0,65400000.0,0.0,0.0,0.0,0.0,0.0,0.0,26.002085,,25.631508,25.959515,,,,,,,MS/MS,unmatched,MS/MS,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31P53,Q31P53@YYVEKGREDQGLR,7,2,5,0.000000,,


#### Rolling up to the single-site level using the full peptide


### LiP Rollup Function

In [58]:
# Rollup to site level, NB: this is for individual proteins, because the protein sequence is needed
# This function is to roll up the LiP pept data to the site level with median values
def _site_aggregator(rollup_func, multiply_rollup_counts=True, ignore_NA=True):
    """Build the callable that collapses one site's log2 intensities to one value.

    Raises:
        ValueError: if ``rollup_func`` is not "median", "mean" or "sum"
            (case-insensitive).
    """
    method = rollup_func.lower()
    if method not in ("median", "mean", "sum"):
        raise ValueError("The rollup function is not recognized. Please choose from the following: median, mean, sum")

    if method == "sum":
        def aggregate_sum(values):
            # Sum on the linear scale, then go back to log2. Zeros are treated as
            # missing; min_count=1 keeps an all-missing site as NaN instead of
            # log2(0) = -inf.
            linear = 2 ** values.replace(0, np.nan)
            return np.log2(linear.sum(min_count=1))
        return aggregate_sum

    def center(values):
        return values.median() if method == "median" else values.mean()

    if not multiply_rollup_counts:
        return center

    def n_peptides(values):
        # ignore_NA=True counts every peptide covering the site (missing or not);
        # ignore_NA=False counts only peptides with a non-missing value.
        # NOTE(review): the flag name reads inverted relative to this behavior,
        # which is kept exactly as in the original implementation — confirm intent.
        return len(values) if ignore_NA else values.notna().sum()

    def aggregate_scaled(values):
        # Adding log2(n) on the log scale multiplies the central value by n.
        return np.log2(n_peptides(values)) + center(values)
    return aggregate_scaled


def LiP_rollup_to_site(pept, int_cols, sequence, uniprot_col, residue_col="Residue", uniprot_id="Protein ID (provided by user)", peptide_col="Sequence", clean_pept_col="clean_pept", id_col="id", id_separator="@", pept_type_col="pept_type", site_col="Site", pos_col="Pos", multiply_rollup_counts=True, ignore_NA=True, rollup_func="median"):
    """Roll the peptides of ONE protein up to residue (site) level.

    Every peptide is expanded into one row per residue it covers; rows are then
    grouped by ``<uniprot_id><sep><residue+position><sep><peptide type>`` and the
    intensity columns are aggregated per group.

    Args:
        pept: Peptide table for a single protein. Must contain ``peptide_col``,
            ``pept_type_col`` and the ``int_cols``. If ``clean_pept_col`` is
            missing, ``get_tryptic_types`` is called first.
        int_cols: Intensity columns to aggregate. They are handled as log2 values
            (counts are added as ``log2(n)``; "sum" exponentiates with base 2).
        sequence: Protein sequence; only ``sequence.find(peptide)`` is used.
            Positions are 1-based and taken from the FIRST occurrence of the
            peptide. NOTE(review): a peptide that is not found yields ``find() ==
            -1``, so its residues are numbered from 0 — confirm upstream filtering.
        uniprot_col: Output column that is filled with ``uniprot_id``.
        residue_col: Column holding the per-residue labels (e.g. "A2").
        uniprot_id: Protein accession used as the prefix of every site id.
        peptide_col: Peptide sequence column; contributing peptides are joined
            with "; " in the output.
        clean_pept_col: Column with the cleaned peptide sequence.
        id_col: Column (and output index) holding the site id.
        id_separator: Separator between the parts of the site id.
        pept_type_col: Column with the peptide type (e.g. "Tryptic").
        site_col: Output column with the residue label.
        pos_col: Output column with the integer residue position.
        multiply_rollup_counts: If True, add ``log2(number of peptides)`` to the
            median/mean. Not used for "sum".
        ignore_NA: Selects how peptides are counted when
            ``multiply_rollup_counts`` is True (all rows vs. non-missing rows).
        rollup_func: "median", "mean" or "sum" (case-insensitive).

    Returns:
        DataFrame with one row per site/peptide-type, sorted by ``pos_col`` and
        indexed by the site id. Non-intensity columns keep the value of the first
        contributing peptide.

    Raises:
        ValueError: if ``rollup_func`` is not recognized, or if ``pept`` is empty.
    """
    # Validate up front: the original built the ValueError without raising it,
    # which only failed later with an unbound-variable error.
    site_agg = _site_aggregator(rollup_func, multiply_rollup_counts, ignore_NA)

    if clean_pept_col not in pept.columns.to_list():
        pept = get_tryptic_types(pept, sequence, peptide_col, clean_pept_col)
    if pept.shape[0] == 0:
        raise ValueError("The pept dataframe is empty. Please check the input dataframe.")

    pept = get_clean_peptides(pept, peptide_col, clean_pept_col)
    # One label per covered residue: amino-acid letter + 1-based protein position.
    pept[residue_col] = [
        [res + str(sequence.find(clean_pept) + i + 1) for i, res in enumerate(clean_pept)]
        for clean_pept in pept[clean_pept_col]
    ]
    # Everything that is not an intensity column is carried along as annotation.
    info_cols = [col for col in pept.columns if col not in int_cols]
    pept = pept.explode(residue_col)
    pept[id_col] = uniprot_id + id_separator + pept[residue_col] + id_separator + pept[pept_type_col]

    # Lisa Bramer and Kelly Straton suggested aggregating log2-scale values with
    # the median rather than summing linear-scale intensities (hence the default).
    agg_methods = {peptide_col: lambda x: '; '.join(x)}
    agg_methods.update({col: (lambda x: x.iloc[0]) for col in info_cols if col != peptide_col})
    agg_methods.update({col: site_agg for col in int_cols})

    pept_grouped = pept.groupby(id_col, as_index=False).agg(agg_methods)
    pept_grouped[uniprot_col] = uniprot_id
    pept_grouped[site_col] = [site.split(id_separator)[1] for site in pept_grouped[id_col]]
    pept_grouped[pos_col] = [int(re.sub(r"\D", "", site)) for site in pept_grouped[site_col]]
    pept_grouped = pept_grouped.sort_values(by=[pos_col])
    pept_grouped[pept_type_col] = [site.split(id_separator)[-1] for site in pept_grouped[id_col]]
    pept_grouped.index = pept_grouped[id_col].to_list()
    return pept_grouped


In [59]:
# Show the experimental groups; the proteome loop below switches on len(groups) > 2.
groups

['Dense_Light', 'Dilute_Light']

In [60]:
## Now apply these functions to the whole proteome
double_pept_t = []
double_pept_p = []
for uniprot_id in double_pept_w[uniprot_col].unique():
    pept_df = double_pept_w[double_pept_w[uniprot_col] == uniprot_id].copy()
    uniprot_seq = [prot_seq for prot_seq in prot_seqs if uniprot_id in prot_seq.id]
    if len(uniprot_seq) < 1:
        Warning(f"Protein {uniprot_id} not found in the fasta file. Skipping the protein.")
        continue
    elif len(uniprot_seq) > 1:
        Warning(f"Multiple proteins with the same ID {uniprot_id} found in the fasta file. Using the first one.")
    bio_seq = uniprot_seq[0]
    prot_seq = bio_seq.seq
    prot_desc = bio_seq.description
    pept_df = analyze_tryptic_pattern(pept_df, prot_seq, pairwise_ttest_groups, groups, description = prot_desc, keep_non_tryptic = True, peptide_col=peptide_col)
    double_pept_t.append(pept_df)
    pept_df_r = LiP_rollup_to_site(pept_df, int_cols, prot_seq, uniprot_col, uniprot_id = uniprot_id, peptide_col=peptide_col, rollup_func="median")
    if len(groups) > 2:
        pept_df_a = anova(pept_df_r, anova_cols, metadata)
        pept_df_a = anova(pept_df_r, anova_cols, metadata, anova_factors)
    else:
        pept_df_a = pept_df_r.copy()
    pept_df_p = pairwise_ttest(pept_df_a, pairwise_ttest_groups)
    double_pept_p.append(pept_df_p)
double_pept_t = pd.concat(double_pept_t).copy()
double_pept_p = pd.concat(double_pept_p).copy()

In [61]:
# Display the site-level table: one row per "<UniProt>@<residue+position>@<peptide type>".
double_pept_p

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Light_DD1 Spectral Count,Dense_Light_DD2 Spectral Count,Dense_Light_DD3 Spectral Count,Dense_Light_DD4 Spectral Count,Dense_Light_DD5 Spectral Count,Dilute_Light_DD1 Spectral Count,Dilute_Light_DD2 Spectral Count,Dilute_Light_DD3 Spectral Count,Dilute_Light_DD4 Spectral Count,Dilute_Light_DD5 Spectral Count,Dense_Light_DD1 Intensity,Dense_Light_DD2 Intensity,Dense_Light_DD3 Intensity,Dense_Light_DD4 Intensity,Dense_Light_DD5 Intensity,Dilute_Light_DD1 Intensity,Dilute_Light_DD2 Intensity,Dilute_Light_DD3 Intensity,Dilute_Light_DD4 Intensity,Dilute_Light_DD5 Intensity,Dense_Light_DD1 Match Type,Dense_Light_DD2 Match Type,Dense_Light_DD3 Match Type,Dense_Light_DD4 Match Type,Dense_Light_DD5 Match Type,Dilute_Light_DD1 Match Type,Dilute_Light_DD2 Match Type,Dilute_Light_DD3 Match Type,Dilute_Light_DD4 Match Type,Dilute_Light_DD5 Match Type,UniProt,id,Total missingness,Dense_Light missingness,Dilute_Light missingness,Dilute_Light/Dense_Light,Dilute_Light/Dense_Light_pval,Dilute_Light/Dense_Light_adj-p,Protein description,Protein length,clean_pept,pept_start,pept_end,pept_type,Tryp Pept num,Semi Pept num,pept_id,lytic_group,Residue,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dilute_Light_1,Dilute_Light_2,Dilute_Light_3,Dilute_Light_4,Dilute_Light_5,Site,Pos
Q31R01@A2@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@A2@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,A2,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,A2,2
Q31R01@D3@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@D3@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,D3,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,D3,3
Q31R01@L4@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@L4@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,L4,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,L4,4
Q31R01@L5@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@L5@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,L5,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,L5,5
Q31R01@H6@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@H6@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,H6,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,H6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q31NL8@A229@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@A229@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,A229,,,,,,,,,,,A229,229
Q31NL8@Q230@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@Q230@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,Q230,,,,,,,,,,,Q230,230
Q31NL8@Q231@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@Q231@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,Q231,,,,,,,,,,,Q231,231
Q31NL8@V232@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@V232@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,V232,,,,,,,,,,,V232,232


In [62]:
# Example protein for spot checks: accession, its amino-acid sequence, and the
# comparison column name ("<numerator group>/<denominator group>").
test_uniprot_id = "O33698"
test_seq = "MSPSAANTPSYDDFALALEAQSLDSQKGQLVRGKVCEYSTDGAYIDIGGKAPAFLPKREAALHAVLDLEAHLPKDEELEFLVIRDQNEDGQVTVSLRALALEQAWTRVAELQEGGQTVQVKVTGSNKGGVTADLEGLRAFIPRSHLNEKEDLDSLKGKTLTVAFLEVNRADKKLVLSERQAARTALVREIEVGQLINGKVTGLKPFGVFVDLGGATALLPINQISQKFVADVGAIFKIGDPIQALVVAIDNTKGRISLSTKVLENHPGEILENVAELQASAADRAERARKQLESQ"
test_comparison = "Dilute_Light/Dense_Light"

In [None]:
#test_uniprot_id = "Q03513"
#test_seq = "MPSPTTVPVATAGRLAEPYIDPAAQVHAIASIIGDVRIAAGVRVAAGVSIRADEGAPFQVGKESILQEGAVIHGLEYGRVLGDDQADYSVWIGQRVAITHKALIHGPAYLGDDCFVGFRSTVFNARVGAGSVIMMHALVQDVEIPPGRYVPSGAIITTQQQADRLPEVRPEDREFARHIIGSPPVIVRSTPAATADFHSTPTPSPLRPSSSEATTVSAYNGQGRLSSEVITQVRSLLNQGYRIGTEHADKRRFRTSSWQPCAPIQSTNERQVLSELENCLSEHEGEYVRLLGIDTNTRSRVFEALIQRPDGSVPESLGSQPVAVASGGGRQSSYASVSGNLSAEVVNKVRNLLAQGYRIGTEHADKRRFRTSSWQSCAPIQSSNERQVLAELENCLSEHEGEYVRLLGIDTASRSRVFEALIQDPQGPVGSAKAAAAPVSSATPSSHSYTSNGSSSSDVAGQVRGLLAQGYRISAEVADKRRFQTSSWQSLPALSGQSEATVLPALESILQEHKGKYVRLIGIDPAARRRVAELLIQKP"
#test_comparison = "Dilute_Light/Dense_Light"

In [63]:
# Independent copy of the rows of `double_pept_p` that belong to the test protein.
is_test_protein = double_pept_p["Protein ID"] == test_uniprot_id
trip_test = double_pept_p[is_test_protein].copy()

In [64]:
# Show the full `double_pept_p` table that `trip_test` was filtered from.
double_pept_p

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Light_DD1 Spectral Count,Dense_Light_DD2 Spectral Count,Dense_Light_DD3 Spectral Count,Dense_Light_DD4 Spectral Count,Dense_Light_DD5 Spectral Count,Dilute_Light_DD1 Spectral Count,Dilute_Light_DD2 Spectral Count,Dilute_Light_DD3 Spectral Count,Dilute_Light_DD4 Spectral Count,Dilute_Light_DD5 Spectral Count,Dense_Light_DD1 Intensity,Dense_Light_DD2 Intensity,Dense_Light_DD3 Intensity,Dense_Light_DD4 Intensity,Dense_Light_DD5 Intensity,Dilute_Light_DD1 Intensity,Dilute_Light_DD2 Intensity,Dilute_Light_DD3 Intensity,Dilute_Light_DD4 Intensity,Dilute_Light_DD5 Intensity,Dense_Light_DD1 Match Type,Dense_Light_DD2 Match Type,Dense_Light_DD3 Match Type,Dense_Light_DD4 Match Type,Dense_Light_DD5 Match Type,Dilute_Light_DD1 Match Type,Dilute_Light_DD2 Match Type,Dilute_Light_DD3 Match Type,Dilute_Light_DD4 Match Type,Dilute_Light_DD5 Match Type,UniProt,id,Total missingness,Dense_Light missingness,Dilute_Light missingness,Dilute_Light/Dense_Light,Dilute_Light/Dense_Light_pval,Dilute_Light/Dense_Light_adj-p,Protein description,Protein length,clean_pept,pept_start,pept_end,pept_type,Tryp Pept num,Semi Pept num,pept_id,lytic_group,Residue,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dilute_Light_1,Dilute_Light_2,Dilute_Light_3,Dilute_Light_4,Dilute_Light_5,Site,Pos
Q31R01@A2@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@A2@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,A2,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,A2,2
Q31R01@D3@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@D3@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,D3,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,D3,3
Q31R01@L4@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@L4@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,L4,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,L4,4
Q31R01@L5@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@L5@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,L5,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,L5,5
Q31R01@H6@Semi-tryptic,ADLLHQIR,M,V,2,9,8,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,2,1,2,4,2,2,2,2,2,2,204000000.0,94900000.0,189000000.0,183000000.0,155000000.0,188000000.0,180000000.0,185000000.0,204000000.0,178000000.0,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@H6@Semi-tryptic,0,0,0,0.182349,0.044427,0.159657,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ADLLHQIR,2,9,Semi-tryptic,17,10,0002-0009@ADLLHQIR,0,H6,26.72395,26.409374,26.556734,26.49773,26.308295,26.679966,26.633196,26.727404,26.768619,26.598646,H6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q31NL8@A229@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@A229@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,A229,,,,,,,,,,,A229,229
Q31NL8@Q230@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@Q230@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,Q230,,,,,,,,,,,Q230,230
Q31NL8@Q231@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@Q231@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,Q231,,,,,,,,,,,Q231,231
Q31NL8@V232@Tryptic,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@V232@Tryptic,10,5,5,0.000000,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0225-0233@YLEAAQQVR,1,V232,,,,,,,,,,,V232,232


In [73]:
# Preview only the test-protein rows whose `pept_type` is "Semi-tryptic".
trip_test.loc[trip_test["pept_type"].eq("Semi-tryptic")]

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Light_DD1 Spectral Count,Dense_Light_DD2 Spectral Count,Dense_Light_DD3 Spectral Count,Dense_Light_DD4 Spectral Count,Dense_Light_DD5 Spectral Count,Dilute_Light_DD1 Spectral Count,Dilute_Light_DD2 Spectral Count,Dilute_Light_DD3 Spectral Count,Dilute_Light_DD4 Spectral Count,Dilute_Light_DD5 Spectral Count,Dense_Light_DD1 Intensity,Dense_Light_DD2 Intensity,Dense_Light_DD3 Intensity,Dense_Light_DD4 Intensity,Dense_Light_DD5 Intensity,Dilute_Light_DD1 Intensity,Dilute_Light_DD2 Intensity,Dilute_Light_DD3 Intensity,Dilute_Light_DD4 Intensity,Dilute_Light_DD5 Intensity,Dense_Light_DD1 Match Type,Dense_Light_DD2 Match Type,Dense_Light_DD3 Match Type,Dense_Light_DD4 Match Type,Dense_Light_DD5 Match Type,Dilute_Light_DD1 Match Type,Dilute_Light_DD2 Match Type,Dilute_Light_DD3 Match Type,Dilute_Light_DD4 Match Type,Dilute_Light_DD5 Match Type,UniProt,id,Total missingness,Dense_Light missingness,Dilute_Light missingness,Dilute_Light/Dense_Light,Dilute_Light/Dense_Light_pval,Dilute_Light/Dense_Light_adj-p,Protein description,Protein length,clean_pept,pept_start,pept_end,pept_type,Tryp Pept num,Semi Pept num,pept_id,lytic_group,Residue,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dilute_Light_1,Dilute_Light_2,Dilute_Light_3,Dilute_Light_4,Dilute_Light_5,Site,Pos
O33698@P3@Semi-tryptic,PSAANTPSYDDFALALEAQSLDSQKGQLVR,S,G,3,32,30,34,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,0,2,0,1,3,1,2,2,2,0.0,31500000.0,0.0,32700000.0,35300000.0,66200000.0,65300000.0,18700000.0,93400000.0,84700000.0,unmatched,MBR,MS/MS,MBR,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,O33698,O33698@P3@Semi-tryptic,2,2,0,0.812605,0.151947,0.447347,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,PSAANTPSYDDFALALEAQSLDSQKGQLVR,3,32,Semi-tryptic,24,41,0003-0032@PSAANTPSYDDFALALEAQSLDSQKGQLVR,0,P3,,24.816799,,24.870737,25.032039,25.797149,26.027034,24.279945,26.306447,26.185075,P3,3
O33698@S4@Semi-tryptic,PSAANTPSYDDFALALEAQSLDSQKGQLVR; SAANTPSYDDFALA...,S,G,3,32,30,34,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,0,2,0,1,3,1,2,2,2,0.0,31500000.0,0.0,32700000.0,35300000.0,66200000.0,65300000.0,18700000.0,93400000.0,84700000.0,unmatched,MBR,MS/MS,MBR,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,O33698,O33698@S4@Semi-tryptic,2,2,0,0.812605,0.151947,0.447347,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,PSAANTPSYDDFALALEAQSLDSQKGQLVR,3,32,Semi-tryptic,24,41,0003-0032@PSAANTPSYDDFALALEAQSLDSQKGQLVR,0,S4,,25.816799,,25.870737,26.032039,26.023602,26.458723,25.279945,26.599224,26.462411,S4,4
O33698@A5@Semi-tryptic,PSAANTPSYDDFALALEAQSLDSQKGQLVR; SAANTPSYDDFALA...,S,G,3,32,30,34,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,0,2,0,1,3,1,2,2,2,0.0,31500000.0,0.0,32700000.0,35300000.0,66200000.0,65300000.0,18700000.0,93400000.0,84700000.0,unmatched,MBR,MS/MS,MBR,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,O33698,O33698@A5@Semi-tryptic,2,2,0,0.812605,0.151947,0.447347,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,PSAANTPSYDDFALALEAQSLDSQKGQLVR,3,32,Semi-tryptic,24,41,0003-0032@PSAANTPSYDDFALALEAQSLDSQKGQLVR,0,A5,,25.816799,,25.870737,26.032039,26.023602,26.458723,25.279945,26.599224,26.462411,A5,5
O33698@A6@Semi-tryptic,PSAANTPSYDDFALALEAQSLDSQKGQLVR; SAANTPSYDDFALA...,S,G,3,32,30,34,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,0,2,0,1,3,1,2,2,2,0.0,31500000.0,0.0,32700000.0,35300000.0,66200000.0,65300000.0,18700000.0,93400000.0,84700000.0,unmatched,MBR,MS/MS,MBR,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,O33698,O33698@A6@Semi-tryptic,2,2,0,0.812605,0.151947,0.447347,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,PSAANTPSYDDFALALEAQSLDSQKGQLVR,3,32,Semi-tryptic,24,41,0003-0032@PSAANTPSYDDFALALEAQSLDSQKGQLVR,0,A6,,25.816799,,25.870737,26.032039,26.023602,26.458723,25.279945,26.599224,26.462411,A6,6
O33698@N7@Semi-tryptic,NTPSYDDFALALEAQSLDSQKGQLVR; PSAANTPSYDDFALALEA...,A,G,7,32,26,3,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,0,1,0,0,1,1,0,0,0,0.0,26500000.0,51200000.0,0.0,0.0,73900000.0,65300000.0,0.0,84900000.0,0.0,unmatched,MBR,MS/MS,unmatched,unmatched,MS/MS,MS/MS,unmatched,MBR,unmatched,O33698,O33698@N7@Semi-tryptic,5,3,2,1.143437,0.058068,0.199940,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,NTPSYDDFALALEAQSLDSQKGQLVR,7,32,Semi-tryptic,24,41,0007-0032@NTPSYDDFALALEAQSLDSQKGQLVR,0,N7,,26.277081,27.119329,26.455700,26.617001,27.382112,27.611996,25.864907,27.891410,27.047374,N7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O33698@D283@Semi-tryptic,LQASAADR; LQASAADRAER,E,A,277,284,8,2,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,O33698,O33698@D283@Semi-tryptic,10,5,5,0.000000,,,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,LQASAADR,277,284,Semi-tryptic,24,41,0277-0284@LQASAADR,20,D283,,,,,,,,,,,D283,283
O33698@R284@Semi-tryptic,LQASAADR; LQASAADRAER,E,A,277,284,8,2,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,O33698,O33698@R284@Semi-tryptic,10,5,5,0.000000,,,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,LQASAADR,277,284,Semi-tryptic,24,41,0277-0284@LQASAADR,20,R284,,,,,,,,,,,R284,284
O33698@A285@Semi-tryptic,LQASAADRAER,E,A,277,287,11,3,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,1,1,1,1,0,1,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MS/MS,O33698,O33698@A285@Semi-tryptic,10,5,5,0.000000,,,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,LQASAADRAER,277,287,Semi-tryptic,24,41,0277-0287@LQASAADRAER,21,A285,,,,,,,,,,,A285,285
O33698@E286@Semi-tryptic,LQASAADRAER,E,A,277,287,11,3,RS1_SYNE7,O33698,RS1_SYNE7,rpsA,Small ribosomal subunit protein bS1,,,0,1,1,1,1,0,1,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,unmatched,MS/MS,MS/MS,O33698,O33698@E286@Semi-tryptic,10,5,5,0.000000,,,sp|O33698|RS1_SYNE7 Small ribosomal subunit pr...,295,LQASAADRAER,277,287,Semi-tryptic,24,41,0277-0287@LQASAADRAER,21,E286,,,,,,,,,,,E286,286


In [74]:

# Barcode plot for the test protein, restricted to its "Semi-tryptic" rows.
semi_tryptic_rows = trip_test.loc[trip_test["pept_type"] == "Semi-tryptic"]
plot_pept_barcode(
    semi_tryptic_rows,
    test_comparison,
    test_seq,
    color_levels=1,
    save2file=test_uniprot_id,
)

In [84]:
#double_pept_p.to_csv("double_pept_p.csv", sep='\t', index=None)

In [71]:
# Tab-separated dump of the test-protein rows for inspection outside the notebook.
# NOTE(review): the target is a per-user home-directory path and the file is
# tab-delimited despite its .csv suffix; consider writing under `result_dir`.
trip_test_dump_path = "~/Downloads/trip_test.csv"
trip_test.to_csv(trip_test_dump_path, sep="\t")

## Now apply these functions to the whole proteome
import warnings  # imported here so the cell works even if `utils` does not re-export it

# Run `analyze_tryptic_pattern` one protein at a time over `double_pept_w`
# and stack the per-protein frames into `double_pept_t`.
double_pept_t = []
for uniprot_id in double_pept_w[uniprot_col].unique():
    pept_df = double_pept_w[double_pept_w[uniprot_col] == uniprot_id].copy()
    # FASTA records are matched by substring on the record id.
    uniprot_seq = [seq_record for seq_record in prot_seqs if uniprot_id in seq_record.id]
    if len(uniprot_seq) < 1:
        # A bare `Warning(...)` only builds an exception object and drops it;
        # warnings.warn actually reports the skipped protein.
        warnings.warn(f"Protein {uniprot_id} not found in the fasta file. Skipping the protein.")
        continue
    elif len(uniprot_seq) > 1:
        warnings.warn(f"Multiple proteins with the same ID {uniprot_id} found in the fasta file. Using the first one.")
    bio_seq = uniprot_seq[0]
    prot_seq = bio_seq.seq
    prot_desc = bio_seq.description
    pept_df = analyze_tryptic_pattern(
        pept_df,
        prot_seq,
        pairwise_ttest_groups + dose_pairwise_ttest_groups,
        groups,
        description=prot_desc,
        keep_non_tryptic=True,
        peptide_col=peptide_col,
    )
    double_pept_t.append(pept_df)
# pd.concat raises ValueError if every protein was skipped (empty list).
double_pept_t = pd.concat(double_pept_t).copy()

In [53]:
# `pept_df` is the working variable of the per-protein loop; shown here it
# holds whatever that loop assigned last, so this cell relies on leftover state.
pept_df

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Dark_DD1 Spectral Count,Dense_Dark_DD2 Spectral Count,Dense_Dark_DD3 Spectral Count,Dense_Dark_DD4 Spectral Count,Dense_Dark_DD5 Spectral Count,Dense_Light_DD1 Spectral Count,Dense_Light_DD2 Spectral Count,Dense_Light_DD3 Spectral Count,Dense_Light_DD4 Spectral Count,Dense_Light_DD5 Spectral Count,Dense_Dark_DD1 Intensity,Dense_Dark_DD2 Intensity,Dense_Dark_DD3 Intensity,Dense_Dark_DD4 Intensity,Dense_Dark_DD5 Intensity,Dense_Light_DD1 Intensity,Dense_Light_DD2 Intensity,Dense_Light_DD3 Intensity,Dense_Light_DD4 Intensity,Dense_Light_DD5 Intensity,Dense_Dark_1,Dense_Dark_2,Dense_Dark_3,Dense_Dark_4,Dense_Dark_5,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dense_Dark_DD1 Match Type,Dense_Dark_DD2 Match Type,Dense_Dark_DD3 Match Type,Dense_Dark_DD4 Match Type,Dense_Dark_DD5 Match Type,Dense_Light_DD1 Match Type,Dense_Light_DD2 Match Type,Dense_Light_DD3 Match Type,Dense_Light_DD4 Match Type,Dense_Light_DD5 Match Type,UniProt,id,Total missingness,Dense_Dark missingness,Dense_Light missingness,Dense_Dark/Dense_Light_scalar,Dense_Dark/Dense_Light,Dense_Dark/Dense_Light_pval,Dense_Dark/Dense_Light_adj-p,Protein description,Protein length,clean_pept,pept_start,pept_end,pept_type,Tryp Pept num,Semi Pept num,Dense_Dark/Dense_Light Sig Semi Pept num,Max absFC of All Dense_Dark/Dense_Light Sig Semi Pept,Sum absFC of All Dense_Dark/Dense_Light Sig Semi Pept,Median absFC of All Dense_Dark/Dense_Light Sig Semi Pept,pept_id,lytic_group
Q31NL8@YLEAAQQVR,YLEAAQQVR,R,T,225,233,9,2,Q31NL8_SYNE7,Q31NL8,Q31NL8_SYNE7,Synpcc7942_1321,Uncharacterized protein,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31NL8,Q31NL8@YLEAAQQVR,10,5,5,0.0,0.0,,,tr|Q31NL8|Q31NL8_SYNE7 Uncharacterized protein...,339,YLEAAQQVR,225,233,Tryptic,1,0,0,,,,0225-0233@YLEAAQQVR,1


import warnings  # imported here so the cell works even if `utils` does not re-export it

# Run `analyze_tryptic_pattern` one protein at a time over `trypsin_pept_w`
# and stack the per-protein frames into `trypsin_pept_t`.
trypsin_pept_t = []
for uniprot_id in trypsin_pept_w[uniprot_col].unique():
    pept_df = trypsin_pept_w[trypsin_pept_w[uniprot_col] == uniprot_id].copy()
    # FASTA records are matched by substring on the record id.
    uniprot_seq = [seq_record for seq_record in prot_seqs if uniprot_id in seq_record.id]
    if len(uniprot_seq) < 1:
        # A bare `Warning(...)` only builds an exception object and drops it;
        # warnings.warn actually reports the skipped protein.
        warnings.warn(f"Protein {uniprot_id} not found in the fasta file. Skipping the protein.")
        continue
    elif len(uniprot_seq) > 1:
        warnings.warn(f"Multiple proteins with the same ID {uniprot_id} found in the fasta file. Using the first one.")
    bio_seq = uniprot_seq[0]
    prot_seq = bio_seq.seq
    prot_desc = bio_seq.description
    pept_df = analyze_tryptic_pattern(
        pept_df,
        prot_seq,
        pairwise_ttest_groups + dose_pairwise_ttest_groups,
        groups,
        description=prot_desc,
        keep_non_tryptic=True,
        peptide_col=peptide_col,
    )
    trypsin_pept_t.append(pept_df)
# pd.concat raises ValueError if every protein was skipped (empty list).
trypsin_pept_t = pd.concat(trypsin_pept_t).copy()

In [None]:
import warnings  # imported here so the cell works even if `utils` does not re-export it

# Per-protein pipeline over `trypsin_pept_w`: tryptic-pattern analysis
# (collected in `trypsin_pept_t`), then site roll-up, optional ANOVA and
# pairwise t-tests (collected in `trypsin_pept_p`).
# NOTE(review): this cell rebuilds `trypsin_pept_t` using only
# `pairwise_ttest_groups`, whereas the preceding loop passes
# `pairwise_ttest_groups + dose_pairwise_ttest_groups`; confirm which
# version later cells expect.
trypsin_pept_t = []
trypsin_pept_p = []
for uniprot_id in trypsin_pept_w[uniprot_col].unique():
    pept_df = trypsin_pept_w[trypsin_pept_w[uniprot_col] == uniprot_id].copy()
    # FASTA records are matched by substring on the record id.
    uniprot_seq = [seq_record for seq_record in prot_seqs if uniprot_id in seq_record.id]
    if len(uniprot_seq) < 1:
        # A bare `Warning(...)` only builds an exception object and drops it;
        # warnings.warn actually reports the skipped protein.
        warnings.warn(f"Protein {uniprot_id} not found in the fasta file. Skipping the protein.")
        continue
    elif len(uniprot_seq) > 1:
        warnings.warn(f"Multiple proteins with the same ID {uniprot_id} found in the fasta file. Using the first one.")
    bio_seq = uniprot_seq[0]
    prot_seq = bio_seq.seq
    prot_desc = bio_seq.description
    pept_df = analyze_tryptic_pattern(
        pept_df,
        prot_seq,
        pairwise_ttest_groups,
        groups,
        description=prot_desc,
        keep_non_tryptic=True,
        peptide_col=peptide_col,
    )
    trypsin_pept_t.append(pept_df)
    pept_df_r = LiP_rollup_to_site(
        pept_df,
        int_cols,
        prot_seq,
        uniprot_col,
        uniprot_id=uniprot_id,
        peptide_col=peptide_col,
        rollup_func="median",
    )
    if len(groups) > 2:
        # The original ran `anova` twice and kept only this second result;
        # the redundant first call (without `anova_factors`) is dropped.
        pept_df_a = anova(pept_df_r, anova_cols, metadata, anova_factors)
    else:
        pept_df_a = pept_df_r.copy()
    pept_df_p = pairwise_ttest(pept_df_a, pairwise_ttest_groups)
    trypsin_pept_p.append(pept_df_p)
# pd.concat raises ValueError if every protein was skipped (empty lists).
trypsin_pept_t = pd.concat(trypsin_pept_t).copy()
trypsin_pept_p = pd.concat(trypsin_pept_p).copy()

In [55]:
#double_pept_t.to_csv(f"{result_dir}/{sample}_lip_double_pept_processed.tsv", sep='\t')
#trypsin_pept_t.to_csv(f"{result_dir}/{sample}_lip_trypsin_pept_processed.tsv", sep='\t')

In [56]:
#double_pept_t
#trypsin_pept_t
#trypsin_prot

In [58]:
# Inspect the concatenated per-protein results built from `trypsin_pept_w`.
trypsin_pept_t

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Dark_T1 Spectral Count,Dense_Dark_T2 Spectral Count,Dense_Dark_T3 Spectral Count,Dense_Dark_T4 Spectral Count,Dense_Dark_T5 Spectral Count,Dense_Light_T1 Spectral Count,Dense_Light_T2 Spectral Count,Dense_Light_T3 Spectral Count,Dense_Light_T4 Spectral Count,Dense_Light_T5 Spectral Count,Dense_Dark_T1 Intensity,Dense_Dark_T2 Intensity,Dense_Dark_T3 Intensity,Dense_Dark_T4 Intensity,Dense_Dark_T5 Intensity,Dense_Light_T1 Intensity,Dense_Light_T2 Intensity,Dense_Light_T3 Intensity,Dense_Light_T4 Intensity,Dense_Light_T5 Intensity,Dense_Dark_1,Dense_Dark_2,Dense_Dark_3,Dense_Dark_4,Dense_Dark_5,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dense_Dark_T1 Match Type,Dense_Dark_T2 Match Type,Dense_Dark_T3 Match Type,Dense_Dark_T4 Match Type,Dense_Dark_T5 Match Type,Dense_Light_T1 Match Type,Dense_Light_T2 Match Type,Dense_Light_T3 Match Type,Dense_Light_T4 Match Type,Dense_Light_T5 Match Type,UniProt,id,Total missingness,Dense_Dark missingness,Dense_Light missingness,Dense_Dark/Dense_Light_scalar,Dense_Dark/Dense_Light,Dense_Dark/Dense_Light_pval,Dense_Dark/Dense_Light_adj-p,Protein description,Protein length,clean_pept,pept_start,pept_end,pept_type,Tryp Pept num,Semi Pept num,Dense_Dark/Dense_Light Sig Semi Pept num,Max absFC of All Dense_Dark/Dense_Light Sig Semi Pept,Sum absFC of All Dense_Dark/Dense_Light Sig Semi Pept,Median absFC of All Dense_Dark/Dense_Light Sig Semi Pept,pept_id,lytic_group
Q9Z3G5@AAAVAAAAR,AAAVAAAAR,K,G,205,213,9,2,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q9Z3G5,Q9Z3G5@AAAVAAAAR,10,5,5,0.0,0.000000,,,tr|Q9Z3G5|Q9Z3G5_SYNE7 PBS lyase HEAT-like rep...,219,AAAVAAAAR,205,213,Tryptic,26,16,2,0.731611,1.120817,0.560408,0205-0213@AAAVAAAAR,1
Q9Z3G5@AAAVAAAARGDGLEA,AAAVAAAARGDGLEA,K,-,205,219,15,2,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,1,1,1,1,1,1,1,1,1,1,37800000.0,28500000.0,32600000.0,30400000.0,39500000.0,26000000.0,24200000.0,21300000.0,25300000.0,29700000.0,25.145694,24.747372,24.950442,24.804412,25.283638,24.633300,24.548411,24.300127,24.584707,24.910981,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q9Z3G5,Q9Z3G5@AAAVAAAARGDGLEA,0,0,0,0.0,0.390806,0.023998,1.0,tr|Q9Z3G5|Q9Z3G5_SYNE7 PBS lyase HEAT-like rep...,219,AAAVAAAARGDGLEA,205,219,Tryptic,26,16,2,0.731611,1.120817,0.560408,0205-0219@AAAVAAAARGDGLEA,2
Q9Z3G5@AFDLLAAAIESPVELIR,AFDLLAAAIESPVELIR,R,T,131,147,17,23,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,1,0,1,0,0,1,1,1,0,1,0.0,79400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51800000.0,,26.225549,,,,,,,,25.713470,MS/MS,MBR,MS/MS,unmatched,unmatched,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,Q9Z3G5,Q9Z3G5@AFDLLAAAIESPVELIR,8,4,4,0.0,0.512079,,,tr|Q9Z3G5|Q9Z3G5_SYNE7 PBS lyase HEAT-like rep...,219,AFDLLAAAIESPVELIR,131,147,Tryptic,26,16,2,0.731611,1.120817,0.560408,0131-0147@AFDLLAAAIESPVELIR,3
Q9Z3G5@AIADSNPR,AIADSNPR,K,V,40,47,8,2,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,1,1,1,1,1,1,1,1,1,1,414000000.0,370000000.0,293000000.0,293000000.0,281000000.0,346000000.0,341000000.0,329000000.0,332000000.0,404000000.0,28.598866,28.445863,28.118398,28.073169,28.114284,28.367488,28.365104,28.249289,28.298681,28.676801,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q9Z3G5,Q9Z3G5@AIADSNPR,0,0,0,0.0,-0.121357,0.376904,1.0,tr|Q9Z3G5|Q9Z3G5_SYNE7 PBS lyase HEAT-like rep...,219,AIADSNPR,40,47,Tryptic,26,16,2,0.731611,1.120817,0.560408,0040-0047@AIADSNPR,4
Q9Z3G5@AIADSNPRVR,AIADSNPRVR,K,Y,40,49,10,23,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,PBS lyase HEAT-like repeat,,,2,2,2,2,2,2,2,2,2,2,353000000.0,299000000.0,336000000.0,320000000.0,305000000.0,305000000.0,320000000.0,370000000.0,339000000.0,345000000.0,27.966075,27.753685,27.940450,27.815050,27.819061,27.790841,27.905672,28.026897,27.934869,28.079286,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q9Z3G5,Q9Z3G5@AIADSNPRVR,0,0,0,0.0,-0.088649,0.205686,1.0,tr|Q9Z3G5|Q9Z3G5_SYNE7 PBS lyase HEAT-like rep...,219,AIADSNPRVR,40,49,Tryptic,26,16,2,0.731611,1.120817,0.560408,0040-0049@AIADSNPRVR,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q31ML9@VVSSSSAGPK,VVSSSSAGPK,A,V,15,24,10,2,Q31ML9_SYNE7,Q31ML9,Q31ML9_SYNE7,Synpcc7942_1670,Uncharacterized protein,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31ML9,Q31ML9@VVSSSSAGPK,10,5,5,0.0,0.000000,,,tr|Q31ML9|Q31ML9_SYNE7 Uncharacterized protein...,48,VVSSSSAGPK,15,24,Semi-tryptic,0,1,0,,,,0015-0024@VVSSSSAGPK,0
Q31S77@VVTTYPTDER,VVTTYPTDER,K,A,67,76,10,2,Q31S77_SYNE7,Q31S77,Q31S77_SYNE7,Synpcc7942_0060,Uncharacterized protein,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31S77,Q31S77@VVTTYPTDER,10,5,5,0.0,0.000000,,,tr|Q31S77|Q31S77_SYNE7 Uncharacterized protein...,92,VVTTYPTDER,67,76,Tryptic,1,0,0,,,,0067-0076@VVTTYPTDER,1
Q31PY8@VYLDSLFR,VYLDSLFR,R,Q,56,63,8,2,Q31PY8_SYNE7,Q31PY8,Q31PY8_SYNE7,Synpcc7942_0850,DUF2811 domain-containing protein,,,1,1,1,1,1,1,1,1,0,1,24100000.0,28700000.0,27200000.0,25300000.0,28500000.0,21900000.0,23400000.0,23700000.0,0.0,27000000.0,24.496341,24.757460,24.689176,24.539478,24.812748,24.385719,24.499912,24.454161,,24.773478,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,unmatched,MS/MS,Q31PY8,Q31PY8@VYLDSLFR,1,0,1,0.0,0.130723,0.240656,1.0,tr|Q31PY8|Q31PY8_SYNE7 DUF2811 domain-containi...,66,VYLDSLFR,56,63,Tryptic,1,0,0,,,,0056-0063@VYLDSLFR,1
Q31Q28@YALQQPAVQALLR,YALQQPAVQALLR,R,E,469,481,13,2,Q31Q28_SYNE7,Q31Q28,Q31Q28_SYNE7,Synpcc7942_0809,Glycosyltransferase RgtA/B/C/D-like domain-con...,,,0,1,1,1,1,1,1,1,1,1,0.0,63000000.0,62900000.0,0.0,58800000.0,71200000.0,71400000.0,0.0,53100000.0,41300000.0,,25.891762,25.898630,,25.857602,26.086665,26.109328,,25.654282,25.386660,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31Q28,Q31Q28@YALQQPAVQALLR,3,2,1,0.0,0.073431,0.738371,1.0,tr|Q31Q28|Q31Q28_SYNE7 Glycosyltransferase Rgt...,580,YALQQPAVQALLR,469,481,Tryptic,1,0,0,,,,0469-0481@YALQQPAVQALLR,1


In [59]:
# Inspect the concatenated per-protein results built from `double_pept_w`.
double_pept_t

Unnamed: 0,Peptide Sequence,Prev AA,Next AA,Start,End,Peptide Length,Charges,Protein,Protein ID,Entry Name,Gene,Protein Description,Mapped Genes,Mapped Proteins,Dense_Dark_DD1 Spectral Count,Dense_Dark_DD2 Spectral Count,Dense_Dark_DD3 Spectral Count,Dense_Dark_DD4 Spectral Count,Dense_Dark_DD5 Spectral Count,Dense_Light_DD1 Spectral Count,Dense_Light_DD2 Spectral Count,Dense_Light_DD3 Spectral Count,Dense_Light_DD4 Spectral Count,Dense_Light_DD5 Spectral Count,Dense_Dark_DD1 Intensity,Dense_Dark_DD2 Intensity,Dense_Dark_DD3 Intensity,Dense_Dark_DD4 Intensity,Dense_Dark_DD5 Intensity,Dense_Light_DD1 Intensity,Dense_Light_DD2 Intensity,Dense_Light_DD3 Intensity,Dense_Light_DD4 Intensity,Dense_Light_DD5 Intensity,Dense_Dark_1,Dense_Dark_2,Dense_Dark_3,Dense_Dark_4,Dense_Dark_5,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Dense_Dark_DD1 Match Type,Dense_Dark_DD2 Match Type,Dense_Dark_DD3 Match Type,Dense_Dark_DD4 Match Type,Dense_Dark_DD5 Match Type,Dense_Light_DD1 Match Type,Dense_Light_DD2 Match Type,Dense_Light_DD3 Match Type,Dense_Light_DD4 Match Type,Dense_Light_DD5 Match Type,UniProt,id,Total missingness,Dense_Dark missingness,Dense_Light missingness,Dense_Dark/Dense_Light_scalar,Dense_Dark/Dense_Light,Dense_Dark/Dense_Light_pval,Dense_Dark/Dense_Light_adj-p,Protein description,Protein length,clean_pept,pept_start,pept_end,pept_type,Tryp Pept num,Semi Pept num,Dense_Dark/Dense_Light Sig Semi Pept num,Max absFC of All Dense_Dark/Dense_Light Sig Semi Pept,Sum absFC of All Dense_Dark/Dense_Light Sig Semi Pept,Median absFC of All Dense_Dark/Dense_Light Sig Semi Pept,pept_id,lytic_group
Q31R01@AGEPGAEPR,AGEPGAEPR,R,E,64,72,9,2,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,1,0,0,0,0,0,0,0,0,0,34500000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.035074,,,,,,,,,,MS/MS,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31R01,Q31R01@AGEPGAEPR,9,4,5,0.0,0.000000,,,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,AGEPGAEPR,64,72,Tryptic,17,10,1,0.244711,0.244711,0.244711,0064-0072@AGEPGAEPR,1
Q31R01@AGEPGAEPRETFASLAAAAAAGGFGR,AGEPGAEPRETFASLAAAAAAGGFGR,R,V,64,89,26,3,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,1,1,0,1,1,1,1,1,1,1,48000000.0,0.0,0.0,49900000.0,44300000.0,57500000.0,62200000.0,61700000.0,59600000.0,57200000.0,25.511512,,,25.534603,25.395785,25.875544,25.874337,25.862693,25.801764,25.809581,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@AGEPGAEPRETFASLAAAAAAGGFGR,2,2,0,0.0,-0.364151,0.000074,0.366815,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,AGEPGAEPRETFASLAAAAAAGGFGR,64,89,Tryptic,17,10,1,0.244711,0.244711,0.244711,0064-0089@AGEPGAEPRETFASLAAAAAAGGFGR,2
Q31R01@ARQGSLALR,ARQGSLALR,R,W,188,196,9,3,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,1,1,0,1,1,0,0,1,1,1,31000000.0,22000000.0,0.0,29100000.0,28300000.0,0.0,29900000.0,23800000.0,27400000.0,0.0,24.880746,24.408349,,24.756582,24.749280,,24.817568,24.488384,24.680627,,MS/MS,MS/MS,unmatched,MS/MS,MS/MS,unmatched,MBR,MS/MS,MS/MS,MS/MS,Q31R01,Q31R01@ARQGSLALR,3,1,2,0.0,0.036546,0.809922,1.000000,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ARQGSLALR,188,196,Tryptic,17,10,1,0.244711,0.244711,0.244711,0188-0196@ARQGSLALR,3
Q31R01@ETFASLAAAAAAGGFGR,ETFASLAAAAAAGGFGR,R,V,73,89,17,23,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31R01,Q31R01@ETFASLAAAAAAGGFGR,10,5,5,0.0,0.000000,,,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,ETFASLAAAAAAGGFGR,73,89,Tryptic,17,10,1,0.244711,0.244711,0.244711,0073-0089@ETFASLAAAAAAGGFGR,4
Q31R01@GLPITASVTWLHLLGNTADLTDYDPNLR,GLPITASVTWLHLLGNTADLTDYDPNLR,R,L,246,273,28,3,Q31R01_SYNE7,Q31R01,Q31R01_SYNE7,Synpcc7942_0486,Dihydroorotase,,,0,1,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,8695884.0,0.0,0.0,0.0,0.0,,,,,,23.150387,,,,,unmatched,MS/MS,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,unmatched,Q31R01,Q31R01@GLPITASVTWLHLLGNTADLTDYDPNLR,9,5,4,0.0,0.000000,,,tr|Q31R01|Q31R01_SYNE7 Dihydroorotase OS=Synec...,411,GLPITASVTWLHLLGNTADLTDYDPNLR,246,273,Tryptic,17,10,1,0.244711,0.244711,0.244711,0246-0273@GLPITASVTWLHLLGNTADLTDYDPNLR,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q31LS9@VTIADRYRELLR,VTIADRYRELLR,R,S,97,108,12,34,Q31LS9_SYNE7,Q31LS9,Q31LS9_SYNE7,Synpcc7942_1960,TIGR02652 family protein,,,1,0,2,0,0,0,1,0,0,0,33100000.0,0.0,39700000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.047565,,24.268259,,,,,,,,MS/MS,unmatched,MS/MS,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,Q31LS9,Q31LS9@VTIADRYRELLR,8,3,5,0.0,0.000000,,,tr|Q31LS9|Q31LS9_SYNE7 TIGR02652 family protei...,168,VTIADRYRELLR,97,108,Tryptic,1,0,0,,,,0097-0108@VTIADRYRELLR,1
Q31SB2@VVAVLPISPFASK,VVAVLPISPFASK,R,V,422,434,13,2,Q31SB2_SYNE7,Q31SB2,Q31SB2_SYNE7,Synpcc7942_0025,DUF1400 domain-containing protein,,,0,0,0,0,0,1,0,1,1,0,0.0,0.0,0.0,14000000.0,0.0,8162757.5,0.0,12300000.0,12700000.0,0.0,,,,23.700990,,23.059111,,23.536081,23.571280,,unmatched,unmatched,unmatched,MBR,unmatched,MS/MS,unmatched,MS/MS,MS/MS,unmatched,Q31SB2,Q31SB2@VVAVLPISPFASK,6,4,2,0.0,0.312166,0.444413,1.000000,tr|Q31SB2|Q31SB2_SYNE7 DUF1400 domain-containi...,568,VVAVLPISPFASK,422,434,Tryptic,1,0,0,,,,0422-0434@VVAVLPISPFASK,1
Q31N72@VYEPAQTAIGK,VYEPAQTAIGK,R,I,105,115,11,2,Q31N72_SYNE7,Q31N72,Q31N72_SYNE7,Synpcc7942_1467,Heat shock protein DnaJ-like,,,0,1,0,0,0,0,0,0,0,0,0.0,14500000.0,16400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,23.806899,23.918452,,,,,,,,unmatched,MS/MS,MBR,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,unmatched,Q31N72,Q31N72@VYEPAQTAIGK,8,3,5,0.0,0.000000,,,tr|Q31N72|Q31N72_SYNE7 Heat shock protein DnaJ...,229,VYEPAQTAIGK,105,115,Tryptic,1,0,0,,,,0105-0115@VYEPAQTAIGK,1
Q31PY8@VYLDSLFR,VYLDSLFR,R,Q,56,63,8,2,Q31PY8_SYNE7,Q31PY8,Q31PY8_SYNE7,Synpcc7942_0850,DUF2811 domain-containing protein,,,0,0,0,1,0,0,0,1,1,0,0.0,0.0,0.0,14000000.0,0.0,0.0,0.0,5263576.0,8365431.0,0.0,,,,23.700990,,,,22.311538,22.968963,,unmatched,unmatched,unmatched,MS/MS,unmatched,unmatched,unmatched,MS/MS,MS/MS,unmatched,Q31PY8,Q31PY8@VYLDSLFR,7,4,3,0.0,1.060739,0.313605,1.000000,tr|Q31PY8|Q31PY8_SYNE7 DUF2811 domain-containi...,66,VYLDSLFR,56,63,Tryptic,1,0,0,,,,0056-0063@VYLDSLFR,1


In [60]:
trypsin_prot

Unnamed: 0,Protein,Protein ID,Entry Name,Gene,Protein Length,Organism,Protein Existence,Description,Protein Probability,Top Peptide Probability,Combined Total Peptides,Combined Spectral Count,Combined Unique Spectral Count,Combined Total Spectral Count,Dense_Dark_T1 Spectral Count,Dense_Dark_T2 Spectral Count,Dense_Dark_T3 Spectral Count,Dense_Dark_T4 Spectral Count,Dense_Dark_T5 Spectral Count,Dense_Light_T1 Spectral Count,Dense_Light_T2 Spectral Count,Dense_Light_T3 Spectral Count,Dense_Light_T4 Spectral Count,Dense_Light_T5 Spectral Count,Dense_Dark_T1 Unique Spectral Count,Dense_Dark_T2 Unique Spectral Count,Dense_Dark_T3 Unique Spectral Count,Dense_Dark_T4 Unique Spectral Count,Dense_Dark_T5 Unique Spectral Count,Dense_Light_T1 Unique Spectral Count,Dense_Light_T2 Unique Spectral Count,Dense_Light_T3 Unique Spectral Count,Dense_Light_T4 Unique Spectral Count,Dense_Light_T5 Unique Spectral Count,Dense_Dark_T1 Total Spectral Count,Dense_Dark_T2 Total Spectral Count,Dense_Dark_T3 Total Spectral Count,Dense_Dark_T4 Total Spectral Count,Dense_Dark_T5 Total Spectral Count,Dense_Light_T1 Total Spectral Count,Dense_Light_T2 Total Spectral Count,Dense_Light_T3 Total Spectral Count,Dense_Light_T4 Total Spectral Count,Dense_Light_T5 Total Spectral Count,Dense_Dark_T1 Intensity,Dense_Dark_T2 Intensity,Dense_Dark_T3 Intensity,Dense_Dark_T4 Intensity,Dense_Dark_T5 Intensity,Dense_Light_T1 Intensity,Dense_Light_T2 Intensity,Dense_Light_T3 Intensity,Dense_Light_T4 Intensity,Dense_Light_T5 Intensity,Dense_Dark_1,Dense_Dark_2,Dense_Dark_3,Dense_Dark_4,Dense_Dark_5,Dense_Light_1,Dense_Light_2,Dense_Light_3,Dense_Light_4,Dense_Light_5,Indistinguishable Proteins,UniProt,id,Pept count,Total missingness,Dense_Dark missingness,Dense_Light missingness,Dense_Dark/Dense_Light,Dense_Dark/Dense_Light_pval,Dense_Dark/Dense_Light_adj-p,Dense_Dark/Dense_Light_scalar
O05161,RRP3_SYNE7,O05161,RRP3_SYNE7,mut3G,112,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Probable small ribosomal subunit protein cS23,1.0000,0.999,10,132,132,132,5,2,1,2,1,3,3,2,1,4,5,2,1,2,1,3,3,2,1,4,5,2,1,2,1,3,3,2,1,4,5.320000e+08,3.700000e+08,3.130000e+08,2.890000e+08,4.060000e+08,3.760000e+08,3.810000e+08,2.730000e+08,2.820000e+08,3.970000e+08,28.124510,27.616130,27.509095,27.403464,27.991231,27.741091,27.723474,27.258160,27.367849,27.805526,,O05161,O05161,10,0,0,0,0.149666,0.426459,1.0,0.0
O06865,AMPA_SYNE7,O06865,AMPA_SYNE7,pepA,486,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Probable cytosol aminopeptidase,1.0000,0.999,95,2318,2318,2318,115,120,118,103,107,118,115,103,107,130,115,120,118,103,107,118,115,103,107,130,115,120,118,103,107,118,115,103,107,130,2.760000e+10,2.820000e+10,2.810000e+10,2.560000e+10,2.410000e+10,2.980000e+10,2.430000e+10,2.830000e+10,2.500000e+10,2.520000e+10,31.359286,31.173871,31.238566,31.239164,31.243619,31.176113,31.125062,31.229807,31.139433,31.400955,,O06865,O06865,95,0,0,0,0.036627,0.547792,1.0,0.0
O06866,MOBA_SYNE7,O06866,MOBA_SYNE7,mobA,194,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Probable molybdenum cofactor guanylyltransferase,1.0000,0.999,3,32,32,32,1,2,2,2,2,3,1,1,2,0,1,2,2,2,2,3,1,1,2,0,1,2,2,2,2,3,1,1,2,0,3.470000e+07,6.290000e+07,8.970000e+07,7.160000e+07,3.920000e+07,7.320000e+07,2.550000e+07,2.200000e+07,5.890000e+07,0.000000e+00,24.200101,24.970588,25.122187,24.851778,24.429837,24.835748,,,24.934560,,,O06866,O06866,3,3,0,3,-0.170256,0.582785,1.0,0.0
O07345,CHLD_SYNE7,O07345,CHLD_SYNE7,chlD,677,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Magnesium-chelatase subunit ChlD,1.0000,0.999,19,112,112,112,8,11,11,10,8,9,6,7,9,4,8,11,11,10,8,9,6,7,9,4,8,11,11,10,8,9,6,7,9,4,1.870000e+08,3.090000e+08,2.930000e+08,2.460000e+08,2.360000e+08,3.050000e+08,2.100000e+08,2.300000e+08,2.490000e+08,4.620000e+07,25.458723,25.919057,26.033502,25.589683,25.637846,26.082454,25.639158,25.679232,25.799281,24.304892,,O07345,O07345,19,0,0,0,0.226759,0.507619,1.0,0.0
O32463,GSHB_SYNE7,O32463,GSHB_SYNE7,gshB,323,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Glutathione synthetase,1.0000,0.999,47,664,664,664,41,30,35,35,30,33,31,34,27,38,41,30,35,35,30,33,31,34,27,38,41,30,35,35,30,33,31,34,27,38,4.770000e+09,3.830000e+09,4.310000e+09,4.470000e+09,3.800000e+09,1.910000e+10,3.930000e+09,4.240000e+09,4.290000e+09,4.770000e+09,29.620558,29.478290,29.536858,29.601609,29.454259,29.340430,29.376123,29.622649,29.603138,29.887209,,O32463,O32463,47,0,0,0,-0.027595,0.797375,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9WWL7,Q9WWL7_SYNE7,Q9WWL7,Q9WWL7_SYNE7,gap3,333,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,Glyceraldehyde-3-phosphate dehydrogenase,0.9998,0.999,6,28,28,53,3,1,1,1,1,1,0,2,2,2,3,1,1,1,1,1,0,2,2,2,5,2,3,2,1,3,2,3,3,4,1.230000e+08,7.630000e+07,1.090000e+08,7.390000e+07,0.000000e+00,5.880000e+07,0.000000e+00,5.060000e+07,1.350000e+08,1.580000e+08,25.499190,25.979000,25.319224,25.947235,,,,,25.779585,26.195841,,Q9WWL7,Q9WWL7,6,4,1,3,-0.301551,0.337998,1.0,0.0
Q9WWQ0,Q9WWQ0_SYNE7,Q9WWQ0,Q9WWQ0_SYNE7,cpmA,260,Synechococcus elongatus (strain ATCC 33912 /,4:Protein predicted,Circadian phase modifier,1.0000,0.999,7,91,91,91,6,8,8,7,6,6,7,7,5,6,6,8,8,7,6,6,7,7,5,6,6,8,8,7,6,6,7,7,5,6,4.060000e+08,4.300000e+08,4.840000e+08,4.810000e+08,4.100000e+08,4.010000e+08,4.480000e+08,4.410000e+08,4.160000e+08,4.200000e+08,26.537058,26.487900,26.658105,26.653078,26.778238,26.467867,26.516538,26.588701,26.658105,26.625234,,Q9WWQ0,Q9WWQ0,7,0,0,0,0.051587,0.427893,1.0,0.0
Q9Z3G2,Q9Z3G2_SYNE7,Q9Z3G2,Q9Z3G2_SYNE7,cytM,145,Synechococcus elongatus (strain ATCC 33912 /,4:Protein predicted,CytM,1.0000,0.999,3,10,10,10,0,1,0,0,2,0,0,0,2,0,0,1,0,0,2,0,0,0,2,0,0,1,0,0,2,0,0,0,2,0,0.000000e+00,1.880000e+07,0.000000e+00,2.410000e+07,4.100000e+07,0.000000e+00,0.000000e+00,0.000000e+00,3.520000e+07,0.000000e+00,,,,,24.908570,,,,24.589063,,,Q9Z3G2,Q9Z3G2,3,8,4,4,0.319506,,,0.0
Q9Z3G5,Q9Z3G5_SYNE7,Q9Z3G5,Q9Z3G5_SYNE7,nblB,219,Synechococcus elongatus (strain ATCC 33912 /,3:Protein inferred from homology,PBS lyase HEAT-like repeat,1.0000,0.999,43,679,679,679,39,32,35,32,30,30,31,34,33,33,39,32,35,32,30,30,31,34,33,33,39,32,35,32,30,30,31,34,33,33,6.650000e+09,5.990000e+09,5.550000e+09,5.860000e+09,5.650000e+09,4.790000e+09,5.190000e+09,5.110000e+09,5.110000e+09,6.180000e+09,29.620558,29.573368,29.467385,29.504068,29.564834,29.296866,29.480460,29.358272,29.306807,29.977006,,Q9Z3G5,Q9Z3G5,43,0,0,0,0.062160,0.646240,1.0,0.0


In [57]:
# Disabled shortcut: reload previously saved processed tables (tab-separated) instead of
# recomputing them in this session.
# NOTE(review): the paths are hard-coded relative to the working directory; they appear to
# correspond to result_dir for the current experiment_type/sample — confirm and build them
# from result_dir before re-enabling.
#trypsin_pept = pd.read_csv("results/original_PnT1/Cyano/Cyano_lip_trypsin_pept_processed.tsv", sep='\t')
#trypsin_prot = pd.read_csv("results/original_PnT1/Cyano/Cyano_lip_trypsin_prot_processed.tsv", sep='\t')
#double_pept = pd.read_csv("results/original_PnT1/Cyano/Cyano_lip_double_pept_processed.tsv", sep='\t')

# Legacy code below

# Rolling up the site level

In [None]:
## Now apply these functions to the whole proteome
# For every protein in double_pept_w: annotate the tryptic pattern of its peptides,
# roll the peptides up to site level (median), then run the group statistics.
# Results are collected per protein and concatenated at the end:
#   double_pept_t -> peptide-level tables returned by analyze_tryptic_pattern
#   double_pept_p -> site-level tables returned by pairwise_ttest
import warnings  # local import so the skip/duplicate messages below are really emitted

double_pept_t = []
double_pept_p = []
for uniprot_id in double_pept_w[uniprot_col].unique():
    pept_df = double_pept_w[double_pept_w[uniprot_col] == uniprot_id].copy()
    # FASTA records whose id contains this accession (substring match).
    uniprot_seq = [prot_seq for prot_seq in prot_seqs if uniprot_id in prot_seq.id]
    if len(uniprot_seq) < 1:
        # `Warning(...)` alone only builds an exception object; warnings.warn shows it.
        warnings.warn(f"Protein {uniprot_id} not found in the fasta file. Skipping the protein.")
        continue
    elif len(uniprot_seq) > 1:
        warnings.warn(f"Multiple proteins with the same ID {uniprot_id} found in the fasta file. Using the first one.")
    bio_seq = uniprot_seq[0]
    prot_seq = bio_seq.seq
    prot_desc = bio_seq.description
    pept_df = analyze_tryptic_pattern(pept_df, prot_seq, pairwise_ttest_groups, groups, description = prot_desc, keep_non_tryptic = True, peptide_col=peptide_col)
    double_pept_t.append(pept_df)
    pept_df_r = LiP_rollup_to_site(pept_df, int_cols, prot_seq, uniprot_col, uniprot_id = uniprot_id, peptide_col=peptide_col, rollup_func="median")
    if len(groups) > 2:
        # NOTE(review): both calls take pept_df_r, so the first return value is
        # overwritten. Kept unchanged in case anova has side effects on pept_df_r;
        # confirm whether the second call was meant to receive pept_df_a.
        pept_df_a = anova(pept_df_r, anova_cols, metadata)
        pept_df_a = anova(pept_df_r, anova_cols, metadata, anova_factors)
    else:
        # With two groups or fewer no ANOVA is run; pass the rolled-up table through.
        # Without this, pept_df_a is undefined (NameError) or, worse, silently reuses
        # the table of a previously processed protein.
        pept_df_a = pept_df_r
    pept_df_p = pairwise_ttest(pept_df_a, pairwise_ttest_groups)
    double_pept_p.append(pept_df_p)
double_pept_t = pd.concat(double_pept_t).copy()
double_pept_p = pd.concat(double_pept_p).copy()

In [None]:
# Same per-protein pipeline as for the double-digest peptides, applied to trypsin_pept_w:
# annotate the tryptic pattern, roll up to site level (median), then run group statistics.
#   trypsin_pept_t -> peptide-level tables returned by analyze_tryptic_pattern
#   trypsin_pept_p -> site-level tables returned by pairwise_ttest
import warnings  # local import so the skip/duplicate messages below are really emitted

trypsin_pept_t = []
trypsin_pept_p = []
for uniprot_id in trypsin_pept_w[uniprot_col].unique():
    pept_df = trypsin_pept_w[trypsin_pept_w[uniprot_col] == uniprot_id].copy()
    # FASTA records whose id contains this accession (substring match).
    uniprot_seq = [prot_seq for prot_seq in prot_seqs if uniprot_id in prot_seq.id]
    if len(uniprot_seq) < 1:
        # `Warning(...)` alone only builds an exception object; warnings.warn shows it.
        warnings.warn(f"Protein {uniprot_id} not found in the fasta file. Skipping the protein.")
        continue
    elif len(uniprot_seq) > 1:
        warnings.warn(f"Multiple proteins with the same ID {uniprot_id} found in the fasta file. Using the first one.")
    bio_seq = uniprot_seq[0]
    prot_seq = bio_seq.seq
    prot_desc = bio_seq.description
    pept_df = analyze_tryptic_pattern(pept_df, prot_seq, pairwise_ttest_groups, groups, description = prot_desc, keep_non_tryptic = True, peptide_col=peptide_col)
    trypsin_pept_t.append(pept_df)
    pept_df_r = LiP_rollup_to_site(pept_df, int_cols, prot_seq, uniprot_col, uniprot_id = uniprot_id, peptide_col=peptide_col, rollup_func="median")
    if len(groups) > 2:
        # NOTE(review): both calls take pept_df_r, so the first return value is
        # overwritten. Kept unchanged in case anova has side effects on pept_df_r;
        # confirm whether the second call was meant to receive pept_df_a.
        pept_df_a = anova(pept_df_r, anova_cols, metadata)
        pept_df_a = anova(pept_df_r, anova_cols, metadata, anova_factors)
    else:
        # With two groups or fewer no ANOVA is run; pass the rolled-up table through.
        # Without this, pept_df_a is undefined (NameError) or, worse, silently reuses
        # the table of a previously processed protein.
        pept_df_a = pept_df_r
    pept_df_p = pairwise_ttest(pept_df_a, pairwise_ttest_groups)
    trypsin_pept_p.append(pept_df_p)
trypsin_pept_t = pd.concat(trypsin_pept_t).copy()
trypsin_pept_p = pd.concat(trypsin_pept_p).copy()