File links and threshold values

In [None]:
# Excel workbook holding the reference peptide list (read into `list_pep` further down).
list_link= 'https://github.com/SMBP-lab/Body-fluid-identification-proteomics/raw/main/Supplementary_Table_S4.xlsx'
# Excel workbook describing the samples to process (read into `data` further down).
sample_sequence_link='https://github.com/SMBP-lab/Body-fluid-identification-proteomics/raw/main/data/sequence_samples.xlsx'

# Per-fluid decision thresholds, defined from ROC curves.
# They are expressed as percentages (0-100): the decision step compares them with
# 100 * (fraction of the fluid's listed peptides detected in the sample) using `>=`.
percentage_blood=26.31578947368421
percentage_saliva=13.46153846153846
percentage_semen=32.69230769230769
percentage_urine=3
percentage_vgf=28.57142857142857

Package imports

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns
from openpyxl import Workbook
from openpyxl.styles import PatternFill

Loading files

In [None]:
# Read the sample sequence workbook and keep only the rows whose
# 'Sample type' is exactly 'Sample'.
data = pd.read_excel(sample_sequence_link)
data = data[data['Sample type'] == 'Sample']

# Decision thresholds (in percent), indexed by body fluid name.
list_percentage = pd.Series({
    'Blood': percentage_blood,
    'Saliva': percentage_saliva,
    'Semen': percentage_semen,
    'Urine': percentage_urine,
    'VGF': percentage_vgf,
})

Preparation of data file

In [None]:
def preparation_accession(data):
    """Clean the sample table and build the path of each sample's peptide-group file.

    Processing order:
      1. strip leading/trailing whitespace from every string cell;
      2. turn empty strings into NaN (whitespace-only cells are empty after step 1);
      3. drop the rows that are entirely empty;
      4. replace backslashes by forward slashes in 'File path';
      5. add an 'Accession' column:
         '<File path>/<File name>_PeptideGroups.txt'.

    Parameters
    ----------
    data : pandas.DataFrame
        Sample table. Must contain the text columns 'File path' and 'File name'.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with the extra 'Accession' column. The input frame is
        not modified; surviving rows keep their original index labels.
    """
    def _strip(cell):
        # Only strings are stripped; numbers, NaN, dates... are passed through.
        return cell.strip() if isinstance(cell, str) else cell

    # Column-wise Series.map instead of DataFrame.applymap, which is deprecated
    # since pandas 2.1 (its replacement, DataFrame.map, does not exist before 2.1).
    data_bis = data.apply(lambda column: column.map(_strip))
    # Replace empty data by NaN. Done after stripping so that cells containing only
    # spaces are treated as empty too.
    data_bis = data_bis.replace("", float("nan"))
    # Remove empty lines; explicit copy so the column assignments below act on an
    # independent frame.
    data_bis = data_bis.dropna(how='all').copy()
    # Change \ to /. regex=False makes this a literal replacement on every pandas
    # version (the default of `regex` changed in pandas 2.0, and a lone backslash
    # is not a valid regular expression).
    data_bis['File path'] = data_bis['File path'].str.replace('\\', '/', regex=False)
    # Build the file access links.
    data_bis["Accession"]=data_bis["File path"]+"/"+data_bis["File name"]+"_PeptideGroups.txt"
    return data_bis
# Clean the sample table and add the 'Accession' column (path of each sample's
# *_PeptideGroups.txt file).
data=preparation_accession(data)

Preparation of peptide list file

In [None]:
# Load the reference peptide list.
list_pep = pd.read_excel(list_link, engine='openpyxl')
# A modification recorded as 0 means "no modification": store it as NaN.
list_pep['Modifications'] = list_pep['Modifications'].replace(0, np.nan)
# Peptide key = annotated sequence, followed by the modification text when there is one.
list_pep['Seq_modif'] = [
    sequence if pd.isna(modification) else sequence + modification
    for sequence, modification in zip(list_pep['Annotated Sequence'], list_pep['Modifications'])
]
# Index the list by that key.
list_pep = list_pep.set_index('Seq_modif')

Construction of a result file with all samples and all peptides of peptide list

In [None]:
def construction_file_result (sequence,list_pep):
    """Build one table holding, for every listed peptide, its values in every sample.

    For each row of `sequence`, the tab-separated file at 'Accession' is read, its
    peptides are keyed by 'Annotated Sequence' (+ 'Modifications' when present), and
    three columns are produced for the sample: 'PSMs_<name>', 'Abundance_<name>' and
    'Normalized_abundance_<name>' (abundance divided by the total abundance of the
    file, or 0 when that total is 0). These columns are aligned on the index of
    `list_pep`; peptides absent from the file (and missing values) are set to 0.

    Parameters
    ----------
    sequence : pandas.DataFrame
        One row per sample, with the columns 'Sample name' and 'Accession'.
    list_pep : pandas.DataFrame
        Reference peptide list, indexed by the same sequence+modification key.

    Returns
    -------
    pandas.DataFrame
        A copy of `list_pep` followed by the three columns of every sample, in the
        order of `sequence`.

    Raises
    ------
    ValueError
        If a file does not provide exactly one column containing 'PSM' and one
        containing 'Abundance' (the two-name relabelling then fails).
    """
    result=list_pep.copy()
    # Frames are collected and concatenated once at the end, instead of re-copying
    # the growing table at every sample.
    frames=[result]
    for ind in sequence.index:
        name=sequence.loc[ind,"Sample name"]
        acces=sequence.loc[ind,'Accession']
        file_peptide_groups=pd.read_table(acces)
        # Same key as the peptide list: sequence, plus modification text when present.
        # A plain comprehension also copes with a file that has no rows.
        keys=[
            seq if pd.isna(mod) else seq + mod
            for seq, mod in zip(file_peptide_groups['Annotated Sequence'],
                                file_peptide_groups['Modifications'])
        ]
        file_peptide_groups=file_peptide_groups.set_index(pd.Index(keys,name="Seq_modif"))
        # Keep only the PSM and abundance columns and give them sample-specific names.
        sample_values=pd.concat([file_peptide_groups.filter(like='PSM'),file_peptide_groups.filter(like='Abundance')],axis=1)
        sample_values.columns=["PSMs_"+name,"Abundance_"+name]
        abundance_tot=sample_values['Abundance_'+name].sum(skipna=True)
        if abundance_tot!=0:
            sample_values["Normalized_abundance_"+name]=sample_values['Abundance_'+name]/abundance_tot
        else:
            # No abundance at all in this file: avoid dividing by zero.
            sample_values["Normalized_abundance_"+name]=0
        # Align on the reference peptides; undetected peptides count as 0.
        frames.append(sample_values.reindex(result.index).fillna(0))
    return pd.concat(frames,axis=1)
# Read every sample file listed in `data` and gather the PSM / abundance values of
# the listed peptides into a single table.
result_tot=construction_file_result(data,list_pep)

Test function

In [None]:
# Body fluids handled by the test, in display order.
fluids=['Blood','Saliva','Semen','Urine','VGF']
def Test_fluid_list(result_tot,data,):
    """Compute, per sample, the fraction of each fluid's listed peptides that were detected.

    A peptide counts as detected in a sample when its 'Abundance_<sample>' value in
    `result_tot` is strictly positive.

    Parameters
    ----------
    result_tot : pandas.DataFrame
        One row per listed peptide, with one 'Abundance_<sample>' column per sample.
        The fluid of each peptide is read from its 'Associated body fluid' column;
        if that column is absent, the module-level `list_pep` is used instead
        (matched on the index labels).
    data : pandas.DataFrame
        Sample table; only its 'Sample name' column is used.

    Returns
    -------
    pandas.DataFrame
        Rows = `fluids`, columns = sample names, values = detected / listed peptides
        (between 0 and 1). A fluid without any listed peptide gets NaN instead of
        raising a division by zero.
    """
    # The peptide-to-fluid assignment does not depend on the sample: compute one
    # boolean row mask per fluid, once.
    if 'Associated body fluid' in result_tot.columns:
        fluid_of_peptide=result_tot['Associated body fluid']
        fluid_masks={
            fluide: (fluid_of_peptide==fluide).to_numpy(dtype=bool)
            for fluide in fluids
        }
    else:
        fluid_masks={
            fluide: result_tot.index.isin(list_pep.index[list_pep['Associated body fluid']==fluide])
            for fluide in fluids
        }

    fractions={}
    for ech in data['Sample name']:
        # Positional boolean arrays: also valid when a peptide key is duplicated.
        detected=(result_tot['Abundance_'+ech]>0).to_numpy(dtype=bool)
        fractions[ech]=[
            (detected & fluid_masks[fluide]).sum()/fluid_masks[fluide].sum()
            if fluid_masks[fluide].any() else float('nan')
            for fluide in fluids
        ]
    # Build the table in one go rather than growing it cell by cell.
    return pd.DataFrame(fractions,index=list(fluids),dtype=float)

# For every sample, compute the fraction of each fluid's listed peptides detected.
result_test=Test_fluid_list(result_tot,data)
# Display the table (rows: fluids, columns: samples).
result_test

In [None]:
# One row per sample, one column per fluid, values converted from fractions to percent.
result = result_test.T.mul(100)
# Add a 'Conclusion <fluid>' flag column per fluid, initialised to 0.
for fluid in fluids:
    result[f'Conclusion {fluid}'] = 0

def decision_presence_fluide (line_result,list_percentage):
    """Decide, for one sample, which body fluids reach their detection threshold.

    Parameters
    ----------
    line_result : pandas.Series
        One sample: for every fluid named in `list_percentage`, the percentage of
        listed peptides detected, stored under the fluid name.
    list_percentage : pandas.Series or dict
        Threshold (in percent) per fluid name. The fluids are taken from here, so
        the function does not depend on any variable defined in another cell.

    Returns
    -------
    pandas.Series
        A copy of `line_result` in which 'Conclusion <fluid>' is 1 when the
        percentage is greater than or equal to the threshold, and 0 otherwise.
        The input Series is left untouched.
    """
    # Work on a copy: the caller's row must not be altered as a side effect.
    decided=line_result.copy()
    for fluid, threshold in list_percentage.items():
        # Always write the flag (0 or 1) so that a stale value from a previous run
        # can never survive a threshold change.
        decided.loc['Conclusion '+fluid]=1 if decided.loc[fluid]>=threshold else 0
    return(decided)

# Apply the threshold decision to each sample row and write the returned row back
# into `result`, so that the 'Conclusion <fluid>' columns hold the decisions.
for ind in result.index:
    result.loc[ind,:]=decision_presence_fluide(result.loc[ind],list_percentage)

# Display the percentages and the conclusions for every sample.
result