File links and threshold values

In [None]:
# Remote inputs: the peptide-ratio reference table (Supplementary Table S2)
# and the spreadsheet describing the sample runs to analyse.
list_link = 'https://github.com/SMBP-lab/Body-fluid-identification-proteomics/raw/main/Supplementary_Table_S2.xlsx'
sample_sequence_link = 'https://github.com/SMBP-lab/Body-fluid-identification-proteomics/raw/main/data/sequence_samples.xlsx'

# Per-fluid decision thresholds, defined from the ROC curves.
# They are expressed on a 0-100 scale (percentage of positive ratios).
percentage_blood = 44
percentage_saliva = 20
percentage_semen = 4
percentage_urine = 8
percentage_vgf = 40

Package imports

In [None]:
import pandas as pd

Loading files

In [None]:
# Download the sample-sequence spreadsheet and keep only the rows whose
# 'Sample type' is 'Sample'.
data = pd.read_excel(sample_sequence_link).loc[
    lambda table: table['Sample type'] == 'Sample', :
]

# Decision thresholds gathered in one Series indexed by body-fluid name.
list_percentage = pd.Series({
    'Blood': percentage_blood,
    'Saliva': percentage_saliva,
    'Semen': percentage_semen,
    'Urine': percentage_urine,
    'VGF': percentage_vgf,
})

Preparation of data file

In [None]:
def preparation_accession(data):
    """Clean the sample-sequence table and build the path of each peptide file.

    Parameters
    ----------
    data : pandas.DataFrame
        Sample-sequence table. Must contain the string columns 'File path'
        and 'File name'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame where string cells are stripped, empty cells are NaN,
        fully empty rows are removed, backslashes in 'File path' are turned
        into forward slashes, and an 'Accession' column holds
        ``<File path>/<File name>_PeptideGroups.txt``.
    """
    # Strip first: a whitespace-only cell must collapse to "" *before* the
    # empty-to-NaN replacement, otherwise blank-looking rows survive dropna()
    # and yield a bogus "/_PeptideGroups.txt" accession.
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    stripped = data.apply(
        lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
    )
    # Replace empty data by NaN, then remove the lines that are entirely empty
    cleaned = stripped.replace("", float("nan")).dropna(how='all')
    # Changing \ by / (regex=False: the backslash is a literal character,
    # whatever the default of the installed pandas version is)
    file_path = cleaned['File path'].str.replace('\\', '/', regex=False)
    # Building file access links; assign() returns a new frame, so no
    # intermediate result is mutated.
    return cleaned.assign(**{
        'File path': file_path,
        'Accession': file_path + "/" + cleaned["File name"] + "_PeptideGroups.txt",
    })
# Rebind `data` to the cleaned sample table, which now carries the 'Accession' column.
data=preparation_accession(data)

Preparation of ratio list file

In [None]:
# Reference table of peptide ratios (one row per peptide A / peptide B pair).
list_ratios = pd.read_excel(list_link, engine='openpyxl')

# Lookup key of peptide A: annotated sequence followed by its modifications,
# or the sequence alone when no modification is recorded.
list_ratios['Seq_modif pep A'] = list_ratios.apply(
    lambda row: (
        row['Annotated Sequence of peptide A']
        if pd.isna(row['Modifications of peptide A'])
        else row['Annotated Sequence of peptide A'] + row['Modifications of peptide A']
    ),
    axis=1,
)

# Same key for peptide B.
list_ratios['Seq_modif pep B'] = list_ratios.apply(
    lambda row: (
        row['Annotated Sequence of peptide B']
        if pd.isna(row['Modifications of peptide B'])
        else row['Annotated Sequence of peptide B'] + row['Modifications of peptide B']
    ),
    axis=1,
)

Construction of a result file with all samples and all peptides of the peptide list

In [None]:
def _single_abundance(abundances, peptide, source):
    """Return the one abundance value recorded for `peptide` in `abundances`.

    Raises ValueError when the lookup does not yield exactly one value (no
    or several 'Abundance' columns, or a duplicated peptide key), because the
    A/B ratio would then not be a single number.
    """
    values = abundances.loc[peptide].to_numpy().ravel()
    if values.size != 1:
        raise ValueError(
            "Expected exactly one abundance value for peptide %r in %r, found %d"
            % (peptide, source, values.size)
        )
    # Kept as a NumPy scalar so that a zero denominator yields inf/nan
    # (NumPy semantics) instead of raising ZeroDivisionError.
    return values[0]


def construction_file_result (sequence,list_ratios):
    """Compute, for every sample, the A/B abundance ratio of every peptide pair.

    Parameters
    ----------
    sequence : pandas.DataFrame
        One row per sample, with the columns 'Sample name' and 'Accession'
        (path or URL of the tab-separated peptide-groups file of the sample).
    list_ratios : pandas.DataFrame
        Reference table with the columns 'Seq_modif pep A' and
        'Seq_modif pep B'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `list_ratios` with one extra float column
        'Ratios <Sample name>' per sample. A ratio is 0 when peptide A or
        peptide B is absent from the sample file.

    Raises
    ------
    ValueError
        If a peptide found in a sample file does not map to exactly one
        abundance value.
    """
    ratio_columns = []
    for ind in sequence.index:
        name = sequence.loc[ind, "Sample name"]
        acces = sequence.loc[ind, 'Accession']
        peptide_groups = pd.read_table(acces)
        # Same key as in the reference table: sequence + modifications, or the
        # sequence alone when there is no modification. A plain comprehension
        # (rather than a row-wise apply) also works for an empty file.
        keys = [
            seq if pd.isna(modif) else seq + modif
            for seq, modif in zip(peptide_groups['Annotated Sequence'],
                                  peptide_groups['Modifications'])
        ]
        abundances = (
            peptide_groups
            .set_index(pd.Index(keys, name="Seq_modif"))
            .filter(like='Abundance')
        )
        ratios = []
        for r in list_ratios.index:
            pep_a = list_ratios.loc[r, 'Seq_modif pep A']
            pep_b = list_ratios.loc[r, 'Seq_modif pep B']
            if pep_a in abundances.index and pep_b in abundances.index:
                ratios.append(
                    _single_abundance(abundances, pep_a, acces)
                    / _single_abundance(abundances, pep_b, acces)
                )
            else:
                # At least one peptide of the pair was not detected
                ratios.append(0)
        ratio_columns.append(
            pd.Series(ratios, index=list_ratios.index, name='Ratios ' + name, dtype=float)
        )
    # Single concatenation at the end instead of growing the frame sample by sample
    return pd.concat([list_ratios.copy()] + ratio_columns, axis=1)
# Reads one peptide-groups file per sample and appends a 'Ratios <sample name>' column for each.
result_tot=construction_file_result(data,list_ratios)

Test function

In [None]:
fluids=['Blood','Saliva','Semen','Urine','VGF']
def Test_fluid_ratios(result_tot,data,list_ratios=None):
    """Score every sample against every body fluid.

    For a given sample and fluid, the score is the fraction of the ratios
    associated with that fluid that test positive for the sample.

    Parameters
    ----------
    result_tot : pandas.DataFrame
        Reference table extended with one 'Ratios <Sample name>' column per
        sample.
    data : pandas.DataFrame
        Sample table; only its 'Sample name' column is read.
    list_ratios : pandas.DataFrame, optional
        Reference table providing 'Associated body fluid', the two mean-ratio
        columns and the two confidence-interval bounds. Defaults to
        `result_tot`, so the function does not depend on a notebook-level
        variable. It must share its index labels with `result_tot`.

    Returns
    -------
    pandas.DataFrame
        Rows are the fluids of `fluids`, columns are the sample names, values
        are fractions between 0 and 1. The value is NaN for a fluid that has
        no ratio in the reference table (instead of a ZeroDivisionError).
    """
    reference = result_tot if list_ratios is None else list_ratios
    fluid_col = 'Associated body fluid'
    mean_fluid_col = 'Mean ratio A/B for associated body fluid'
    mean_other_col = 'Mean ratio A/B for other fluids'
    upper_col = "Upper bound of the 99.99% confidence interval for other fluids"
    lower_col = "Lower bound of the 99.99% confidence interval for other fluids"

    # Reference rows of each fluid do not depend on the sample: select them once.
    rows_by_fluid = {
        fluid: reference.loc[reference[fluid_col] == fluid, :] for fluid in fluids
    }

    scores = {}
    for ech in data['Sample name']:
        ratio_col = 'Ratios ' + ech
        sample_scores = {}
        for fluid in fluids:
            rows = rows_by_fluid[fluid]
            if rows.shape[0] == 0:
                # No ratio describes this fluid: no score can be computed
                sample_scores[fluid] = float('nan')
                continue
            pos = 0
            for ind in rows.index:
                ratio = result_tot.loc[ind, ratio_col]
                mean_fluid = rows.loc[ind, mean_fluid_col]
                mean_other = rows.loc[ind, mean_other_col]
                if mean_fluid > mean_other and ratio > rows.loc[ind, upper_col]:
                    # Ratio expected higher in this fluid: positive above the upper bound
                    pos += 1
                elif mean_fluid < mean_other and ratio > rows.loc[ind, lower_col]:
                    # NOTE(review): for a ratio expected lower in this fluid one
                    # would rather expect `ratio < lower bound`; the original
                    # comparison is kept unchanged - confirm with the authors.
                    pos += 1
            sample_scores[fluid] = pos / rows.shape[0]
        scores[ech] = sample_scores
    return pd.DataFrame(scores, index=fluids)

# Fraction of positive ratios per fluid (rows) and per sample (columns).
result_test=Test_fluid_ratios(result_tot,data)

In [None]:
# One row per sample: the fluid scores expressed as percentages, followed by
# one 'Conclusion <fluid>' flag per fluid, initialised to 0.
result = (result_test.transpose() * 100).assign(
    **{'Conclusion ' + fluid: 0 for fluid in fluids}
)

def decision_presence_fluide (line_result,list_percentage):
    """Flag the body fluids whose score reaches their decision threshold.

    Parameters
    ----------
    line_result : pandas.Series
        Scores of one sample, indexed by fluid name (same scale as the
        thresholds). It is not modified.
    list_percentage : pandas.Series or dict
        Threshold of each fluid, keyed by fluid name. The fluids to evaluate
        are taken from its keys, so no notebook-level variable is needed.

    Returns
    -------
    pandas.Series
        A copy of `line_result` where 'Conclusion <fluid>' is set to 1 for
        every fluid whose score is greater than or equal to its threshold.
        Entries of the other fluids are left as they were.
    """
    # Work on a copy so that the caller's row is never altered in place
    decided = line_result.copy()
    for fluid in list_percentage.keys():
        if decided.loc[fluid] >= list_percentage[fluid]:
            decided.loc['Conclusion ' + fluid] = 1
    return decided

# Fill in the conclusion flags one sample (row) at a time.
for sample_name in result.index:
    sample_row = result.loc[sample_name]
    result.loc[sample_name, :] = decision_presence_fluide(sample_row, list_percentage)

# Percentages and conclusions for every sample
result