File links

In [None]:
# Excel workbook holding the reference peptide list (Supplementary Table S3)
list_link = (
    'https://github.com/SMBP-lab/Body-fluid-identification-proteomics/raw/main/Supplementary_Table_S3.xlsx'
)
# Excel workbook describing the sample sequence (one row per acquisition)
sample_sequence_link = (
    'https://github.com/SMBP-lab/Body-fluid-identification-proteomics/raw/main/data/sequence_samples.xlsx'
)
# Serialized (joblib) trained model
model_link = (
    'https://github.com/SMBP-lab/Body-fluid-identification-proteomics/raw/main/best_model_random_forest.joblib'
)

Package imports

In [None]:
import pandas as pd
import numpy as np
import joblib
import requests

Loading files

In [None]:
# Read the sample sequence workbook, then keep only the rows whose
# 'Sample type' is exactly 'Sample'.
sequence_sheet = pd.read_excel(sample_sequence_link)
data = sequence_sheet[sequence_sheet['Sample type'] == 'Sample']

Preparation of data file

In [None]:
def preparation_accession(data):
    """Clean the sample sheet and build the path of each PeptideGroups file.

    Parameters
    ----------
    data : pandas.DataFrame
        Sample sheet; must contain the 'File path' and 'File name' columns.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``data`` (the input is not modified) with an extra
        'Accession' column equal to
        ``<File path>/<File name>_PeptideGroups.txt``.
    """
    # Remove spaces at the beginning and the end of each text cell. This is
    # done first so that whitespace-only cells become "" and are treated as
    # empty below. A column-wise Series.map is used instead of
    # DataFrame.applymap, which is deprecated in recent pandas releases.
    data_bis = data.apply(
        lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
    )
    # Replace empty cells by NaN
    data_bis = data_bis.replace("", float("nan"))
    # Remove lines that are entirely empty
    data_bis = data_bis.dropna(how='all')
    # Change \ to /. regex=False makes the backslash a literal character
    # whatever the pandas version (the default of `regex` changed over time).
    data_bis['File path'] = data_bis['File path'].str.replace('\\', '/', regex=False)
    # Build file access links
    data_bis["Accession"] = (
        data_bis["File path"] + "/" + data_bis["File name"] + "_PeptideGroups.txt"
    )
    return data_bis
# Swap the raw sample sheet for its cleaned version (adds the 'Accession' column)
data = preparation_accession(data=data)

Preparation of peptide list file

In [None]:
def _seq_modif_key(row):
    """Return the annotated sequence, followed by the modifications when there are any."""
    sequence, modifications = row['Annotated Sequence'], row['Modifications']
    return sequence if pd.isna(modifications) else sequence + modifications


list_pep = pd.read_excel(list_link, engine='openpyxl')
# A 0 in 'Modifications' is turned into NaN so that the key of such a row is
# the annotated sequence alone.
list_pep['Modifications'] = list_pep['Modifications'].replace(0, np.nan)
list_pep['Seq_modif'] = list_pep.apply(_seq_modif_key, axis=1)
# The sequence+modification key becomes the row index of the peptide list
list_pep = list_pep.set_index('Seq_modif')

Construction of a result table with all samples and all peptides of the peptide list

In [None]:
def construction_file_result(sequence, list_pep):
    """Append one abundance column per sample to the peptide list.

    Parameters
    ----------
    sequence : pandas.DataFrame
        One row per sample, with a 'Sample name' column and an 'Accession'
        column giving the path (or URL) of the sample's tab-separated
        PeptideGroups file.
    list_pep : pandas.DataFrame
        Peptide list whose index is the sequence+modification key of each
        peptide. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``list_pep`` followed by one 'Abundance_<Sample name>'
        column per row of ``sequence``. Peptides absent from a sample file,
        and missing abundance values, are set to 0.

    Raises
    ------
    ValueError
        If a PeptideGroups file does not contain exactly one column whose
        name includes 'Abundance'.
    """
    # Collect every column block and concatenate once at the end: calling
    # pd.concat inside the loop would copy the growing table for each sample.
    blocks = [list_pep]
    for ind in sequence.index:
        name = sequence.loc[ind, "Sample name"]
        acces = sequence.loc[ind, "Accession"]
        file_peptide_groups = pd.read_table(acces)
        # Key = annotated sequence followed by the modifications when present
        # (presumably built the same way as the index of list_pep).
        file_peptide_groups['Seq_modif'] = [
            seq if pd.isna(modif) else seq + modif
            for seq, modif in zip(
                file_peptide_groups['Annotated Sequence'],
                file_peptide_groups['Modifications'],
            )
        ]
        abundance = file_peptide_groups.set_index("Seq_modif").filter(like='Abundance')
        if abundance.shape[1] != 1:
            raise ValueError(
                f"Expected exactly one 'Abundance' column in {acces!r}, "
                f"found {abundance.shape[1]}: {list(abundance.columns)}"
            )
        abundance.columns = [f"Abundance_{name}"]
        # Align on the peptide list; reindex requires the keys of the sample
        # file to be unique.
        blocks.append(abundance.reindex(list_pep.index).fillna(0))
    return pd.concat(blocks, axis=1)
# Peptide list completed with one 'Abundance_<sample>' column per sample
result_tot = construction_file_result(sequence=data, list_pep=list_pep)

Downloading and loading the trained random forest (RF) model

In [None]:
# Download the model file. The timeout prevents the cell from hanging forever
# on a stalled connection, and raise_for_status() stops here on an HTTP error
# instead of saving an error page as 'model.joblib'.
response = requests.get(model_link, timeout=60)
response.raise_for_status()
with open('model.joblib', 'wb') as file:
    file.write(response.content)
# NOTE: joblib.load unpickles the file and can therefore execute arbitrary
# code; only load model files coming from a source you trust.
model = joblib.load('model.joblib')

Predictions

In [None]:
# Column labels given to the model outputs
fluids = ['Blood', 'Saliva', 'Semen', 'Urine', 'VGF']
# Keep the 'Abundance_' columns and transpose them: one row per sample,
# one column per peptide.
abundance_by_sample = result_tot.filter(like='Abundance_').T
Y_pred = model.predict(abundance_by_sample)
# One row per sample name, one column per fluid
result_test = pd.DataFrame(Y_pred, index=data['Sample name'], columns=fluids)