In [1]:
# imports
import os
import pathlib
import pandas as pd
import cobra

In [2]:
# get current directory
current_dir = pathlib.Path(os.getcwd())

In [3]:
# ****************************************************************************************************
# Step 6:   Calculating protein concentrations from LC-MS/MS data 
# ****************************************************************************************************

# inputs

# path to the protein measurement data
data_path = current_dir.parent / "datasets" / "A735_protein_data_CCM_PRM_raw_data.xlsx"
# path to Human1 SBML
human1_path = current_dir.parent / "models" / "Human-GEM.xml"
# path to ftINIT model SBML
model_path = current_dir.parent / "models" / "A375_ftINIT_1+0_thr100_model.xml"
# output file path
output_path = data_path.parent / "A735_protein_data_prep.tsv"

In [4]:
# Read the first sheet of the protein measurement Excel file into a DataFrame
all_prot = pd.read_excel(data_path)

In [5]:
print(f"Number of measured Proteins: {all_prot["Protein Accession"].nunique()}")

Number of measured Proteins: 82


In [6]:
# filter for data from the primary tumor
primary_prot = all_prot.loc[all_prot["Condition"] == "SQ"]

In [7]:
# create dataframe that maps sample peptides and peptide standards next to each other
primary_comparison = pd.concat(
    [
        primary_prot.loc[primary_prot["Isotope Label Type"] == "light", ["Protein Accession", "Protein", "Peptide Sequence", "BioReplicate", "Total Area"]].reset_index(drop=True), 
        primary_prot.rename(columns={"Total Area": "Heavy Spike Area"}).loc[primary_prot["Isotope Label Type"] == "heavy", ["Heavy Spike Area"]].reset_index(drop=True)
    ], 
    axis=1
    )

In [8]:
# calculate quotient of sample and standard measurements
primary_comparison["Relative Protein Content"] = primary_comparison["Total Area"] / primary_comparison["Heavy Spike Area"]

# calculate the absolute protein content based on peptide standard concentrations
# sample weight: 10 ng = 10 * 10e-9 g =  0.00000001 g
# amount of heavy peptide: 50 fmol = 50 * 10e-12 mmol = 0.00000000005 mmol 
primary_comparison["Absolute Protein Content [mmol/gDW]"] = primary_comparison["Relative Protein Content"] * 0.00000000005 / 0.00000001
primary_comparison.head()

Unnamed: 0,Protein Accession,Protein,Peptide Sequence,BioReplicate,Total Area,Heavy Spike Area,Relative Protein Content,Absolute Protein Content [mmol/gDW]
0,P52209,6PGD_HUMAN,VDDFLANEAK,1.0,1991927.0,1223529000.0,0.001628,8e-06
1,P52209,6PGD_HUMAN,VDDFLANEAK,2.0,2491982.0,945659300.0,0.002635,1.3e-05
2,P52209,6PGD_HUMAN,VDDFLANEAK,3.0,2275862.0,1091832000.0,0.002084,1e-05
3,P52209,6PGD_HUMAN,VDDFLANEAK,1.0,1991927.0,1223529000.0,0.001628,8e-06
4,P52209,6PGD_HUMAN,VDDFLANEAK,2.0,2491982.0,945659300.0,0.002635,1.3e-05


In [9]:
# calculate the average concentration of peptides associated with each protein
primary_conc = primary_comparison.groupby(["Protein Accession"])["Absolute Protein Content [mmol/gDW]"].agg(['mean', 'std']).reset_index()
primary_conc.head()

Unnamed: 0,Protein Accession,mean,std
0,O00330,9e-06,5.308544e-06
1,O00757,4e-06,2.432937e-06
2,O14556,2e-06,2.998146e-07
3,O43837,8e-06,1.889814e-06
4,O75390,2.2e-05,2.217726e-05


In [10]:
# loading Human1 model for protein-gene-mapping
ihuman = cobra.io.read_sbml_model(human1_path)

In [11]:
# retrieving protein accession numbers & gene ids
# and calculating number for synonyms
gene_id_mapping = {}
num_synonyms = 0
for gene in ihuman.genes:
    if isinstance(gene.annotation["uniprot"], list):
        for synonym in gene.annotation["uniprot"]:
            gene_id_mapping[synonym] = gene.id
            num_synonyms += 1
        num_synonyms -= 1 # to count only additional synonyms
    else:
        gene_id_mapping[gene.annotation["uniprot"]] = gene.id

print(f"Number of proteins (with protein accession) in Human-GEM: {len(gene_id_mapping.keys()) - num_synonyms}")
measured_proteins = primary_comparison["Protein Accession"].unique()
print(f"Number of proteins measuered: {len(measured_proteins)}")
measured_proteins_model = set(gene_id_mapping.keys()).intersection(set(measured_proteins))
print(f'Number of measured proteins present in the model: {len(measured_proteins_model)}')

Number of proteins (with protein accession) in Human-GEM: 2875
Number of proteins measuered: 82
Number of measured proteins present in the model: 80


In [12]:
# load A375 model to filter for genes in tissue-specific model
A375_ftINIT = cobra.io.read_sbml_model(model_path)
A375_model_genes = []
for gene in A375_ftINIT.genes:
    A375_model_genes.append(gene.id)

In [15]:
# save the protein concentrations as tsv files for copy-pasting into autopacmen input file
model_conc = primary_conc.loc[primary_conc["Protein Accession"].isin(measured_proteins_model)]
model_conc["Gene ID"] = [gene_id_mapping[accession_number] for accession_number in model_conc["Protein Accession"]]
print(len(model_conc))
final_conc = model_conc.loc[model_conc["Gene ID"].isin(A375_model_genes)]
print(len(final_conc))
final_conc.rename(columns={"mean": "Protein Concentration"}).to_csv(output_path, sep="\t", index=False, float_format="%.15g")
final_conc.rename(columns={"mean": "Protein Concentration"})

80
79


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_conc["Gene ID"] = [gene_id_mapping[accession_number] for accession_number in model_conc["Protein Accession"]]


Unnamed: 0,Protein Accession,Protein Concentration,std,Gene ID
0,O00330,8.786247e-06,5.308544e-06,ENSG00000110435
1,O00757,4.277368e-06,2.432937e-06,ENSG00000130957
2,O14556,2.261984e-06,2.998146e-07,ENSG00000105679
3,O43837,7.776139e-06,1.889814e-06,ENSG00000101365
4,O75390,2.204465e-05,2.217726e-05,ENSG00000062485
...,...,...,...,...
77,Q9NQR9,1.795760e-04,2.759857e-04,ENSG00000152254
78,Q9NR19,2.758456e-06,9.208418e-07,ENSG00000131069
79,Q9NUB1,1.149030e-05,9.065110e-06,ENSG00000154930
80,Q9P2R7,1.254821e-05,4.057558e-06,ENSG00000136143
