Many assays contain only a small number of proteins, too few to represent a cell line proteome. These assays originate from fractionated samples whose fractions were loaded separately on the LC-MS/MS, giving a separate raw file for each fraction. The fractions are pooled here after manual annotation (shown in *Format_pool.csv*), so that every assay represents the full proteome of a cell line.

In [1]:
import pandas as pd
import mysql.connector
import numpy as np

In [2]:
# Open the connection to the expression atlas MySQL database.
# NOTE(review): the credentials are hard-coded in the notebook; consider
# loading them from configuration that is kept out of version control.
conn = mysql.connector.connect(
    user='root',
    password='password',
    host='127.0.0.1',
    port='3306',
    database='expression_atlas_cells',
)
mycursor = conn.cursor(buffered=True)

# Report whether the connection is usable before running any query.
print("connection succesfull" if conn.is_connected() else "no connection")

connection succesfull


In [3]:
# Load the quantification (spectral count) of every peptide in every assay.
assaysql = "SELECT assay_id, peptide_id, quantification FROM peptide_to_assay"
assayData = pd.read_sql_query(sql=assaysql, con=conn)
# Preview the first five rows.
assayData.head(n=5)

Unnamed: 0,assay_id,peptide_id,quantification
0,31000,110730450,1.0
1,31002,110730450,3.0
2,31003,110730450,2.0
3,31048,110730450,2.0
4,31049,110730450,2.0


Get all peptide to protein relations

In [4]:
# Load every peptide-to-protein relation stored in the database.
pepsql = "SELECT peptide_to_protein.peptide_id, peptide_to_protein.uniprot_id FROM peptide_to_protein"
pepData = pd.read_sql_query(sql=pepsql, con=conn)
# Preview the first five rows.
pepData.head(n=5)

Unnamed: 0,peptide_id,uniprot_id
0,110745848,A0A024RBG1
1,112000279,A0A024RBG1
2,112950201,A0A075B6I1
3,115802737,A0A075B6R2
4,113028442,A0A075B6R9


Get sequence length for all proteins in the database

In [5]:
# Load the sequence length of every protein; the length is the denominator
# of the SAF score computed later in the notebook.
seqsql = "SELECT uniprot_id, length FROM protein WHERE length IS NOT NULL"
seqData = pd.read_sql_query(seqsql, conn)
# Values that cannot be parsed as a number become NaN here.
seqData["length"] = pd.to_numeric(seqData["length"], errors="coerce")
# Drop those rows: the query already excludes missing lengths, and a NaN
# length would turn the protein's SAF into NaN and poison the per-assay
# SAF sum used for normalisation.
seqData = seqData.dropna(subset=["length"])
seqData.head()

Unnamed: 0,uniprot_id,length
0,A0A024RBG1,181.0
1,A0A075B6I1,120.0
2,A0A075B6R2,117.0
3,A0A075B6R9,120.0
4,A0A075B6V5,113.0


Select proteotypic peptides (peptides that map to exactly one protein, i.e. have a single peptide-to-protein relation)

In [6]:
# A peptide is proteotypic when its peptide_id occurs in exactly one
# peptide-to-protein row. Count the rows per peptide_id and keep the
# peptides whose count is one.
rows_per_peptide = pepData.groupby("peptide_id")["peptide_id"].transform("size")
proteotypicData = pepData[rows_per_peptide == 1]
proteotypicData.head()

Unnamed: 0,peptide_id,uniprot_id
0,110745848,A0A024RBG1
1,112000279,A0A024RBG1
2,112950201,A0A075B6I1
3,115802737,A0A075B6R2
4,113028442,A0A075B6R9


Select proteins which have more than 2 proteotypic peptides

In [7]:
# Keep only proteins supported by more than two (i.e. at least three)
# proteotypic peptides.
# NOTE(review): an earlier comment said "at least 2"; the threshold applied
# here is "> 2" — confirm which one is intended.
peptides_per_protein = proteotypicData.groupby("uniprot_id")["uniprot_id"].transform("size")
proteins = proteotypicData[peptides_per_protein > 2]
proteins.shape

(160681, 2)

Drop non-human proteins

In [8]:
# Identifiers of non-human proteins that must be excluded from the proteome.
# Both names are bound to the same list object.
non_human_proteins = non_human = [
    "ADH1_YEAST",
    "ALBU_BOVIN",
    "ALDOA_RABIT",
    "BGAL_ECOLI",
    "CAH2_BOVIN",
    "CAS1_BOVIN",
    "CAS2_BOVIN",
    "CASB_BOVIN",
    "CASK_BOVIN",
    "CYC_HORSE",
    "DHE3_BOVIN",
    "GAG_SCVLA",
    "GFP_AEQVI",
    "K1C15_SHEEP",
    "K1M1_SHEEP",
    "K1M2_SHEEP",
    "K2M1_SHEEP",
    "K2M2_SHEEP",
    "K2M3_SHEEP",
    "KRA3_SHEEP",
    "KRA61_SHEEP",
    "LALBA_BOVIN",
    "LYSC_CHICK",
    "LYSC_LYSEN",
    "MYG_HORSE",
    "REF_HEVBR",
    "SRPP_HEVBR",
    "TRY1_BOVIN",
    "TRYP_PIG",
]
# Remove every row whose uniprot_id is listed in non_human_proteins.
# NOTE(review): the list holds UniProt entry names (e.g. "ALBU_BOVIN") while
# the uniprot_id values shown earlier look like accessions (e.g.
# "A0A024RBG1") — confirm that this filter actually matches any rows.
proteins = proteins[~proteins["uniprot_id"].isin(non_human_proteins)]

Merge assays containing spectral counts and proteins

In [9]:
# Attach to each peptide quantification the protein its peptide belongs to
# (inner join on peptide_id), order the rows by assay and protein, and drop
# the peptide identifier, which is no longer needed.
protData = (
    assayData.merge(proteins, on="peptide_id")
    .sort_values(["assay_id", "uniprot_id"])
    .drop(columns="peptide_id")
)
protData.head(10)

Unnamed: 0,assay_id,quantification,uniprot_id
797770,30960,1.0,A0A1B0GUS4
813680,30960,2.0,A0AVT1
1651976,30960,3.0,A0AVT1
1681511,30960,2.0,A0MZ66
1890952,30960,1.0,A1L170
1665418,30960,2.0,A2RRP1
1889165,30960,1.0,A2RUC4
1889242,30960,1.0,A3KMH1
1667244,30960,2.0,A4D1E9
1841187,30960,3.0,A6NCS6


Now load the manually curated CSV file (*Format_pool.csv*) with the following characteristics:<br>
- contains assay_id, PXD_accession and new label, "pool_id".
    - 0 = no pooling needed
    - no_annotation = drop it
    - x = unique integer within a project which represents the pool.

In [None]:
# Create excel file
# NOTE(review): this query string is incomplete (it has no FROM clause) and
# is not executed anywhere in the visible notebook — presumably a leftover
# from creating the annotation template; confirm or remove.
query = "SELECT assay_id, project_id, filename"

In [10]:
# Read the manual pooling annotation and keep only the columns used below.
pool_formatting = pd.read_csv("Format_pool.csv", sep=";").loc[
    :, ["assay_id", "PXD_accession", "pool_id"]
]
# Discard assays without an annotation, then renumber the rows from zero.
is_annotated = pool_formatting["pool_id"] != "no_annotation"
pool_formatting = pool_formatting[is_annotated].reset_index(drop=True)
# The remaining pool ids are numeric labels; store them as integers.
pool_formatting["pool_id"] = pool_formatting["pool_id"].astype(int)
pool_formatting

Unnamed: 0,assay_id,PXD_accession,pool_id
0,30960,PXD000533,1
1,30961,PXD000533,2
2,30962,PXD000533,1
3,30963,PXD000533,1
4,30964,PXD000533,2
...,...,...,...
812,32907,PXD017391,0
813,32908,PXD017391,0
814,32909,PXD017391,0
815,32910,PXD017391,0


Select files which need no pooling

In [11]:
# Assay ids labelled with pool_id 0, i.e. assays that need no pooling.
no_pool = pool_formatting.loc[pool_formatting["pool_id"] == 0, "assay_id"].tolist()

Group the files that need pooling for each project

In [12]:
# Build a list of pools, each pool being the list of assay ids that share a
# (PXD_accession, pool_id) pair. Assays that need no pooling are excluded.
# Selecting the assay_id column before apply() keeps the result a Series even
# when no assay needs pooling (so .tolist() yields []), and avoids applying a
# function over the grouping columns themselves.
needs_pooling = ~pool_formatting["assay_id"].isin(no_pool)
pools = (
    pool_formatting[needs_pooling]
    .groupby(["PXD_accession", "pool_id"])["assay_id"]
    .apply(list)
    .tolist()
)

Split data per assay or group of assays

In [13]:
# One table per unpooled assay, keyed by its assay_id and holding the rows of
# protData that belong to that assay.
DataFrameDict_no_pool = {
    assay_id: protData[protData["assay_id"] == assay_id] for assay_id in no_pool
}

In [14]:
# One table per pool, keyed by the first assay_id of the pool and holding the
# rows of protData from every assay in that pool.
DataFrameDict_pooled = {
    members[0]: protData[protData["assay_id"].isin(members)] for members in pools
}

In [15]:
# Report the size of both collections, then fold the pooled tables into the
# unpooled dictionary so that it holds every proteome from here on.
n_unpooled = len(DataFrameDict_no_pool)
n_pooled = len(DataFrameDict_pooled)
print(f"No pool assays: {n_unpooled}\nPooled assays: {n_pooled}")
DataFrameDict_no_pool.update(DataFrameDict_pooled)
n_total = len(DataFrameDict_no_pool)
print(f"Concatened length: {n_total}")

No pool assays: 272
Pooled assays: 46
Concatened length: 318


In [16]:
# Shallow copy: a new dictionary whose values are the same DataFrame objects
# as in DataFrameDict_no_pool.
DataFramaDict2 = dict(DataFrameDict_no_pool)

Calculate NSAF for each protein

In [17]:
def _nsaf_table(assay_counts, protein_lengths, assay_label):
    """Return the NSAF score of every protein in one (pooled) assay.

    Parameters:
        assay_counts: frame with at least the columns ``uniprot_id`` and
            ``quantification``; it is not modified.
        protein_lengths: frame with the columns ``uniprot_id`` and ``length``.
        assay_label: value written to the ``assay_id`` column of the result.

    Returns:
        A frame with the columns ``assay_id``, ``uniprot_id`` and ``NSAF``,
        one row per protein that has a usable length.
    """
    # Sum the counts per protein. Only the count column is selected, so the
    # input frame does not have to be altered before grouping.
    spectral_counts = assay_counts.groupby("uniprot_id", as_index=False)[
        "quantification"
    ].sum()
    with_length = pd.merge(spectral_counts, protein_lengths, on="uniprot_id")

    # A missing or non-positive length gives a NaN or infinite SAF, which
    # would invalidate the normalisation of the whole assay; skip those rows.
    with_length = with_length[with_length["length"] > 0]

    # SAF = summed counts / protein length; NSAF = SAF / sum of SAF in assay.
    saf = with_length["quantification"] / with_length["length"]
    nsaf = saf / saf.sum()

    result = with_length[["uniprot_id"]].reset_index(drop=True)
    result["NSAF"] = nsaf.to_numpy()
    result.insert(loc=0, column="assay_id", value=assay_label)
    return result


# Replace every per-assay count table by its NSAF table. The input tables are
# left untouched because they are shared with DataFrameDict_no_pool (the
# dictionary copy is shallow).
for key in list(DataFramaDict2):
    DataFramaDict2[key] = _nsaf_table(DataFramaDict2[key], seqData, key)

In [18]:
# Stack the per-assay NSAF tables into one long table. A single pd.concat
# replaces the repeated DataFrame.append calls: append was removed in
# pandas 2.0 and copied the accumulated frame on every iteration.
nsaf_tables = list(DataFramaDict2.values())
# pd.concat raises on an empty list, so fall back to an empty frame.
proteinData = pd.concat(nsaf_tables) if nsaf_tables else pd.DataFrame()

# Number of distinct proteomes in the combined table.
len(proteinData.assay_id.unique())

318

In [19]:
# Preview the first rows of the combined NSAF table.
proteinData.head()

Unnamed: 0,assay_id,uniprot_id,NSAF
0,31098,A0AVT1,0.000493
1,31098,A1L020,0.000664
2,31098,A5YKK6,9.7e-05
3,31098,A6NGN9,0.000171
4,31098,O00264,0.001772


In [20]:
# Write the NSAF proteome to disk without the index column, then read the
# file back to check what was written.
nsaf_csv = 'pooled_proteome_nsaf_1.csv'
proteinData.to_csv(nsaf_csv, index=False)
df = pd.read_csv(nsaf_csv)
df.head()

Unnamed: 0,assay_id,uniprot_id,NSAF
0,31098,A0AVT1,0.000493
1,31098,A1L020,0.000664
2,31098,A5YKK6,9.7e-05
3,31098,A6NGN9,0.000171
4,31098,O00264,0.001772
