## Create the proteome NSAF of all proteins not connected to the tissue 

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import sqlite3
import mysql.connector

In [2]:
conn = mysql.connector.connect(user='root', password='password', host='127.0.0.1', port='3306',database='expression_atlas_cells2', auth_plugin='mysql_native_password')
mycursor = conn.cursor()

# check the connection
if conn.is_connected():
    print("connection succesfull")
else:
    print("no connection")


connection succesfull


In [3]:
assaysql = "SELECT assay_id, peptide_id, quantification FROM peptide_to_assay"
assayData = pd.read_sql_query(assaysql, conn)
assayData.head()

Unnamed: 0,assay_id,peptide_id,quantification
0,1,1,1.0
1,2,1,1.0
2,3,1,1.0
3,4,1,2.0
4,5,1,1.0


Get all peptide to protein relations

In [4]:
pepsql = "SELECT peptide_to_protein.peptide_id, peptide_to_protein.uniprot_id FROM peptide_to_protein"
pepData = pd.read_sql_query(pepsql, conn)
pepData.head()

Unnamed: 0,peptide_id,uniprot_id
0,3406663,A0A024RBG1
1,4461690,A0A024RBG1
2,4555865,A0A075B6H7
3,1187159,A0A075B6H8
4,1122869,A0A075B6I1


Get sequence length for all proteins in database

In [5]:
seqsql = "SELECT uniprot_id, length FROM protein WHERE length IS NOT NULL"
seqData = pd.read_sql_query(seqsql, conn)
#seqData

In [6]:
seqData['length'] = pd.to_numeric(seqData['length'], errors='coerce')

In [7]:
seqData.head()

Unnamed: 0,uniprot_id,length
0,A0A024RBG1,181.0
1,A0A075B6H7,116.0
2,A0A075B6H8,117.0
3,A0A075B6I1,120.0
4,A0A075B6K6,122.0


Select proteotypic peptides: peptides which have only 1 peptide-protein relation

In [8]:
proteotypicData = pepData.groupby("peptide_id").filter(lambda x: len(x) == 1)
#proteotypicData

Protein inference: Only keep proteins which have at least 3 proteotypic peptides

In [9]:
proteins = proteotypicData.groupby("uniprot_id").filter(lambda x: len(x) > 2)
proteins.shape

(124609, 2)

In [10]:
non_human_proteins = ['ADH1_YEAST', 'ALBU_BOVIN', 'ALDOA_RABIT', 'BGAL_ECOLI', 'CAH2_BOVIN', 'CAS1_BOVIN', 'CAS2_BOVIN', 'CASB_BOVIN', 'CASK_BOVIN', 'CYC_HORSE', 'DHE3_BOVIN', 'GAG_SCVLA', 'GFP_AEQVI', 'K1C15_SHEEP', 'K1M1_SHEEP', 'K1M2_SHEEP', 'K2M1_SHEEP', 'K2M2_SHEEP', 'K2M3_SHEEP', 'KRA3_SHEEP', 'KRA61_SHEEP', 'LALBA_BOVIN', 'LYSC_CHICK', 'LYSC_LYSEN', 'MYG_HORSE', 'REF_HEVBR', 'SRPP_HEVBR', 'TRY1_BOVIN', 'TRYP_PI', 'Q9BZD3','TRYP_PIG', 'TRY2_BOVIN','TRY1_BOVIN','SSPA_STAAU','SRPP_HEVBR','REF_HEVBR', 'ADH1_YEAST', 'ALBU_BOVIN', 'CAS1_BOVIN', 'CAS2_BOVIN', 'CASK_BOVIN', 'CASB_BOVIN', 'OVAL_CHICK', 'ALDOA_RABIT', 'BGAL_ECOLI', 'CAH2_BOVIN', 'CTRA_BOVIN', 'CTRB_BOVIN', 'CYC_HORSE', 'DHE3_BOVIN', 'GAG_SCVLA', 'GFP_AEQVI', 'K1C15_SHEEP', 'K1M1_SHEEP', 'K2M2_SHEEP', 'K2M3_SHEEP', 'KRA3A_SHEEP', 'KRA3_SHEEP', 'KRA61_SHEEP', 'LALBA_BOVIN', 'LYSC_CHICK', 'LYSC_LYSEN', 'MYG_HORSE', 'K1M2_SHEEP', 'K2M1_SHEEP']

In [11]:
proteins = proteins[~proteins['uniprot_id'].isin(non_human_proteins)]

Merge assays containing spectral counts and proteins

In [12]:
protData = pd.merge(assayData, proteins, on = 'peptide_id').sort_values(['assay_id','uniprot_id'])
protData.head(50)
#protData

Unnamed: 0,assay_id,peptide_id,quantification,uniprot_id
165624,1,899,1.0,A0A1B0GUS4
1204271,1,28806,1.0,A0A1B0GUS4
100099,1,506,1.0,A0AVT1
972576,1,9889,1.0,A0AVT1
223475,1,1336,1.0,A6NDG6
870685,1,7004,1.0,A6NDG6
886103,1,7238,1.0,A6NDG6
47219,1,203,2.0,A6NHL2
534668,1,2926,1.0,A6NHQ2
849469,1,6669,1.0,A6NHR9


In [13]:
del protData['peptide_id']

Split data per assay

In [14]:
assays = protData['assay_id'].unique()
DataFrameDict = {elem : pd.DataFrame for elem in assays}
for key in DataFrameDict.keys():
    DataFrameDict[key] = protData[:][protData['assay_id'] == key]

Calculate NSAF score for each protein per assay

In [15]:
DataFrameDict2 = DataFrameDict.copy()

In [16]:
seqData

Unnamed: 0,uniprot_id,length
0,A0A024RBG1,181.0
1,A0A075B6H7,116.0
2,A0A075B6H8,117.0
3,A0A075B6I1,120.0
4,A0A075B6K6,122.0
...,...,...
11360,Q9Y6Y9,160.0
11361,Q9Y6Z4,181.0
11362,uniprot_id,
11363,W5XKT8,324.0


In [17]:
for count, key in enumerate(DataFrameDict2.keys()):
    sumSaf = 0
    assay = DataFrameDict2[key]
    assay.pop("assay_id")

    #calculate sum of spectral counts for each protein
    grouped = DataFrameDict2[key].groupby("uniprot_id").sum().reset_index()
    seqAddedDF = pd.merge(grouped, seqData, on = "uniprot_id")
    seqAddedDF.insert(loc = 2, column = 'SAF', value = 0)
    seqAddedDF.insert(loc = 3, column = 'NSAF', value = 0)
    
    #Calculate SAF score for each protein by dividing sum of spectral counts by protein length
    for index, row in seqAddedDF.iterrows():
        saf = row['quantification']/row['length']
        seqAddedDF.loc[index, 'SAF'] = saf
        # calculate sum of SAF scores in assay
        sumSaf += saf

    # Calculate NSAF score by normalizing each SAF score
    seqAddedDF["NSAF"] = seqAddedDF["SAF"] / sumSaf
    
    del seqAddedDF['length']
    del seqAddedDF['quantification']
    del seqAddedDF['SAF']
    seqAddedDF.insert(loc = 0, column = 'assay_id', value = key)
    DataFrameDict2[key] = seqAddedDF

In [18]:
DataFrameDict2[1]

Unnamed: 0,assay_id,uniprot_id,NSAF
0,1,A0A1B0GUS4,0.001458
1,1,A0AVT1,0.000213
2,1,A6NDG6,0.001049
3,1,A6NHL2,0.000503
4,1,A6NHQ2,0.000336
...,...,...,...
889,1,Q9Y617,0.003641
890,1,Q9Y673,0.000347
891,1,Q9Y678,0.000385
892,1,Q9Y6C9,0.000371


In [19]:
proteinData = pd.DataFrame()

for key in DataFrameDict2.keys():
    proteinData = proteinData.append(DataFrameDict2[key])


In [20]:
proteinData.head()

Unnamed: 0,assay_id,uniprot_id,NSAF
0,1,A0A1B0GUS4,0.001458
1,1,A0AVT1,0.000213
2,1,A6NDG6,0.001049
3,1,A6NHL2,0.000503
4,1,A6NHQ2,0.000336


In [21]:
# write NSAF proteome to file
proteinData.to_csv(path_or_buf = 'proteome_nsaf_tine.csv', index = False)

In [3]:
df = pd.read_csv( '../Atlas_and_parser/Atlas_output/proteome_nsaf_update0806.csv')
df.head()

Unnamed: 0,assay_id,uniprot_id,NSAF
0,13988,A2RRP1,0.000259
1,13988,A5YM72,0.000372
2,13988,O00483,0.00759
3,13988,O14495,0.000988
4,13988,O14594,0.000931
