In [2]:
import pandas as pd
import numpy as np 
import glob
from biopandas.pdb import PandasPdb
import warnings
warnings.filterwarnings("ignore")
from get_distances import *

In [3]:
phosphosite_data = pd.read_csv("/qfs/projects/proteometer/phospho_site_plus_human_20240422.tsv", delimiter='\t')
pka_data = pd.read_csv("/rcfs/projects/proteometer/all_pka.csv", header=0)

In [4]:
pka_data = pka_data.drop(columns=["Unnamed: 0"])
pka_data

Unnamed: 0,uniprotID,AA,res_number,pK,state,position
0,A0PJZ3,NTR,5001,6.891614,undefined,5001.0
1,A0PJZ3,LYS,2,10.328119,protonated,2.0
2,A0PJZ3,SER,5,,undefined,5.0
3,A0PJZ3,LYS,6,10.375520,protonated,6.0
4,A0PJZ3,SER,24,,undefined,24.0
...,...,...,...,...,...,...
5916609,P51587,ASP,1397,3.790052,deprotonated,
5916610,P51587,CYS,1398,9.230363,protonated,
5916611,P51587,THR,1399,,undefined,
5916612,P51587,CTR,6400,2.997512,deprotonated,


In [5]:
phosphosite_data


Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,LT_LIT,MS_LIT,MS_CST,CST_CAT#,Ambiguous_Site
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,,3.0,1.0,,0
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,,8.0,,,0
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,,,4.0,,0
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,,,1.0,,0
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,,4.0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389029,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,,1.0,,,0
389030,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,,1.0,,,0
389031,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,,1.0,,,0
389032,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,1.0,,,,0


In [6]:
# clean phosphosite_data
phosphosite_data['RES_NUM'] = phosphosite_data['MOD_RSD'].str.split('-').str[0].str[1:] # split string on '-' and get rid of first letter
phosphosite_data['PKA_ID'] = phosphosite_data['ACC_ID'] +"_"+ phosphosite_data['RES_NUM']
phosphosite_data


Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,LT_LIT,MS_LIT,MS_CST,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,,3.0,1.0,,0,2,P31946_2
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,,8.0,,,0,6,P31946_6
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,,,4.0,,0,21,P31946_21
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,,,1.0,,0,32,P31946_32
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,,4.0,,,0,39,P31946_39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389029,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,,1.0,,,0,450,Q9UKY1_450
389030,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,,1.0,,,0,891,Q9Y2X9_891
389031,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,,1.0,,,0,1196,O15014_1196
389032,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,1.0,,,,0,169,Q15942_169


In [7]:
pka_data['PKA_ID'] = pka_data['uniprotID'].str.split('-').str[0] +"_"+ pka_data['res_number'].apply(str)
pka_data

Unnamed: 0,uniprotID,AA,res_number,pK,state,position,PKA_ID
0,A0PJZ3,NTR,5001,6.891614,undefined,5001.0,A0PJZ3_5001
1,A0PJZ3,LYS,2,10.328119,protonated,2.0,A0PJZ3_2
2,A0PJZ3,SER,5,,undefined,5.0,A0PJZ3_5
3,A0PJZ3,LYS,6,10.375520,protonated,6.0,A0PJZ3_6
4,A0PJZ3,SER,24,,undefined,24.0,A0PJZ3_24
...,...,...,...,...,...,...,...
5916609,P51587,ASP,1397,3.790052,deprotonated,,P51587_1397
5916610,P51587,CYS,1398,9.230363,protonated,,P51587_1398
5916611,P51587,THR,1399,,undefined,,P51587_1399
5916612,P51587,CTR,6400,2.997512,deprotonated,,P51587_6400


In [8]:
len(pka_data['PKA_ID'].unique())

4350682

In [9]:
len(pka_data['uniprotID'].unique())

20595

In [10]:
full_data = pd.merge(phosphosite_data, pka_data, on="PKA_ID", how = "left")
print(full_data.shape)
full_data

(436145, 23)


Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID,uniprotID,AA,res_number,pK,state,position
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,,0,2,P31946_2,P31946,THR,2.0,,undefined,2.0
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,,0,6,P31946_6,P31946,SER,6.0,,undefined,6.0
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,,0,21,P31946_21,P31946,TYR,21.0,11.100927,protonated,21.0
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,,0,32,P31946_32,P31946,THR,32.0,,undefined,32.0
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,,0,39,P31946_39,P31946,SER,39.0,,undefined,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,,0,450,Q9UKY1_450,Q9UKY1,SER,450.0,,undefined,450.0
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,,0,891,Q9Y2X9_891,Q9Y2X9,SER,891.0,,undefined,891.0
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,,0,1196,O15014_1196,O15014,SER,1196.0,,undefined,1196.0
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,,0,169,Q15942_169,Q15942,SER,169.0,,undefined,169.0


In [11]:
full_noduplicates_data = full_data.drop_duplicates()
full_noduplicates_data

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID,uniprotID,AA,res_number,pK,state,position
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,,0,2,P31946_2,P31946,THR,2.0,,undefined,2.0
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,,0,6,P31946_6,P31946,SER,6.0,,undefined,6.0
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,,0,21,P31946_21,P31946,TYR,21.0,11.100927,protonated,21.0
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,,0,32,P31946_32,P31946,THR,32.0,,undefined,32.0
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,,0,39,P31946_39,P31946,SER,39.0,,undefined,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,,0,450,Q9UKY1_450,Q9UKY1,SER,450.0,,undefined,450.0
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,,0,891,Q9Y2X9_891,Q9Y2X9,SER,891.0,,undefined,891.0
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,,0,1196,O15014_1196,O15014,SER,1196.0,,undefined,1196.0
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,,0,169,Q15942_169,Q15942,SER,169.0,,undefined,169.0


In [12]:
print(pka_data.shape)
print(phosphosite_data.shape)
print(full_noduplicates_data.shape) # 1468 of them didn't match up 
# I guess this makes sense. pka didn't output for ALL residues, and that's not that large of a fraction


(5916614, 7)
(389034, 17)
(436012, 23)


In [13]:
full_noduplicates_data['PKA_ID'].nunique()

370369

In [14]:
full_noduplicates_data['PKA_ID'].value_counts()

PKA_ID
Q8WZ42_1054    91
Q8WZ42_565     89
Q8WZ42_308     87
Q8WZ42_1159    86
Q8WZ42_300     86
               ..
O75592_2348     1
O75592_2344     1
O75592_2342     1
O75592_2338     1
O75592_2701     1
Name: count, Length: 370369, dtype: int64

In [15]:
print(full_data['pK'].notna().sum(), "rows have pka")
print(189514/425877,"have pka annotations")

189765 rows have pka
0.44499702965879817 have pka annotations


In [16]:
# adding the interfaces data
interfaces_data = pd.read_csv("/rcfs/projects/proteometer/ProtVar/predictions/interfaces/2024.05.28_interface_summary_5A.tsv", delimiter='\t', header=0)
interfaces_data

Unnamed: 0,interaction_id,pdockq,uniprot_id1,uniprot_id2,chain1,chain2,ifresid1,ifresid2,sources,n_references,pdb
0,O75106_Q16853,0.74,O75106,Q16853,A,B,"R169,A203,A204,V205,H206,L212,R213,W220,N226,I...","P39,V209,L218,Q219,W226,N232,I233,S234,G235,A2...","BioGRID,humap,intact,string",2,O75106/O75106_Q16853.pdb
1,Q15118_Q15118,0.73,Q15118,Q15118,A,B,"S53,P54,P56,Y179,D182,R183,M186,L255,A257,H304...","S53,P54,P56,Y179,D182,R183,M186,E253,L255,A257...","BioGRID,intact",2,Q15118/Q15118_Q15118.pdb
2,P11142_Q92598,0.73,P11142,Q92598,A,B,"K25,E27,I28,A30,N31,D32,Q33,G34,R36,E48,L50,D5...","R19,A27,N28,E29,F30,S31,R33,N54,T58,Y184,R261,...","BioGRID,corum,humap,intact,otar,string,xlinkdb",9,P11142/P11142_Q92598.pdb
3,Q13326_Q16585,0.73,Q13326,Q16585,A,B,"V40,L41,L43,L44,L47,V48,N50,L51,T54,I55,L58,F6...","V68,I69,L71,L72,L75,A76,I78,N79,I82,I86,M100,F...","corum,otar,string",0,Q13326/Q13326_Q16585.pdb
4,Q13326_Q92629,0.73,Q13326,Q92629,A,B,"K33,L36,Y37,V40,L41,L43,L44,L47,V48,N50,L51,T5...","R30,K31,C33,L34,F37,V38,L40,L41,L44,I45,V47,N4...","corum,string",0,Q13326/Q13326_Q92629.pdb
...,...,...,...,...,...,...,...,...,...,...,...
486094,P23193_Q92889,0.00,,,,,,,otar,0,
486095,P23193_Q92541,0.00,,,,,,,"BioGRID,intact,otar,string",1,
486096,P23193_Q8WX92,0.00,,,,,,,"otar,string",0,
486097,P23193_Q8WVC0,0.00,,,,,,,"BioGRID,intact,otar,string",2,


In [17]:
# adding pockets data
pockets_data = pd.read_csv("/rcfs/projects/proteometer/ProtVar/predictions/pockets/2024.05.28_pockets.tsv", delimiter='\t', header=0)
pockets_data

Unnamed: 0,struct_id,pocket_id,pocket_rad_gyration,pocket_energy_per_vol,pocket_buriedness,pocket_resid,pocket_plddt_mean,pocket_score_combined_scaled
0,A0A024R1R8-F1,1,4.042788,0.316535,0.772959,"{21,22,23,24,25,26,28,29,32}",83.937778,283.034096
1,A0A024R1R8-F1,2,3.175737,0.347111,0.808219,"{12,13,14,15,16,17}",61.206667,102.718057
2,A0A024RBG1-F1,1,7.310256,0.435597,0.856184,"{2,3,4,5,6,7,8,9,10,18,20,21,22,39,40,41,42,47...",89.456190,979.457587
3,A0A024RBG1-F1,2,6.350910,0.389675,0.814896,"{54,57,58,60,61,62,64,65,67,68,69,73,74,75,76,...",83.186923,938.222063
4,A0A024RBG1-F1,3,3.827945,0.378204,0.806045,"{1,2,3,4,5,6,109,110,112,113,114}",77.053636,422.703190
...,...,...,...,...,...,...,...,...
547396,X6R8D5-F1,3,3.894257,0.338401,0.777778,"{86,87,88,89,90,91,92,93,98,100,101,102,103}",56.513846,99.047598
547397,X6R8D5-F1,4,4.196873,0.328247,0.768473,"{43,44,45,46,47,48,49,50,127}",59.902222,107.547205
547398,X6R8D5-F1,5,4.465454,0.314353,0.751790,"{81,87,90,91,92,93,99,100,101,102,103,104,105}",58.298462,95.185313
547399,X6R8D5-F1,6,3.198691,0.398147,0.825342,"{66,67,68,69,71,72,73,75,76,77}",61.416000,122.028350


In [18]:
pockets_data["struct_id"].str.split('-')

0         [A0A024R1R8, F1]
1         [A0A024R1R8, F1]
2         [A0A024RBG1, F1]
3         [A0A024RBG1, F1]
4         [A0A024RBG1, F1]
                ...       
547396        [X6R8D5, F1]
547397        [X6R8D5, F1]
547398        [X6R8D5, F1]
547399        [X6R8D5, F1]
547400        [X6R8D5, F1]
Name: struct_id, Length: 547401, dtype: object

In [19]:
sum(full_noduplicates_data["ACC_ID"].str.contains("F1", case=False, na=False).astype(int))
# we don't have F1 at the end of some of the uniprot-IDs

674

In [20]:
sum(pockets_data["struct_id"].str.contains("F1", case=False, na=False).astype(int))

547401

In [21]:
# getting rid of F1 in the data
pockets_data['uniprot_id'] = pockets_data['struct_id'].str.split('-').str[0]
full_noduplicates_data['uniprot_id'] = full_noduplicates_data['ACC_ID'].str.split('-').str[0]
full_noduplicates_data.drop(columns = ['uniprotID'])

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID,AA,res_number,pK,state,position,uniprot_id
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,,0,2,P31946_2,THR,2.0,,undefined,2.0,P31946
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,,0,6,P31946_6,SER,6.0,,undefined,6.0,P31946
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,,0,21,P31946_21,TYR,21.0,11.100927,protonated,21.0,P31946
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,,0,32,P31946_32,THR,32.0,,undefined,32.0,P31946
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,,0,39,P31946_39,SER,39.0,,undefined,39.0,P31946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,,0,450,Q9UKY1_450,SER,450.0,,undefined,450.0,Q9UKY1
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,,0,891,Q9Y2X9_891,SER,891.0,,undefined,891.0,Q9Y2X9
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,,0,1196,O15014_1196,SER,1196.0,,undefined,1196.0,O15014
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,,0,169,Q15942_169,SER,169.0,,undefined,169.0,Q15942


In [22]:
pockets_data

Unnamed: 0,struct_id,pocket_id,pocket_rad_gyration,pocket_energy_per_vol,pocket_buriedness,pocket_resid,pocket_plddt_mean,pocket_score_combined_scaled,uniprot_id
0,A0A024R1R8-F1,1,4.042788,0.316535,0.772959,"{21,22,23,24,25,26,28,29,32}",83.937778,283.034096,A0A024R1R8
1,A0A024R1R8-F1,2,3.175737,0.347111,0.808219,"{12,13,14,15,16,17}",61.206667,102.718057,A0A024R1R8
2,A0A024RBG1-F1,1,7.310256,0.435597,0.856184,"{2,3,4,5,6,7,8,9,10,18,20,21,22,39,40,41,42,47...",89.456190,979.457587,A0A024RBG1
3,A0A024RBG1-F1,2,6.350910,0.389675,0.814896,"{54,57,58,60,61,62,64,65,67,68,69,73,74,75,76,...",83.186923,938.222063,A0A024RBG1
4,A0A024RBG1-F1,3,3.827945,0.378204,0.806045,"{1,2,3,4,5,6,109,110,112,113,114}",77.053636,422.703190,A0A024RBG1
...,...,...,...,...,...,...,...,...,...
547396,X6R8D5-F1,3,3.894257,0.338401,0.777778,"{86,87,88,89,90,91,92,93,98,100,101,102,103}",56.513846,99.047598,X6R8D5
547397,X6R8D5-F1,4,4.196873,0.328247,0.768473,"{43,44,45,46,47,48,49,50,127}",59.902222,107.547205,X6R8D5
547398,X6R8D5-F1,5,4.465454,0.314353,0.751790,"{81,87,90,91,92,93,99,100,101,102,103,104,105}",58.298462,95.185313,X6R8D5
547399,X6R8D5-F1,6,3.198691,0.398147,0.825342,"{66,67,68,69,71,72,73,75,76,77}",61.416000,122.028350,X6R8D5


In [23]:
pockets_data['full_id'] = pockets_data['struct_id'] +"_pocket"+ pockets_data['pocket_id'].apply(str)
pockets_data

Unnamed: 0,struct_id,pocket_id,pocket_rad_gyration,pocket_energy_per_vol,pocket_buriedness,pocket_resid,pocket_plddt_mean,pocket_score_combined_scaled,uniprot_id,full_id
0,A0A024R1R8-F1,1,4.042788,0.316535,0.772959,"{21,22,23,24,25,26,28,29,32}",83.937778,283.034096,A0A024R1R8,A0A024R1R8-F1_pocket1
1,A0A024R1R8-F1,2,3.175737,0.347111,0.808219,"{12,13,14,15,16,17}",61.206667,102.718057,A0A024R1R8,A0A024R1R8-F1_pocket2
2,A0A024RBG1-F1,1,7.310256,0.435597,0.856184,"{2,3,4,5,6,7,8,9,10,18,20,21,22,39,40,41,42,47...",89.456190,979.457587,A0A024RBG1,A0A024RBG1-F1_pocket1
3,A0A024RBG1-F1,2,6.350910,0.389675,0.814896,"{54,57,58,60,61,62,64,65,67,68,69,73,74,75,76,...",83.186923,938.222063,A0A024RBG1,A0A024RBG1-F1_pocket2
4,A0A024RBG1-F1,3,3.827945,0.378204,0.806045,"{1,2,3,4,5,6,109,110,112,113,114}",77.053636,422.703190,A0A024RBG1,A0A024RBG1-F1_pocket3
...,...,...,...,...,...,...,...,...,...,...
547396,X6R8D5-F1,3,3.894257,0.338401,0.777778,"{86,87,88,89,90,91,92,93,98,100,101,102,103}",56.513846,99.047598,X6R8D5,X6R8D5-F1_pocket3
547397,X6R8D5-F1,4,4.196873,0.328247,0.768473,"{43,44,45,46,47,48,49,50,127}",59.902222,107.547205,X6R8D5,X6R8D5-F1_pocket4
547398,X6R8D5-F1,5,4.465454,0.314353,0.751790,"{81,87,90,91,92,93,99,100,101,102,103,104,105}",58.298462,95.185313,X6R8D5,X6R8D5-F1_pocket5
547399,X6R8D5-F1,6,3.198691,0.398147,0.825342,"{66,67,68,69,71,72,73,75,76,77}",61.416000,122.028350,X6R8D5,X6R8D5-F1_pocket6


# The procedure:

PSP, pocket, interface

start  with PSP:

loop through unique uniprot id

for each unique uniprot id

select all the PTM sites in the same protein from PSP
select all pockets ..... from procket table
select all interfaces .... from interface table (any pair has uniprot id)

loop through ptm  sites

for each site, 

calculate prockets distance
if site in any pockets: (for loop)
    mindistance = 0
    in pockets = True
    pockets number = [#]
else 
    in pcikates = False
loop through all the pockets
calculate the first taken as minial
       minidistance = #
       pockets number = #
 calcualte any pairt betwene site and pocket residue
 compare the new distance to the minidistant 
  if < 
    replace
    elif =
    append
    
    




- match up uniprot id from pockets data to ptm 
- check if any ptms are located inside of the pocket 
- new column w/ if inside the pocket T/F
-if FALSE, find the closest PTM 


-new column w/ distance from ptm and closest pocket residue

### Pockets Data

In [24]:
# making a smaller test dataset
test_psp = full_noduplicates_data.sample(n = 10)
test_psp['closest_pocket'] = "NaN"
test_psp['inside_pocket'] = 0
test_psp['distance_from_pocket'] = "NaN"
test_psp

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,uniprotID,AA,res_number,pK,state,position,uniprot_id,closest_pocket,inside_pocket,distance_from_pocket
121083,LARS1,LARS,Q9P2J5,5q32,Y275-p,5368112,human,134.47,tRNA-synt_1,kLkVLEPyPskLSGL,...,Q9P2J5,TYR,275.0,11.320096,protonated,275.0,Q9P2J5,,0,
254188,VAC14,VAC14,Q08AM6,16q22.1-q22.2,S770-p,7445896,human,87.97,,LEVRHQRsGRGDHLD,...,Q08AM6,SER,770.0,,undefined,770.0,Q08AM6,,0,
268226,ZNF48,ZNF553,Q96MX3,16p11.2,S492-p,25759713,human,67.82,zf-C2H2,GKGFADSsARVKHLR,...,Q96MX3,SER,492.0,,undefined,492.0,Q96MX3,,0,
156824,NR1H4,NR1H4,Q96RI1,12q23.1,Y91-p,3919646,human,55.91,,PQQPEEWySPGIYEL,...,Q96RI1,TYR,91.0,9.709709,protonated,91.0,Q96RI1,,0,
13067,ANK3,ANK3,Q12955,10q21.2,S44-p,23053589,human,480.41,,KKsDANAsYLRAARA,...,Q12955,SER,44.0,,undefined,1844.0,Q12955,,0,
48360,CLDN3,Claudin-3,O15551,7q11.23,T212-p,8086472,human,23.32,,GPGAsLGtGyDRKDy,...,O15551,THR,212.0,,undefined,212.0,O15551,,0,
237444,TTN,Titin,Q8WZ42,2q31.2,Y1126-p,23202308,human,3816.03,I-set,GVPLTtGyRYKVSYN,...,Q8WZ42-F15,ASP,1126.0,3.853447,deprotonated,3926.0,Q8WZ42,,0,
130866,AFF4,MCEF,Q9UHB7,5q31.1,T117-p,23124833,human,127.46,AF-4,PVGPAPStSQsQKRs,...,Q9UHB7,THR,117.0,,undefined,117.0,Q9UHB7,,0,
334297,GAPVD1,GAPVD1,Q14C86,9q33.3,K1353-ub,12395344,human,164.98,,EHIQRLSkVVTANHR,...,Q14C86,LYS,1353.0,10.465263,protonated,1353.0,Q14C86,,0,
250722,UBR5,UBR5,O95071,8q22.3,S191-p,11952917,human,309.35,E3_UbLigase_EDD,VIPEELIsQAQVVLQ,...,O95071,SER,191.0,,undefined,191.0,O95071,,0,


In [25]:
unique_uniprots = test_psp['uniprot_id'].unique() # get all of the unique uniprots

# for each unique uniprotID...
for uniprot in unique_uniprots:
    # isolate to psp and pockets in each uniprot
    psp_only_uniprot = test_psp[test_psp.uniprot_id == uniprot]
    pocket_only_uniprot = pockets_data[pockets_data.uniprot_id == uniprot]


    # parse your structure here
    pdb_path = "/rcfs/projects/proteometer/alphafold_swissprot_pdb"
    pdb_name = glob.glob("/rcfs/projects/proteometer/alphafold_swissprot_pdb/*" + uniprot + "*")
    print("name of pdb is:", pdb_name)
    if pdb_name:  
        ppdb = PandasPdb()  
        ppdb.read_pdb(pdb_name[0])


    # for each psp
        for phosphosite_row_index in psp_only_uniprot.index:
            #print(psp_only_uniprot)
            #print(phosphosite_row_index)
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            #print(residue_num)
            # use the residue # to get the coordinates in space from pdb file
            
            
            for pocket_index in pocket_only_uniprot.index : # get all the residues in all of the pockets 
                pocket_residues = pocket_only_uniprot.loc[pocket_index,'pocket_resid']

                # check if it's inside of a pocket
                pocket_residues = pocket_residues[1:-1].split(",") # format the pocket_residues because it's a string
                #print(pocket_residues)
                if residue_num in pocket_residues:
                    test_psp.loc[phosphosite_row_index,'inside_pocket'] = 1 # if residue is in the pocket, put 1 in the inside pocket column
                    test_psp.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                    test_psp.loc[phosphosite_row_index,'distance_from_pocket'] = 0 
                    break # break because you don't want to contiue looking for pockets (and therefore overwrite the inside pocket and closest pocket)

            if test_psp.loc[phosphosite_row_index,'inside_pocket'] == 0: # if the phosphosite isn't in any pockets
                print("phosphosite isn't in any pockets")
                min_dist = 100000000000000000000000000000000 # make min dist extremely high at first
                for pocket_index in pocket_only_uniprot.index:
                    input_struct = ppdb.df['ATOM']
                    #print(input_struct)
                    new_dist = find_mean_distances(input_struct, residue_num, pocket_residues)
                    if residue_num:
                        if min_dist > new_dist: # if this is the smallest distance so far, replace min_dist with new_dist
                            test_psp.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                            test_psp.loc[phosphosite_row_index,'distance_from_pocket'] = new_dist # replace distance_from_pocket with min_dist
                            min_dist = new_dist 
                            print("added smallest distance:", min_dist)
                
    else: # if we can't find the pdb file
        for phosphosite_row_index in psp_only_uniprot.index:
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            # use the residue # to get the coordinates in space from pdb file
            
            for pocket_index in pocket_only_uniprot.index : # get all the residues in all of the pockets 
                pocket_residues = pocket_only_uniprot.loc[pocket_index,'pocket_resid']

                # check if it's inside of a pocket
                pocket_residues = pocket_residues[1:-1].split(",") # format the pocket_residues because it's a string
                if residue_num in pocket_residues:
                    # fill all with NaN bc we can't find a pdb file
                    test_psp.loc[phosphosite_row_index,'inside_pocket'] = 'NaN' 
                    test_psp.loc[phosphosite_row_index,'closest_pocket'] = 'NaN' 
                    test_psp.loc[phosphosite_row_index,'distance_from_pocket'] = 'NaN'

        

                


      

name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-Q9P2J5-F1-model_v4.pdb']
phosphosite isn't in any pockets
added smallest distance: 35.28928792451977
name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-Q08AM6-F1-model_v4.pdb']
phosphosite isn't in any pockets
added smallest distance: 107.26650310886066
name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-Q96MX3-F1-model_v4.pdb']
phosphosite isn't in any pockets
added smallest distance: 55.920159175199466
name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-Q96RI1-F1-model_v4.pdb']
phosphosite isn't in any pockets
added smallest distance: 37.475366681739914
name of pdb is: []
name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-O15551-F1-model_v4.pdb']
phosphosite isn't in any pockets
added smallest distance: 61.06409198988101
name of pdb is: []
name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-Q9UHB7-F1-model_v4.pdb

In [26]:
test_psp

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,uniprotID,AA,res_number,pK,state,position,uniprot_id,closest_pocket,inside_pocket,distance_from_pocket
121083,LARS1,LARS,Q9P2J5,5q32,Y275-p,5368112,human,134.47,tRNA-synt_1,kLkVLEPyPskLSGL,...,Q9P2J5,TYR,275.0,11.320096,protonated,275.0,Q9P2J5,Q9P2J5-F1_pocket1,0,35.289288
254188,VAC14,VAC14,Q08AM6,16q22.1-q22.2,S770-p,7445896,human,87.97,,LEVRHQRsGRGDHLD,...,Q08AM6,SER,770.0,,undefined,770.0,Q08AM6,Q08AM6-F1_pocket1,0,107.266503
268226,ZNF48,ZNF553,Q96MX3,16p11.2,S492-p,25759713,human,67.82,zf-C2H2,GKGFADSsARVKHLR,...,Q96MX3,SER,492.0,,undefined,492.0,Q96MX3,Q96MX3-F1_pocket1,0,55.920159
156824,NR1H4,NR1H4,Q96RI1,12q23.1,Y91-p,3919646,human,55.91,,PQQPEEWySPGIYEL,...,Q96RI1,TYR,91.0,9.709709,protonated,91.0,Q96RI1,Q96RI1-F1_pocket1,0,37.475367
13067,ANK3,ANK3,Q12955,10q21.2,S44-p,23053589,human,480.41,,KKsDANAsYLRAARA,...,Q12955,SER,44.0,,undefined,1844.0,Q12955,,0,
48360,CLDN3,Claudin-3,O15551,7q11.23,T212-p,8086472,human,23.32,,GPGAsLGtGyDRKDy,...,O15551,THR,212.0,,undefined,212.0,O15551,O15551-F1_pocket6,0,61.064092
237444,TTN,Titin,Q8WZ42,2q31.2,Y1126-p,23202308,human,3816.03,I-set,GVPLTtGyRYKVSYN,...,Q8WZ42-F15,ASP,1126.0,3.853447,deprotonated,3926.0,Q8WZ42,,0,
130866,AFF4,MCEF,Q9UHB7,5q31.1,T117-p,23124833,human,127.46,AF-4,PVGPAPStSQsQKRs,...,Q9UHB7,THR,117.0,,undefined,117.0,Q9UHB7,Q9UHB7-F1_pocket1,0,87.453157
334297,GAPVD1,GAPVD1,Q14C86,9q33.3,K1353-ub,12395344,human,164.98,,EHIQRLSkVVTANHR,...,Q14C86,LYS,1353.0,10.465263,protonated,1353.0,Q14C86,Q14C86-F1_pocket1,0,86.274285
250722,UBR5,UBR5,O95071,8q22.3,S191-p,11952917,human,309.35,E3_UbLigase_EDD,VIPEELIsQAQVVLQ,...,O95071,SER,191.0,,undefined,191.0,O95071,,0,


In [27]:
test_psp["inside_pocket"].dtype

dtype('int64')

In [28]:
# saving psp and pockets data for analysis outside of jupyter notebook
pockets_data.to_csv("/people/imal967/git_repos/pheno_analysis/pockets_data.csv")
full_noduplicates_data.to_csv("/people/imal967/git_repos/pheno_analysis/phosphosite_for_pockets.csv")


In [34]:
full_noduplicates_data

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,Ambiguous_Site,RES_NUM,PKA_ID,uniprotID,AA,res_number,pK,state,position,uniprot_id
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,0,2,P31946_2,P31946,THR,2.0,,undefined,2.0,P31946
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,0,6,P31946_6,P31946,SER,6.0,,undefined,6.0,P31946
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,0,21,P31946_21,P31946,TYR,21.0,11.100927,protonated,21.0,P31946
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,0,32,P31946_32,P31946,THR,32.0,,undefined,32.0,P31946
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,0,39,P31946_39,P31946,SER,39.0,,undefined,39.0,P31946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,0,450,Q9UKY1_450,Q9UKY1,SER,450.0,,undefined,450.0,Q9UKY1
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,0,891,Q9Y2X9_891,Q9Y2X9,SER,891.0,,undefined,891.0,Q9Y2X9
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,0,1196,O15014_1196,O15014,SER,1196.0,,undefined,1196.0,O15014
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,0,169,Q15942_169,Q15942,SER,169.0,,undefined,169.0,Q15942


## Interfaces Data
The procedure: 
Similar to the pockets data, 

In [37]:
interfaces_data

Unnamed: 0,interaction_id,pdockq,uniprot_id1,uniprot_id2,chain1,chain2,ifresid1,ifresid2,sources,n_references,pdb
0,O75106_Q16853,0.74,O75106,Q16853,A,B,"R169,A203,A204,V205,H206,L212,R213,W220,N226,I...","P39,V209,L218,Q219,W226,N232,I233,S234,G235,A2...","BioGRID,humap,intact,string",2,O75106/O75106_Q16853.pdb
1,Q15118_Q15118,0.73,Q15118,Q15118,A,B,"S53,P54,P56,Y179,D182,R183,M186,L255,A257,H304...","S53,P54,P56,Y179,D182,R183,M186,E253,L255,A257...","BioGRID,intact",2,Q15118/Q15118_Q15118.pdb
2,P11142_Q92598,0.73,P11142,Q92598,A,B,"K25,E27,I28,A30,N31,D32,Q33,G34,R36,E48,L50,D5...","R19,A27,N28,E29,F30,S31,R33,N54,T58,Y184,R261,...","BioGRID,corum,humap,intact,otar,string,xlinkdb",9,P11142/P11142_Q92598.pdb
3,Q13326_Q16585,0.73,Q13326,Q16585,A,B,"V40,L41,L43,L44,L47,V48,N50,L51,T54,I55,L58,F6...","V68,I69,L71,L72,L75,A76,I78,N79,I82,I86,M100,F...","corum,otar,string",0,Q13326/Q13326_Q16585.pdb
4,Q13326_Q92629,0.73,Q13326,Q92629,A,B,"K33,L36,Y37,V40,L41,L43,L44,L47,V48,N50,L51,T5...","R30,K31,C33,L34,F37,V38,L40,L41,L44,I45,V47,N4...","corum,string",0,Q13326/Q13326_Q92629.pdb
...,...,...,...,...,...,...,...,...,...,...,...
486094,P23193_Q92889,0.00,,,,,,,otar,0,
486095,P23193_Q92541,0.00,,,,,,,"BioGRID,intact,otar,string",1,
486096,P23193_Q8WX92,0.00,,,,,,,"otar,string",0,
486097,P23193_Q8WVC0,0.00,,,,,,,"BioGRID,intact,otar,string",2,


In [39]:
interfaces_data['interaction_id'].nunique()
# all of the interfaces are unique (no duplicates and no switching)

486099