In [1]:
import pandas as pd
import numpy as np 
import glob
from biopandas.pdb import PandasPdb
import warnings
warnings.filterwarnings("ignore")
from get_distances import *

In [2]:
phosphosite_data = pd.read_csv("/qfs/projects/proteometer/phospho_site_plus_human_20240422.tsv", delimiter='\t')
pka_data = pd.read_csv("/rcfs/projects/proteometer/all_pka.csv", header=0)

In [3]:
pka_data = pka_data.drop(columns=["Unnamed: 0"])
pka_data

Unnamed: 0,uniprotID,AA,res_number,pK,state,position
0,A0PJZ3,NTR,5001,6.891614,undefined,5001.0
1,A0PJZ3,LYS,2,10.328119,protonated,2.0
2,A0PJZ3,SER,5,,undefined,5.0
3,A0PJZ3,LYS,6,10.375520,protonated,6.0
4,A0PJZ3,SER,24,,undefined,24.0
...,...,...,...,...,...,...
5916609,P51587,ASP,1397,3.790052,deprotonated,
5916610,P51587,CYS,1398,9.230363,protonated,
5916611,P51587,THR,1399,,undefined,
5916612,P51587,CTR,6400,2.997512,deprotonated,


In [4]:
phosphosite_data


Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,LT_LIT,MS_LIT,MS_CST,CST_CAT#,Ambiguous_Site
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,,3.0,1.0,,0
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,,8.0,,,0
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,,,4.0,,0
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,,,1.0,,0
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,,4.0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389029,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,,1.0,,,0
389030,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,,1.0,,,0
389031,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,,1.0,,,0
389032,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,1.0,,,,0


In [5]:
# clean phosphosite_data
phosphosite_data['RES_NUM'] = phosphosite_data['MOD_RSD'].str.split('-').str[0].str[1:] # split string on '-' and get rid of first letter
phosphosite_data['PKA_ID'] = phosphosite_data['ACC_ID'] +"_"+ phosphosite_data['RES_NUM']
phosphosite_data


Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,LT_LIT,MS_LIT,MS_CST,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,,3.0,1.0,,0,2,P31946_2
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,,8.0,,,0,6,P31946_6
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,,,4.0,,0,21,P31946_21
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,,,1.0,,0,32,P31946_32
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,,4.0,,,0,39,P31946_39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389029,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,,1.0,,,0,450,Q9UKY1_450
389030,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,,1.0,,,0,891,Q9Y2X9_891
389031,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,,1.0,,,0,1196,O15014_1196
389032,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,1.0,,,,0,169,Q15942_169


In [6]:
pka_data['PKA_ID'] = pka_data['uniprotID'].str.split('-').str[0] +"_"+ pka_data['res_number'].apply(str)
pka_data

Unnamed: 0,uniprotID,AA,res_number,pK,state,position,PKA_ID
0,A0PJZ3,NTR,5001,6.891614,undefined,5001.0,A0PJZ3_5001
1,A0PJZ3,LYS,2,10.328119,protonated,2.0,A0PJZ3_2
2,A0PJZ3,SER,5,,undefined,5.0,A0PJZ3_5
3,A0PJZ3,LYS,6,10.375520,protonated,6.0,A0PJZ3_6
4,A0PJZ3,SER,24,,undefined,24.0,A0PJZ3_24
...,...,...,...,...,...,...,...
5916609,P51587,ASP,1397,3.790052,deprotonated,,P51587_1397
5916610,P51587,CYS,1398,9.230363,protonated,,P51587_1398
5916611,P51587,THR,1399,,undefined,,P51587_1399
5916612,P51587,CTR,6400,2.997512,deprotonated,,P51587_6400


In [7]:
len(pka_data['PKA_ID'].unique())

4350682

In [8]:
len(pka_data['uniprotID'].unique())

20595

In [9]:
full_data = pd.merge(phosphosite_data, pka_data, on="PKA_ID", how = "left")
print(full_data.shape)
full_data

(436145, 23)


Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID,uniprotID,AA,res_number,pK,state,position
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,,0,2,P31946_2,P31946,THR,2.0,,undefined,2.0
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,,0,6,P31946_6,P31946,SER,6.0,,undefined,6.0
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,,0,21,P31946_21,P31946,TYR,21.0,11.100927,protonated,21.0
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,,0,32,P31946_32,P31946,THR,32.0,,undefined,32.0
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,,0,39,P31946_39,P31946,SER,39.0,,undefined,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,,0,450,Q9UKY1_450,Q9UKY1,SER,450.0,,undefined,450.0
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,,0,891,Q9Y2X9_891,Q9Y2X9,SER,891.0,,undefined,891.0
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,,0,1196,O15014_1196,O15014,SER,1196.0,,undefined,1196.0
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,,0,169,Q15942_169,Q15942,SER,169.0,,undefined,169.0


In [10]:
full_noduplicates_data = full_data.drop_duplicates()
full_noduplicates_data

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID,uniprotID,AA,res_number,pK,state,position
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,,0,2,P31946_2,P31946,THR,2.0,,undefined,2.0
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,,0,6,P31946_6,P31946,SER,6.0,,undefined,6.0
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,,0,21,P31946_21,P31946,TYR,21.0,11.100927,protonated,21.0
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,,0,32,P31946_32,P31946,THR,32.0,,undefined,32.0
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,,0,39,P31946_39,P31946,SER,39.0,,undefined,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,,0,450,Q9UKY1_450,Q9UKY1,SER,450.0,,undefined,450.0
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,,0,891,Q9Y2X9_891,Q9Y2X9,SER,891.0,,undefined,891.0
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,,0,1196,O15014_1196,O15014,SER,1196.0,,undefined,1196.0
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,,0,169,Q15942_169,Q15942,SER,169.0,,undefined,169.0


In [11]:
print(pka_data.shape)
print(phosphosite_data.shape)
print(full_noduplicates_data.shape) # 1468 of them didn't match up 
# I guess this makes sense. pka didn't output for ALL residues, and that's not that large of a fraction


(5916614, 7)
(389034, 17)
(436012, 23)


In [12]:
full_noduplicates_data['PKA_ID'].nunique()

370369

In [13]:
full_noduplicates_data['PKA_ID'].value_counts()

PKA_ID
Q8WZ42_1054    91
Q8WZ42_565     89
Q8WZ42_308     87
Q8WZ42_1159    86
Q8WZ42_300     86
               ..
O75592_2348     1
O75592_2344     1
O75592_2342     1
O75592_2338     1
O75592_2701     1
Name: count, Length: 370369, dtype: int64

In [14]:
print(full_data['pK'].notna().sum(), "rows have pka")
print(189514/425877,"have pka annotations")

189765 rows have pka
0.44499702965879817 have pka annotations


In [15]:
# adding the interfaces data
interfaces_data = pd.read_csv("/rcfs/projects/proteometer/ProtVar/predictions/interfaces/2024.05.28_interface_summary_5A.tsv", delimiter='\t', header=0)
interfaces_data

Unnamed: 0,interaction_id,pdockq,uniprot_id1,uniprot_id2,chain1,chain2,ifresid1,ifresid2,sources,n_references,pdb
0,O75106_Q16853,0.74,O75106,Q16853,A,B,"R169,A203,A204,V205,H206,L212,R213,W220,N226,I...","P39,V209,L218,Q219,W226,N232,I233,S234,G235,A2...","BioGRID,humap,intact,string",2,O75106/O75106_Q16853.pdb
1,Q15118_Q15118,0.73,Q15118,Q15118,A,B,"S53,P54,P56,Y179,D182,R183,M186,L255,A257,H304...","S53,P54,P56,Y179,D182,R183,M186,E253,L255,A257...","BioGRID,intact",2,Q15118/Q15118_Q15118.pdb
2,P11142_Q92598,0.73,P11142,Q92598,A,B,"K25,E27,I28,A30,N31,D32,Q33,G34,R36,E48,L50,D5...","R19,A27,N28,E29,F30,S31,R33,N54,T58,Y184,R261,...","BioGRID,corum,humap,intact,otar,string,xlinkdb",9,P11142/P11142_Q92598.pdb
3,Q13326_Q16585,0.73,Q13326,Q16585,A,B,"V40,L41,L43,L44,L47,V48,N50,L51,T54,I55,L58,F6...","V68,I69,L71,L72,L75,A76,I78,N79,I82,I86,M100,F...","corum,otar,string",0,Q13326/Q13326_Q16585.pdb
4,Q13326_Q92629,0.73,Q13326,Q92629,A,B,"K33,L36,Y37,V40,L41,L43,L44,L47,V48,N50,L51,T5...","R30,K31,C33,L34,F37,V38,L40,L41,L44,I45,V47,N4...","corum,string",0,Q13326/Q13326_Q92629.pdb
...,...,...,...,...,...,...,...,...,...,...,...
486094,P23193_Q92889,0.00,,,,,,,otar,0,
486095,P23193_Q92541,0.00,,,,,,,"BioGRID,intact,otar,string",1,
486096,P23193_Q8WX92,0.00,,,,,,,"otar,string",0,
486097,P23193_Q8WVC0,0.00,,,,,,,"BioGRID,intact,otar,string",2,


In [16]:
# adding pockets data
pockets_data = pd.read_csv("/rcfs/projects/proteometer/ProtVar/predictions/pockets/2024.05.28_pockets.tsv", delimiter='\t', header=0)
pockets_data

Unnamed: 0,struct_id,pocket_id,pocket_rad_gyration,pocket_energy_per_vol,pocket_buriedness,pocket_resid,pocket_plddt_mean,pocket_score_combined_scaled
0,A0A024R1R8-F1,1,4.042788,0.316535,0.772959,"{21,22,23,24,25,26,28,29,32}",83.937778,283.034096
1,A0A024R1R8-F1,2,3.175737,0.347111,0.808219,"{12,13,14,15,16,17}",61.206667,102.718057
2,A0A024RBG1-F1,1,7.310256,0.435597,0.856184,"{2,3,4,5,6,7,8,9,10,18,20,21,22,39,40,41,42,47...",89.456190,979.457587
3,A0A024RBG1-F1,2,6.350910,0.389675,0.814896,"{54,57,58,60,61,62,64,65,67,68,69,73,74,75,76,...",83.186923,938.222063
4,A0A024RBG1-F1,3,3.827945,0.378204,0.806045,"{1,2,3,4,5,6,109,110,112,113,114}",77.053636,422.703190
...,...,...,...,...,...,...,...,...
547396,X6R8D5-F1,3,3.894257,0.338401,0.777778,"{86,87,88,89,90,91,92,93,98,100,101,102,103}",56.513846,99.047598
547397,X6R8D5-F1,4,4.196873,0.328247,0.768473,"{43,44,45,46,47,48,49,50,127}",59.902222,107.547205
547398,X6R8D5-F1,5,4.465454,0.314353,0.751790,"{81,87,90,91,92,93,99,100,101,102,103,104,105}",58.298462,95.185313
547399,X6R8D5-F1,6,3.198691,0.398147,0.825342,"{66,67,68,69,71,72,73,75,76,77}",61.416000,122.028350


In [17]:
pockets_data["struct_id"].str.split('-')

0         [A0A024R1R8, F1]
1         [A0A024R1R8, F1]
2         [A0A024RBG1, F1]
3         [A0A024RBG1, F1]
4         [A0A024RBG1, F1]
                ...       
547396        [X6R8D5, F1]
547397        [X6R8D5, F1]
547398        [X6R8D5, F1]
547399        [X6R8D5, F1]
547400        [X6R8D5, F1]
Name: struct_id, Length: 547401, dtype: object

In [18]:
sum(full_noduplicates_data["ACC_ID"].str.contains("F1", case=False, na=False).astype(int))
# we don't have F1 at the end of some of the uniprot-IDs

674

In [19]:
sum(pockets_data["struct_id"].str.contains("F1", case=False, na=False).astype(int))

547401

In [20]:
# getting rid of F1 in the data
pockets_data['uniprot_id'] = pockets_data['struct_id'].str.split('-').str[0]
full_noduplicates_data['uniprot_id'] = full_noduplicates_data['ACC_ID'].str.split('-').str[0]
full_noduplicates_data.drop(columns = ['uniprotID'])

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,CST_CAT#,Ambiguous_Site,RES_NUM,PKA_ID,AA,res_number,pK,state,position,uniprot_id
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,,0,2,P31946_2,THR,2.0,,undefined,2.0,P31946
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,,0,6,P31946_6,SER,6.0,,undefined,6.0,P31946
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,,0,21,P31946_21,TYR,21.0,11.100927,protonated,21.0,P31946
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,,0,32,P31946_32,THR,32.0,,undefined,32.0,P31946
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,,0,39,P31946_39,SER,39.0,,undefined,39.0,P31946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,,0,450,Q9UKY1_450,SER,450.0,,undefined,450.0,Q9UKY1
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,,0,891,Q9Y2X9_891,SER,891.0,,undefined,891.0,Q9Y2X9
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,,0,1196,O15014_1196,SER,1196.0,,undefined,1196.0,O15014
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,,0,169,Q15942_169,SER,169.0,,undefined,169.0,Q15942


In [21]:
pockets_data

Unnamed: 0,struct_id,pocket_id,pocket_rad_gyration,pocket_energy_per_vol,pocket_buriedness,pocket_resid,pocket_plddt_mean,pocket_score_combined_scaled,uniprot_id
0,A0A024R1R8-F1,1,4.042788,0.316535,0.772959,"{21,22,23,24,25,26,28,29,32}",83.937778,283.034096,A0A024R1R8
1,A0A024R1R8-F1,2,3.175737,0.347111,0.808219,"{12,13,14,15,16,17}",61.206667,102.718057,A0A024R1R8
2,A0A024RBG1-F1,1,7.310256,0.435597,0.856184,"{2,3,4,5,6,7,8,9,10,18,20,21,22,39,40,41,42,47...",89.456190,979.457587,A0A024RBG1
3,A0A024RBG1-F1,2,6.350910,0.389675,0.814896,"{54,57,58,60,61,62,64,65,67,68,69,73,74,75,76,...",83.186923,938.222063,A0A024RBG1
4,A0A024RBG1-F1,3,3.827945,0.378204,0.806045,"{1,2,3,4,5,6,109,110,112,113,114}",77.053636,422.703190,A0A024RBG1
...,...,...,...,...,...,...,...,...,...
547396,X6R8D5-F1,3,3.894257,0.338401,0.777778,"{86,87,88,89,90,91,92,93,98,100,101,102,103}",56.513846,99.047598,X6R8D5
547397,X6R8D5-F1,4,4.196873,0.328247,0.768473,"{43,44,45,46,47,48,49,50,127}",59.902222,107.547205,X6R8D5
547398,X6R8D5-F1,5,4.465454,0.314353,0.751790,"{81,87,90,91,92,93,99,100,101,102,103,104,105}",58.298462,95.185313,X6R8D5
547399,X6R8D5-F1,6,3.198691,0.398147,0.825342,"{66,67,68,69,71,72,73,75,76,77}",61.416000,122.028350,X6R8D5


In [22]:
pockets_data['full_id'] = pockets_data['struct_id'] +"_pocket"+ pockets_data['pocket_id'].apply(str)
pockets_data

Unnamed: 0,struct_id,pocket_id,pocket_rad_gyration,pocket_energy_per_vol,pocket_buriedness,pocket_resid,pocket_plddt_mean,pocket_score_combined_scaled,uniprot_id,full_id
0,A0A024R1R8-F1,1,4.042788,0.316535,0.772959,"{21,22,23,24,25,26,28,29,32}",83.937778,283.034096,A0A024R1R8,A0A024R1R8-F1_pocket1
1,A0A024R1R8-F1,2,3.175737,0.347111,0.808219,"{12,13,14,15,16,17}",61.206667,102.718057,A0A024R1R8,A0A024R1R8-F1_pocket2
2,A0A024RBG1-F1,1,7.310256,0.435597,0.856184,"{2,3,4,5,6,7,8,9,10,18,20,21,22,39,40,41,42,47...",89.456190,979.457587,A0A024RBG1,A0A024RBG1-F1_pocket1
3,A0A024RBG1-F1,2,6.350910,0.389675,0.814896,"{54,57,58,60,61,62,64,65,67,68,69,73,74,75,76,...",83.186923,938.222063,A0A024RBG1,A0A024RBG1-F1_pocket2
4,A0A024RBG1-F1,3,3.827945,0.378204,0.806045,"{1,2,3,4,5,6,109,110,112,113,114}",77.053636,422.703190,A0A024RBG1,A0A024RBG1-F1_pocket3
...,...,...,...,...,...,...,...,...,...,...
547396,X6R8D5-F1,3,3.894257,0.338401,0.777778,"{86,87,88,89,90,91,92,93,98,100,101,102,103}",56.513846,99.047598,X6R8D5,X6R8D5-F1_pocket3
547397,X6R8D5-F1,4,4.196873,0.328247,0.768473,"{43,44,45,46,47,48,49,50,127}",59.902222,107.547205,X6R8D5,X6R8D5-F1_pocket4
547398,X6R8D5-F1,5,4.465454,0.314353,0.751790,"{81,87,90,91,92,93,99,100,101,102,103,104,105}",58.298462,95.185313,X6R8D5,X6R8D5-F1_pocket5
547399,X6R8D5-F1,6,3.198691,0.398147,0.825342,"{66,67,68,69,71,72,73,75,76,77}",61.416000,122.028350,X6R8D5,X6R8D5-F1_pocket6


# The procedure:

PSP, pocket, interface

start  with PSP:

loop through unique uniprot id

for each unique uniprot id

select all the PTM sites in the same protein from PSP
select all pockets ..... from procket table
select all interfaces .... from interface table (any pair has uniprot id)

loop through ptm  sites

for each site, 

calculate prockets distance
if site in any pockets: (for loop)
    mindistance = 0
    in pockets = True
    pockets number = [#]
else 
    in pcikates = False
loop through all the pockets
calculate the first taken as minial
       minidistance = #
       pockets number = #
 calcualte any pairt betwene site and pocket residue
 compare the new distance to the minidistant 
  if < 
    replace
    elif =
    append
    
    




- match up uniprot id from pockets data to ptm 
- check if any ptms are located inside of the pocket 
- new column w/ if inside the pocket T/F
-if FALSE, find the closest PTM 


-new column w/ distance from ptm and closest pocket residue

### Pockets Data

In [23]:
# making a smaller test dataset
test_psp = full_noduplicates_data.sample(n = 10)
test_psp['closest_pocket'] = "NaN"
test_psp['inside_pocket'] = 0
test_psp['distance_from_pocket'] = "NaN"
test_psp

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,uniprotID,AA,res_number,pK,state,position,uniprot_id,closest_pocket,inside_pocket,distance_from_pocket
16844,APOB,APOB,P04114,2p24.1,Y1287-p,25283000,human,515.54,,KSDGRVKytLNKNSL,...,P04114,TYR,1287.0,10.061789,protonated,1287.0,P04114,,0,
271509,ZNF98,ZNF98,A6NK75,19p12,S169-p,29651979,human,65.8,,VFHKFSNsNRHKIGH,...,A6NK75,SER,169.0,,undefined,169.0,A6NK75,,0,
176997,PMFBP1,PMFBP1 iso1,Q8TBY8-1,16q22.2,T690-p,56572463,human,119.03,,SSLNKYNtsQQVIQD,...,,,,,,,Q8TBY8,,0,
116856,KIF24,KIF24,Q5T7B8,9p13.3,S1131-p,4271725,human,151.9,,GGDLPALsPsPIRQH,...,Q5T7B8,SER,1131.0,,undefined,1131.0,Q5T7B8,,0,
242214,CNPY3,TNRC5,Q9BT09,6p21.1,Y110-p,35479739,human,30.75,DUF3456,ICKRLLDySLHkERt,...,Q9BT09,TYR,110.0,13.6615,protonated,110.0,Q9BT09,,0,
175,EIF4EBP1,4E-BP1,Q13541,8p11.23,S5-p,11536050,human,12.58,eIF_4EBP,___MsGGsSCsQtPs,...,Q13541,SER,5.0,,undefined,5.0,Q13541,,0,
201289,S100A11,S100A11,P31949,1q21.3,T10-p,10888705,human,11.74,S_100,kIssPtEtERCIEsL,...,P31949,THR,10.0,,undefined,10.0,P31949,,0,
353354,RIOX2,MINA,Q8IUF8,3q11.2,K87-ub,15392729,human,52.8,,LFkLTDLkSLCSRGM,...,Q8IUF8,LYS,87.0,10.388098,protonated,87.0,Q8IUF8,,0,
356030,MYNN,MYNN,Q9NPC7,3q26.2,K155-ub,573787326,human,68.68,,RDYNNREksEVstDL,...,Q9NPC7,LYS,155.0,11.147859,protonated,155.0,Q9NPC7,,0,
79954,FAT1,FAT,Q14517,4q35.2,T535-p,50778879,human,506.27,Cadherin,ELMPRVytLRIRASD,...,Q14517,THR,535.0,,undefined,535.0,Q14517,,0,


In [24]:
unique_uniprots = test_psp['uniprot_id'].unique() # get all of the unique uniprots

# for each unique uniprotID...
for uniprot in unique_uniprots:
    # isolate to psp and pockets in each uniprot
    psp_only_uniprot = test_psp[test_psp.uniprot_id == uniprot]
    pocket_only_uniprot = pockets_data[pockets_data.uniprot_id == uniprot]


    # parse your structure here
    pdb_path = "/rcfs/projects/proteometer/alphafold_swissprot_pdb"
    pdb_name = glob.glob("/rcfs/projects/proteometer/alphafold_swissprot_pdb/*" + uniprot + "*")
    print("name of pdb is:", pdb_name)
    if pdb_name:  
        ppdb = PandasPdb()  
        ppdb.read_pdb(pdb_name[0])
        


    # for each psp
        for phosphosite_row_index in psp_only_uniprot.index:
            #print(psp_only_uniprot)
            #print(phosphosite_row_index)
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            min_dist = 100000000000000000000000000000000 # make min dist extremely high at first
            #print(residue_num)
            # use the residue # to get the coordinates in space from pdb file
            
            
            for pocket_index in pocket_only_uniprot.index : # get all the residues in all of the pockets 
                pocket_residues = pocket_only_uniprot.loc[pocket_index,'pocket_resid']

                # check if it's inside of a pocket
                pocket_residues = pocket_residues[1:-1].split(",") # format the pocket_residues because it's a string
                #print(pocket_residues)
                if residue_num in pocket_residues:
                    test_psp.loc[phosphosite_row_index,'inside_pocket'] = 1 # if residue is in the pocket, put 1 in the inside pocket column
                    test_psp.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                    test_psp.loc[phosphosite_row_index,'distance_from_pocket'] = 0 
                    break # break because you don't want to contiue looking for pockets (and therefore overwrite the inside pocket and closest pocket)

                if test_psp.loc[phosphosite_row_index,'inside_pocket'] == 0: # if the phosphosite isn't in any pockets
                    print("phosphosite isn't in any pockets")
                    input_struct = ppdb.df['ATOM']
                    #print(input_struct)
                    new_dist = find_mean_distances(input_struct, residue_num, pocket_residues)
                    if residue_num:
                        if min_dist > new_dist: # if this is the smallest distance so far, replace min_dist with new_dist
                            test_psp.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                            test_psp.loc[phosphosite_row_index,'distance_from_pocket'] = new_dist # replace distance_from_pocket with min_dist
                            min_dist = new_dist 
                            print("added smallest distance:", min_dist)
                
    else: # if we can't find the pdb file
        for phosphosite_row_index in psp_only_uniprot.index:
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            # use the residue # to get the coordinates in space from pdb file
            
            for pocket_index in pocket_only_uniprot.index : # get all the residues in all of the pockets 
                pocket_residues = pocket_only_uniprot.loc[pocket_index,'pocket_resid']

                # check if it's inside of a pocket
                pocket_residues = pocket_residues[1:-1].split(",") # format the pocket_residues because it's a string
                if residue_num in pocket_residues:
                    # fill all with NaN bc we can't find a pdb file
                    test_psp.loc[phosphosite_row_index,'inside_pocket'] = 'NaN' 
                    test_psp.loc[phosphosite_row_index,'closest_pocket'] = 'NaN' 
                    test_psp.loc[phosphosite_row_index,'distance_from_pocket'] = 'NaN'

        

                


      

name of pdb is: []
name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-A6NK75-F1-model_v4.pdb']
phosphosite isn't in any pockets
added smallest distance: 37.67866901169767
phosphosite isn't in any pockets
phosphosite isn't in any pockets
added smallest distance: 35.82602139107829
phosphosite isn't in any pockets
added smallest distance: 24.415268887816453
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
added smallest distance: 23.94234641839175
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
added smallest distance: 16.785293062498923
phosphosite isn't in any pockets
phosphosite isn't in any pockets
phosphosite isn't in any pockets
added smallest distance: 14.4974169957062
phosphosite isn't in any pocket

In [25]:
test_psp

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,uniprotID,AA,res_number,pK,state,position,uniprot_id,closest_pocket,inside_pocket,distance_from_pocket
16844,APOB,APOB,P04114,2p24.1,Y1287-p,25283000,human,515.54,,KSDGRVKytLNKNSL,...,P04114,TYR,1287.0,10.061789,protonated,1287.0,P04114,,0,
271509,ZNF98,ZNF98,A6NK75,19p12,S169-p,29651979,human,65.8,,VFHKFSNsNRHKIGH,...,A6NK75,SER,169.0,,undefined,169.0,A6NK75,A6NK75-F1_pocket18,0,14.497417
176997,PMFBP1,PMFBP1 iso1,Q8TBY8-1,16q22.2,T690-p,56572463,human,119.03,,SSLNKYNtsQQVIQD,...,,,,,,,Q8TBY8,,0,
116856,KIF24,KIF24,Q5T7B8,9p13.3,S1131-p,4271725,human,151.9,,GGDLPALsPsPIRQH,...,Q5T7B8,SER,1131.0,,undefined,1131.0,Q5T7B8,Q5T7B8-F1_pocket21,0,25.983545
242214,CNPY3,TNRC5,Q9BT09,6p21.1,Y110-p,35479739,human,30.75,DUF3456,ICKRLLDySLHkERt,...,Q9BT09,TYR,110.0,13.6615,protonated,110.0,Q9BT09,Q9BT09-F1_pocket1,0,10.090006
175,EIF4EBP1,4E-BP1,Q13541,8p11.23,S5-p,11536050,human,12.58,eIF_4EBP,___MsGGsSCsQtPs,...,Q13541,SER,5.0,,undefined,5.0,Q13541,Q13541-F1_pocket5,0,23.357221
201289,S100A11,S100A11,P31949,1q21.3,T10-p,10888705,human,11.74,S_100,kIssPtEtERCIEsL,...,P31949,THR,10.0,,undefined,10.0,P31949,P31949-F1_pocket4,0,18.784046
353354,RIOX2,MINA,Q8IUF8,3q11.2,K87-ub,15392729,human,52.8,,LFkLTDLkSLCSRGM,...,Q8IUF8,LYS,87.0,10.388098,protonated,87.0,Q8IUF8,Q8IUF8-F1_pocket20,0,6.274892
356030,MYNN,MYNN,Q9NPC7,3q26.2,K155-ub,573787326,human,68.68,,RDYNNREksEVstDL,...,Q9NPC7,LYS,155.0,11.147859,protonated,155.0,Q9NPC7,Q9NPC7-F1_pocket5,0,8.238787
79954,FAT1,FAT,Q14517,4q35.2,T535-p,50778879,human,506.27,Cadherin,ELMPRVytLRIRASD,...,Q14517,THR,535.0,,undefined,535.0,Q14517,,0,


In [26]:
test_psp["inside_pocket"].dtype

dtype('int64')

In [27]:
# saving psp and pockets data for analysis outside of jupyter notebook
pockets_data.to_csv("/people/imal967/git_repos/pheno_analysis/pockets_data.csv")
full_noduplicates_data.to_csv("/people/imal967/git_repos/pheno_analysis/phosphosite_for_pockets.csv")


In [28]:
full_noduplicates_data

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,Ambiguous_Site,RES_NUM,PKA_ID,uniprotID,AA,res_number,pK,state,position,uniprot_id
0,YWHAB,14-3-3 beta,P31946,20q13.12,T2-p,15718712,human,28.08,,______MtMDksELV,...,0,2,P31946_2,P31946,THR,2.0,,undefined,2.0,P31946
1,YWHAB,14-3-3 beta,P31946,20q13.12,S6-p,15718709,human,28.08,,__MtMDksELVQkAk,...,0,6,P31946_6,P31946,SER,6.0,,undefined,6.0,P31946
2,YWHAB,14-3-3 beta,P31946,20q13.12,Y21-p,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,0,21,P31946_21,P31946,TYR,21.0,11.100927,protonated,21.0,P31946
3,YWHAB,14-3-3 beta,P31946,20q13.12,T32-p,23077803,human,28.08,14-3-3,AAAMkAVtEQGHELs,...,0,32,P31946_32,P31946,THR,32.0,,undefined,32.0,P31946
4,YWHAB,14-3-3 beta,P31946,20q13.12,S39-p,27442700,human,28.08,14-3-3,tEQGHELsNEERNLL,...,0,39,P31946_39,P31946,SER,39.0,,undefined,39.0,P31946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436140,ZHX1,ZHX1,Q9UKY1,8q24.13,S450-gl,14703720,human,98.10,,ATAAVPTsQSVkHET,...,0,450,Q9UKY1_450,Q9UKY1,SER,450.0,,undefined,450.0,Q9UKY1
436141,ZNF281,ZNF281,Q9Y2X9,1q32.1,S891-gl,14703723,human,96.91,,TRVktPTsQSYR___,...,0,891,Q9Y2X9_891,Q9Y2X9,SER,891.0,,undefined,891.0,Q9Y2X9
436142,ZNF609,ZNF609,O15014,15q22.31,S1196-gl,14703726,human,151.19,,SDCKLPTsEESRLGS,...,0,1196,O15014_1196,O15014,SER,1196.0,,undefined,1196.0,O15014
436143,ZYX,Zyxin,Q15942,7q34,S169-gl,1876610700,human,61.28,,DPFkARVssGyVPPP,...,0,169,Q15942_169,Q15942,SER,169.0,,undefined,169.0,Q15942


## Interfaces Data
The procedure: 
Similar to the pockets data, 

In [29]:
interfaces_data

Unnamed: 0,interaction_id,pdockq,uniprot_id1,uniprot_id2,chain1,chain2,ifresid1,ifresid2,sources,n_references,pdb
0,O75106_Q16853,0.74,O75106,Q16853,A,B,"R169,A203,A204,V205,H206,L212,R213,W220,N226,I...","P39,V209,L218,Q219,W226,N232,I233,S234,G235,A2...","BioGRID,humap,intact,string",2,O75106/O75106_Q16853.pdb
1,Q15118_Q15118,0.73,Q15118,Q15118,A,B,"S53,P54,P56,Y179,D182,R183,M186,L255,A257,H304...","S53,P54,P56,Y179,D182,R183,M186,E253,L255,A257...","BioGRID,intact",2,Q15118/Q15118_Q15118.pdb
2,P11142_Q92598,0.73,P11142,Q92598,A,B,"K25,E27,I28,A30,N31,D32,Q33,G34,R36,E48,L50,D5...","R19,A27,N28,E29,F30,S31,R33,N54,T58,Y184,R261,...","BioGRID,corum,humap,intact,otar,string,xlinkdb",9,P11142/P11142_Q92598.pdb
3,Q13326_Q16585,0.73,Q13326,Q16585,A,B,"V40,L41,L43,L44,L47,V48,N50,L51,T54,I55,L58,F6...","V68,I69,L71,L72,L75,A76,I78,N79,I82,I86,M100,F...","corum,otar,string",0,Q13326/Q13326_Q16585.pdb
4,Q13326_Q92629,0.73,Q13326,Q92629,A,B,"K33,L36,Y37,V40,L41,L43,L44,L47,V48,N50,L51,T5...","R30,K31,C33,L34,F37,V38,L40,L41,L44,I45,V47,N4...","corum,string",0,Q13326/Q13326_Q92629.pdb
...,...,...,...,...,...,...,...,...,...,...,...
486094,P23193_Q92889,0.00,,,,,,,otar,0,
486095,P23193_Q92541,0.00,,,,,,,"BioGRID,intact,otar,string",1,
486096,P23193_Q8WX92,0.00,,,,,,,"otar,string",0,
486097,P23193_Q8WVC0,0.00,,,,,,,"BioGRID,intact,otar,string",2,


In [30]:
interfaces_data['interaction_id'].nunique()
# all of the interfaces are unique (no duplicates and no switching)

486099

In [31]:
# making a smaller test dataset
test_psp = full_noduplicates_data.sample(n = 3)
test_psp['closest_interface'] = "NaN"
test_psp['inside_interface'] = 0
test_psp['distance_from_interface'] = "NaN"
test_psp

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,uniprotID,AA,res_number,pK,state,position,uniprot_id,closest_interface,inside_interface,distance_from_interface
8923,AIDA,AIDA,Q96BJ3,1q41,Y296-p,34365066,human,35.02,Aida_C2,LLTKKPLyLHLHQTL,...,Q96BJ3,TYR,296.0,10.268782,protonated,296.0,Q96BJ3,,0,
137296,KMT2E,MLL5,Q8IZD2,7q22.3,S795-p,28791277,human,204.96,,KHYIrFTsPFLsEKR,...,Q8IZD2,SER,795.0,,undefined,795.0,Q8IZD2,,0,
38055,SPICE1,CCDC52,Q8N0Z3,3q13.2,S640-p,10941554,human,96.26,,SNTQQSRsPTFsEEL,...,Q8N0Z3,SER,640.0,,undefined,640.0,Q8N0Z3,,0,


In [32]:
unique_uniprots = test_psp['uniprot_id'].unique() # get all of the unique uniprots

# for each unique uniprotID...
for uniprot in unique_uniprots:
    # isolate to psp and interface in each uniprot
    psp_only_uniprot = test_psp[test_psp.uniprot_id == uniprot]
    interface_only_uniprot = interfaces_data.loc[(interfaces_data['uniprot_id1'] == uniprot) | (interfaces_data['uniprot_id2'] == uniprot)] # isolate to uniprot in either 1 or 2


    # parse your structure here
    pdb_path = "/rcfs/projects/proteometer/alphafold_swissprot_pdb"
    pdb_name = glob.glob("/rcfs/projects/proteometer/alphafold_swissprot_pdb/*" + uniprot + "*")
    print("name of pdb is:", pdb_name)
    if pdb_name:  
        ppdb = PandasPdb()  
        ppdb.read_pdb(pdb_name[0])


    # for each psp
        for phosphosite_row_index in psp_only_uniprot.index:
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            min_dist = 100000000000000000000000000000000 # make min dist extremely high at first
            #print(residue_num)
            # use the residue # to get the coordinates in space from pdb file

            for interface_index in interface_only_uniprot.index : # get all the residues in all of the interfaces 
                if pd.notna(interface_only_uniprot.loc[interface_index,'ifresid1']) & pd.notna(interface_only_uniprot.loc[interface_index,'ifresid1']):
                    if interfaces_data.loc[interface_index,'uniprot_id1'] == uniprot:
                        interface_residues = interface_only_uniprot.loc[interface_index,'ifresid1']
                    elif interfaces_data.loc[interface_index,'uniprot_id2'] == uniprot:
                        interface_residues = interface_only_uniprot.loc[interface_index,'ifresid2']
                    
                    # check if it's inside of a interface
                    print(interface_residues)
                    interface_residues = interface_residues.split(",") # format the interface_residues because it's a string
                    interface_residues = [e[1:] for e in interface_residues] # remove the first letter from each bc it includes residue type
                    #print(interface_residues)
                    if residue_num in interface_residues:
                        print("found a PSP in an interface")
                        test_psp.loc[phosphosite_row_index,'inside_interface'] = 1 # if residue is in the interface, put 1 in the inside interface column
                        test_psp.loc[phosphosite_row_index,'closest_interface'] = interface_only_uniprot.loc[interface_index,'interaction_id'] # put unique interfaceID in closest pocket
                        test_psp.loc[phosphosite_row_index,'distance_from_interface'] = 0 
                        break # break because you don't want to contiue looking for interfaces (and therefore overwrite the inside interface and closest interface)
                    
                    if test_psp.loc[phosphosite_row_index,'inside_interface'] == 0: # if the phosphosite isn't in any pockets
                        input_struct = ppdb.df['ATOM']
                        new_dist = find_mean_distances(input_struct, residue_num, interface_residues)
                        #print("we found distance", new_dist)
                        if residue_num:
                            if min_dist > new_dist: # if this is the smallest distance so far, replace min_dist with new_dist
                                test_psp.loc[phosphosite_row_index,'closest_interface'] = interface_only_uniprot.loc[interface_index,'interaction_id'] # put unique interfaceID in closest interface
                                test_psp.loc[phosphosite_row_index,'distance_from_interface'] = new_dist # replace distance_from_interface with min_dist
                                min_dist = new_dist 
                                print("added smallest distance:", min_dist)
                                print("the interface is:", interface_only_uniprot.loc[interface_index,'interaction_id'])
                
    else: # if we can't find the pdb file
        for phosphosite_row_index in psp_only_uniprot.index:
            test_psp.loc[phosphosite_row_index,'inside_interface'] = 'NaN' 
            test_psp.loc[phosphosite_row_index,'closest_interface'] = 'NaN' 
            test_psp.loc[phosphosite_row_index,'distance_from_interface'] = 'NaN'
            

        

                

name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-Q96BJ3-F1-model_v4.pdb']
A14,R17,R18,D21,F22,S24,W25,G26,F260,E262,D264,E265,K267,P268,V272,I273,E274,Y276,L287,L289,K292,K293,Y296
added smallest distance: 16.57846538332096
the interface is: P83881_Q96BJ3
Q27,E30,Y276,R284,K285,K286,L287,Q288,L289,K292
added smallest distance: 16.41964098906829
the interface is: Q96BJ3_Q9NSP4
Q10,R11,A14,R17,R18,D21,F22,S24,Q27,E34,K247,P248,K249,K250,R251,F252,K278
Q183,I185
T155,L157,P158,L160,P161,T190,G198,I199,D200,L201,T202,P203,V204,Q205,D206,Q227,K244
R17,R18,D21,F22,S24,V29,E30,L88,E89,K92,P279
R18,D21,F22,S24,L28,E30,G182,Q183,C184,I185,K213
W25,G26,Q27
F260,E262,D264,E265,E274,Y276,L287,L289
added smallest distance: 12.897276620488157
the interface is: Q8N5S1_Q96BJ3
V29,D33,I37,R40,T155,T202,P203,V204,D206,T253
F260,M261,E265,I273,E274,Y276,L287,L289,T291,K292,K293,Y296
added smallest distance: 9.49601024695765
the interface is: Q96BJ3_Q96BK5
E274,L289
added smallest d

In [33]:
test_psp

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,uniprotID,AA,res_number,pK,state,position,uniprot_id,closest_interface,inside_interface,distance_from_interface
8923,AIDA,AIDA,Q96BJ3,1q41,Y296-p,34365066,human,35.02,Aida_C2,LLTKKPLyLHLHQTL,...,Q96BJ3,TYR,296.0,10.268782,protonated,296.0,Q96BJ3,Q96BJ3_Q96D03,0,7.99081
137296,KMT2E,MLL5,Q8IZD2,7q22.3,S795-p,28791277,human,204.96,,KHYIrFTsPFLsEKR,...,Q8IZD2,SER,795.0,,undefined,795.0,Q8IZD2,,0,
38055,SPICE1,CCDC52,Q8N0Z3,3q13.2,S640-p,10941554,human,96.26,,SNTQQSRsPTFsEEL,...,Q8N0Z3,SER,640.0,,undefined,640.0,Q8N0Z3,Q8N0Z3_Q9UL42,0,51.883408


In [34]:
uniprot = "P05114"
interface_only_uniprot = interfaces_data.loc[(interfaces_data['uniprot_id1'] == uniprot) | (interfaces_data['uniprot_id2'] == uniprot)] # isolate to uniprot in either 1 or 2
interface_only_uniprot

Unnamed: 0,interaction_id,pdockq,uniprot_id1,uniprot_id2,chain1,chain2,ifresid1,ifresid2,sources,n_references,pdb
6128,P05114_P18858,0.66,P05114,P18858,A,B,,,"otar,string",0,P05114/P05114_P18858.pdb
23645,P05114_P17612,0.51,P05114,P17612,A,B,"R19,R20,A22,R23,L24,S25","E128,F130,S131,R134,K169,P170,E171,T202,E204,Y...",otar,0,P05114/P05114_P17612.pdb
39160,P05114_Q9UNP9,0.43,P05114,Q9UNP9,A,B,S25,"P28,R191,F196,W257,L258",otar,0,P05114/P05114_Q9UNP9.pdb
61813,P05114_P63104,0.34,P05114,P63104,A,B,"A63,E64","K49,R56,R127,Y128,L172,N173,V176,L220,N224",BioGRID,2,P05114/P05114_P63104.pdb
76639,P05114_P12004,0.3,P05114,P12004,A,B,"R19,R20,S21,A22,R23,L24","M40,S43,H44,V45,S46,L47,V159,T206,Y211,P234,Y2...","BioGRID,otar",1,P05114/P05114_P12004.pdb
103746,P05114_P49005,0.24,P05114,P49005,A,B,"R23,L24","Y54,A55,L58,T96,F98,E136,D137,E138,T155,D380",otar,0,P05114/P05114_P49005.pdb
111857,P05114_P50613,0.22,P05114,P50613,A,B,T81,"D137,L158,Q172,V173,V174,T175,R179",otar,0,P05114/P05114_P50613.pdb
125431,P05114_Q13216,0.19,P05114,Q13216,A,B,"S21,A22,R23,L24","R57,Y58,L70,V87,C88,W107,T112,G113,T129,N130,T...","BioGRID,otar",1,P05114/P05114_Q13216.pdb
155431,P05114_Q6ZYL4,0.15,P05114,Q6ZYL4,A,B,"L24,S25,A26","V8,L9,I10,E11,C12,V50,Q54,V57,G58,M61",otar,0,P05114/P05114_Q6ZYL4.pdb
162136,P05114_P07737,0.14,P05114,P07737,A,B,,,BioGRID,1,P05114/P05114_P07737.pdb


In [35]:
interface_res_example = interface_only_uniprot.loc[0,'ifresid1']
interface_res_example

KeyError: 0

In [None]:
interface_res_example = interface_res_example.split(",")
list_of_interface_res = [e[1:] for e in interface_res_example]

In [None]:
type(list_of_interface_res[1])

str

In [None]:
list_of_interface_res.append("583")

In [None]:
list_of_interface_res

['169',
 '203',
 '204',
 '205',
 '206',
 '212',
 '213',
 '220',
 '226',
 '227',
 '228',
 '229',
 '230',
 '231',
 '232',
 '233',
 '242',
 '257',
 '264',
 '291',
 '292',
 '293',
 '295',
 '296',
 '297',
 '298',
 '299',
 '300',
 '301',
 '302',
 '303',
 '304',
 '305',
 '306',
 '307',
 '308',
 '309',
 '310',
 '311',
 '312',
 '313',
 '314',
 '316',
 '365',
 '366',
 '367',
 '368',
 '369',
 '370',
 '371',
 '374',
 '375',
 '377',
 '390',
 '391',
 '392',
 '393',
 '394',
 '395',
 '396',
 '397',
 '398',
 '399',
 '400',
 '426',
 '427',
 '428',
 '429',
 '430',
 '431',
 '432',
 '433',
 '434',
 '435',
 '436',
 '437',
 '438',
 '439',
 '440',
 '441',
 '442',
 '443',
 '444',
 '445',
 '446',
 '447',
 '448',
 '449',
 '450',
 '452',
 '457',
 '459',
 '461',
 '463',
 '464',
 '465',
 '466',
 '468',
 '470',
 '474',
 '476',
 '478',
 '484',
 '486',
 '487',
 '488',
 '501',
 '502',
 '503',
 '504',
 '505',
 '506',
 '507',
 '508',
 '509',
 '515',
 '516',
 '517',
 '538',
 '539',
 '540',
 '541',
 '542',
 '547',
 '548',


In [None]:
# shows that it can find if it's inside of a pocket or interface
psp_only_uniprot = full_noduplicates_data[full_noduplicates_data.uniprot_id == uniprot]
psp_only_uniprot
res_num = psp_only_uniprot.loc[15257,'RES_NUM']
type(res_num)
#res_num in list_of_interface_res

str